- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
175 lines
7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyze Japan Prefecture Coverage
|
|
|
|
Maps prefecture codes to ISO 3166-2 codes and analyzes geographic distribution.
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
import json
|
|
|
|
# ISO 3166-2:JP Prefecture Code Mapping
# Source: https://en.wikipedia.org/wiki/ISO_3166-2:JP
#
# Rows: (dataset code, ISO 3166-2:JP numeric suffix, prefecture name, region).
# The two-letter codes are the dataset's own GHCID prefixes, not ISO codes.
_PREFECTURE_ROWS = [
    # Hokkaido
    ('HO', 1, 'Hokkaido', 'Hokkaido'),
    # Tohoku
    ('AO', 2, 'Aomori', 'Tohoku'),
    ('IW', 3, 'Iwate', 'Tohoku'),
    ('MI', 4, 'Miyagi', 'Tohoku'),
    ('AK', 5, 'Akita', 'Tohoku'),
    ('YA', 6, 'Yamagata', 'Tohoku'),
    ('FU', 7, 'Fukushima', 'Tohoku'),
    # Kanto
    ('IB', 8, 'Ibaraki', 'Kanto'),
    ('TC', 9, 'Tochigi', 'Kanto'),
    ('GU', 10, 'Gunma', 'Kanto'),
    ('SA', 11, 'Saitama', 'Kanto'),
    ('CH', 12, 'Chiba', 'Kanto'),
    ('TO', 13, 'Tokyo', 'Kanto'),
    ('KN', 14, 'Kanagawa', 'Kanto'),
    # Chubu
    ('NI', 15, 'Niigata', 'Chubu'),
    ('TY', 16, 'Toyama', 'Chubu'),
    ('IS', 17, 'Ishikawa', 'Chubu'),
    ('FK', 18, 'Fukui', 'Chubu'),
    ('YM', 19, 'Yamanashi', 'Chubu'),
    ('NA', 20, 'Nagano', 'Chubu'),
    ('GI', 21, 'Gifu', 'Chubu'),
    ('SZ', 22, 'Shizuoka', 'Chubu'),
    ('AI', 23, 'Aichi', 'Chubu'),
    # Kansai
    ('ME', 24, 'Mie', 'Kansai'),
    ('SH', 25, 'Shiga', 'Kansai'),
    ('KY', 26, 'Kyoto', 'Kansai'),
    ('OS', 27, 'Osaka', 'Kansai'),
    ('HY', 28, 'Hyogo', 'Kansai'),
    ('NR', 29, 'Nara', 'Kansai'),
    ('WA', 30, 'Wakayama', 'Kansai'),
    # Chugoku
    ('TT', 31, 'Tottori', 'Chugoku'),
    ('SM', 32, 'Shimane', 'Chugoku'),
    ('OK', 33, 'Okayama', 'Chugoku'),
    ('HI', 34, 'Hiroshima', 'Chugoku'),
    ('YG', 35, 'Yamaguchi', 'Chugoku'),
    # Shikoku
    ('TK', 36, 'Tokushima', 'Shikoku'),
    ('KA', 37, 'Kagawa', 'Shikoku'),
    ('EH', 38, 'Ehime', 'Shikoku'),
    ('KO', 39, 'Kochi', 'Shikoku'),
    # Kyushu (Okinawa is grouped under Kyushu by this dataset's convention)
    ('FO', 40, 'Fukuoka', 'Kyushu'),
    ('SG', 41, 'Saga', 'Kyushu'),
    ('NS', 42, 'Nagasaki', 'Kyushu'),
    ('KM', 43, 'Kumamoto', 'Kyushu'),
    ('OI', 44, 'Oita', 'Kyushu'),
    ('MZ', 45, 'Miyazaki', 'Kyushu'),
    ('KS', 46, 'Kagoshima', 'Kyushu'),
    ('ON', 47, 'Okinawa', 'Kyushu'),
]

# Keyed by the two-letter dataset code found in GHCIDs; each value carries the
# ISO 3166-2:JP code ('JP-01'..'JP-47'), English name, and region grouping.
PREFECTURE_CODES = {
    code: {'iso': f'JP-{num:02d}', 'name': name, 'region': region}
    for code, num, name, region in _PREFECTURE_ROWS
}
|
|
|
|
|
|
def analyze_prefecture_coverage():
    """Analyze prefecture coverage in the Japan institutions dataset.

    Loads ``jp_institutions.yaml``, tallies institutions per prefecture
    (derived from the second dash-separated segment of each record's
    GHCID) and per region, reports which of the 47 prefectures have no
    institutions, and exports the aggregated counts to
    ``prefecture_analysis.json`` next to the dataset.

    Returns:
        None. Output is printed to stdout and written to the JSON file.
    """
    # Load dataset
    data_path = Path(__file__).parent.parent / 'data' / 'instances' / 'japan' / 'jp_institutions.yaml'
    with open(data_path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)

    # Guard: an empty/missing dataset would cause ZeroDivisionError in the
    # percentage calculations below.
    if not records:
        print("No records found in dataset; nothing to analyze.")
        return

    total = len(records)  # hoisted: used as the divisor in both report loops
    print(f"Analyzing {total} institutions...")
    print()

    # Extract prefecture codes from GHCIDs (expected shape: XX-<pref>-...)
    prefecture_counts = Counter()
    region_counts = Counter()

    for record in records:
        ghcid = record.get('ghcid', '')
        if not ghcid:
            continue
        parts = ghcid.split('-')
        if len(parts) < 2:
            continue  # malformed GHCID; no prefecture segment to count
        pref_code = parts[1]
        prefecture_counts[pref_code] += 1

        # Map to region; codes absent from PREFECTURE_CODES are still counted
        # per-prefecture (they surface as 'Unknown' in the top-20 report).
        if pref_code in PREFECTURE_CODES:
            region = PREFECTURE_CODES[pref_code]['region']
            region_counts[region] += 1

    # Find missing prefectures
    represented_prefs = set(prefecture_counts)
    all_prefs = set(PREFECTURE_CODES)
    missing_prefs = all_prefs - represented_prefs

    # Print results
    print("=" * 80)
    print("PREFECTURE COVERAGE ANALYSIS")
    print("=" * 80)
    print()
    print(f"Total Prefectures in Dataset: {len(represented_prefs)}/47")
    print(f"Missing Prefectures: {len(missing_prefs)}")
    print()

    # By region, highest count first
    print("INSTITUTIONS BY REGION")
    print("-" * 80)
    for region, count in region_counts.most_common():
        percentage = count / total * 100
        print(f"  {region:15s} {count:6,} ({percentage:5.2f}%)")
    print()

    # Top 20 prefectures
    print("TOP 20 PREFECTURES")
    print("-" * 80)
    for pref_code, count in prefecture_counts.most_common(20):
        pref_info = PREFECTURE_CODES.get(pref_code, {'name': 'Unknown', 'iso': 'N/A'})
        percentage = count / total * 100
        print(f"  {pref_code} ({pref_info['iso']}) {pref_info['name']:15s} {count:6,} ({percentage:5.2f}%)")
    print()

    # Missing prefectures
    if missing_prefs:
        print("MISSING PREFECTURES")
        print("-" * 80)
        for pref_code in sorted(missing_prefs):
            pref_info = PREFECTURE_CODES[pref_code]
            print(f"  {pref_code} ({pref_info['iso']}) {pref_info['name']:15s} - {pref_info['region']}")
        print()
        print("Note: These prefectures have no institutions in the ISIL registry dataset.")
        print("This may indicate:")
        print("  - True absence of ISIL-registered institutions")
        print("  - Institutions not yet registered with National Diet Library")
        print("  - Data quality or coverage issues")
        print()

    # Export mapping
    export_path = Path(__file__).parent.parent / 'data' / 'instances' / 'japan' / 'prefecture_analysis.json'
    export_data = {
        'total_institutions': total,
        'represented_prefectures': len(represented_prefs),
        'missing_prefectures': len(missing_prefs),
        'prefecture_counts': dict(prefecture_counts),
        'region_counts': dict(region_counts),
        'prefecture_mapping': PREFECTURE_CODES,
        'missing_prefecture_codes': list(missing_prefs),
    }

    with open(export_path, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Prefecture analysis exported to: {export_path}")
|
|
|
|
|
|
# Script entry point: run the coverage analysis when executed directly.
if __name__ == '__main__':
    analyze_prefecture_coverage()
|