glam/scripts/analyze_japan_prefectures.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

175 lines
7 KiB
Python

#!/usr/bin/env python3
"""
Analyze Japan Prefecture Coverage
Maps prefecture codes to ISO 3166-2 codes and analyzes geographic distribution.
"""
import yaml
from pathlib import Path
from collections import Counter
import json
# ISO 3166-2:JP Prefecture Code Mapping
# Source: https://en.wikipedia.org/wiki/ISO_3166-2:JP
#
# The table below is grouped by region; each member row is
# (two-letter prefecture code, ISO 3166-2 code, prefecture name).
_PREFECTURES_BY_REGION = (
    ('Hokkaido', (
        ('HO', 'JP-01', 'Hokkaido'),
    )),
    ('Tohoku', (
        ('AO', 'JP-02', 'Aomori'),
        ('IW', 'JP-03', 'Iwate'),
        ('MI', 'JP-04', 'Miyagi'),
        ('AK', 'JP-05', 'Akita'),
        ('YA', 'JP-06', 'Yamagata'),
        ('FU', 'JP-07', 'Fukushima'),
    )),
    ('Kanto', (
        ('IB', 'JP-08', 'Ibaraki'),
        ('TC', 'JP-09', 'Tochigi'),
        ('GU', 'JP-10', 'Gunma'),
        ('SA', 'JP-11', 'Saitama'),
        ('CH', 'JP-12', 'Chiba'),
        ('TO', 'JP-13', 'Tokyo'),
        ('KN', 'JP-14', 'Kanagawa'),
    )),
    ('Chubu', (
        ('NI', 'JP-15', 'Niigata'),
        ('TY', 'JP-16', 'Toyama'),
        ('IS', 'JP-17', 'Ishikawa'),
        ('FK', 'JP-18', 'Fukui'),
        ('YM', 'JP-19', 'Yamanashi'),
        ('NA', 'JP-20', 'Nagano'),
        ('GI', 'JP-21', 'Gifu'),
        ('SZ', 'JP-22', 'Shizuoka'),
        ('AI', 'JP-23', 'Aichi'),
    )),
    ('Kansai', (
        ('ME', 'JP-24', 'Mie'),
        ('SH', 'JP-25', 'Shiga'),
        ('KY', 'JP-26', 'Kyoto'),
        ('OS', 'JP-27', 'Osaka'),
        ('HY', 'JP-28', 'Hyogo'),
        ('NR', 'JP-29', 'Nara'),
        ('WA', 'JP-30', 'Wakayama'),
    )),
    ('Chugoku', (
        ('TT', 'JP-31', 'Tottori'),
        ('SM', 'JP-32', 'Shimane'),
        ('OK', 'JP-33', 'Okayama'),
        ('HI', 'JP-34', 'Hiroshima'),
        ('YG', 'JP-35', 'Yamaguchi'),
    )),
    ('Shikoku', (
        ('TK', 'JP-36', 'Tokushima'),
        ('KA', 'JP-37', 'Kagawa'),
        ('EH', 'JP-38', 'Ehime'),
        ('KO', 'JP-39', 'Kochi'),
    )),
    ('Kyushu', (
        ('FO', 'JP-40', 'Fukuoka'),
        ('SG', 'JP-41', 'Saga'),
        ('NS', 'JP-42', 'Nagasaki'),
        ('KM', 'JP-43', 'Kumamoto'),
        ('OI', 'JP-44', 'Oita'),
        ('MZ', 'JP-45', 'Miyazaki'),
        ('KS', 'JP-46', 'Kagoshima'),
        ('ON', 'JP-47', 'Okinawa'),
    )),
)

# Prefecture code -> {'iso', 'name', 'region'}, in ISO 3166-2 order.
PREFECTURE_CODES = {
    code: {'iso': iso, 'name': name, 'region': region}
    for region, members in _PREFECTURES_BY_REGION
    for code, iso, name in members
}
def analyze_prefecture_coverage():
    """Analyze prefecture coverage in the Japan institutions dataset.

    Loads ``data/instances/japan/jp_institutions.yaml`` (resolved relative to
    the parent of this script's directory), tallies institutions per
    prefecture code and per region, prints a report to stdout, and writes
    the results to ``prefecture_analysis.json`` next to the dataset.

    The prefecture code is taken as the second hyphen-separated component of
    each record's ``ghcid``. Records with no ``ghcid``, or with a ``ghcid``
    that has fewer than two components, are left out of the tallies but still
    count toward the institution total used for percentages.

    Only codes present in ``PREFECTURE_CODES`` count as "represented";
    any other codes found in the data are reported separately.

    Raises:
        OSError: If the dataset cannot be read or the export cannot be
            written.
        yaml.YAMLError: If the dataset is not valid YAML.
    """
    japan_dir = Path(__file__).parent.parent / 'data' / 'instances' / 'japan'

    # Load dataset
    data_path = japan_dir / 'jp_institutions.yaml'
    with open(data_path, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no records".
        records = yaml.safe_load(f) or []
    total = len(records)
    print(f"Analyzing {total} institutions...")
    print()

    # Extract prefecture codes from GHCIDs
    prefecture_counts = Counter()
    region_counts = Counter()
    for record in records:
        ghcid = record.get('ghcid', '')
        if not ghcid:
            continue
        parts = ghcid.split('-')
        if len(parts) < 2:
            continue
        pref_code = parts[1]
        prefecture_counts[pref_code] += 1
        # Map to region (only possible for codes we recognize)
        if pref_code in PREFECTURE_CODES:
            region_counts[PREFECTURE_CODES[pref_code]['region']] += 1

    # Split the codes seen in the data into recognized and unrecognized, so
    # that the "represented" figure can never exceed the number of real
    # prefectures.
    all_prefs = set(PREFECTURE_CODES)
    seen_codes = set(prefecture_counts)
    represented_prefs = seen_codes & all_prefs
    unknown_codes = seen_codes - all_prefs
    missing_prefs = all_prefs - seen_codes

    # Print results
    print("=" * 80)
    print("PREFECTURE COVERAGE ANALYSIS")
    print("=" * 80)
    print()
    print(f"Total Prefectures in Dataset: {len(represented_prefs)}/{len(all_prefs)}")
    print(f"Missing Prefectures: {len(missing_prefs)}")
    print()

    # By region
    print("INSTITUTIONS BY REGION")
    print("-" * 80)
    for region, count in sorted(region_counts.items(), key=lambda x: -x[1]):
        # total > 0 here: a non-empty tally implies at least one record.
        percentage = (count / total * 100)
        print(f" {region:15s} {count:6,} ({percentage:5.2f}%)")
    print()

    # Top 20 prefectures
    print("TOP 20 PREFECTURES")
    print("-" * 80)
    for pref_code, count in prefecture_counts.most_common(20):
        pref_info = PREFECTURE_CODES.get(pref_code, {'name': 'Unknown', 'iso': 'N/A'})
        percentage = (count / total * 100)
        print(f" {pref_code} ({pref_info['iso']}) {pref_info['name']:15s} {count:6,} ({percentage:5.2f}%)")
    print()

    # Codes found in the data that are not in the mapping
    if unknown_codes:
        print("UNRECOGNIZED PREFECTURE CODES")
        print("-" * 80)
        for pref_code in sorted(unknown_codes):
            print(f" {pref_code} {prefecture_counts[pref_code]:6,}")
        print()

    # Missing prefectures
    if missing_prefs:
        print("MISSING PREFECTURES")
        print("-" * 80)
        for pref_code in sorted(missing_prefs):
            pref_info = PREFECTURE_CODES[pref_code]
            print(f" {pref_code} ({pref_info['iso']}) {pref_info['name']:15s} - {pref_info['region']}")
        print()
        print("Note: These prefectures have no institutions in the ISIL registry dataset.")
        print("This may indicate:")
        print(" - True absence of ISIL-registered institutions")
        print(" - Institutions not yet registered with National Diet Library")
        print(" - Data quality or coverage issues")
        print()

    # Export mapping
    export_path = japan_dir / 'prefecture_analysis.json'
    export_data = {
        'total_institutions': total,
        'represented_prefectures': len(represented_prefs),
        'missing_prefectures': len(missing_prefs),
        'prefecture_counts': dict(prefecture_counts),
        'region_counts': dict(region_counts),
        'prefecture_mapping': PREFECTURE_CODES,
        # Sorted so the exported file is reproducible between runs.
        'missing_prefecture_codes': sorted(missing_prefs),
    }
    with open(export_path, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Prefecture analysis exported to: {export_path}")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    analyze_prefecture_coverage()