#!/usr/bin/env python3
"""
Analyze Japan Prefecture Coverage

Maps prefecture codes to ISO 3166-2 codes and analyzes geographic distribution.
"""

import yaml
from pathlib import Path
from collections import Counter
import json

# ISO 3166-2:JP Prefecture Code Mapping
# Source: https://en.wikipedia.org/wiki/ISO_3166-2:JP
#
# Keys are the two-letter prefecture codes that appear as the second
# hyphen-separated field of a GHCID; each value carries the ISO 3166-2 code,
# the romanized prefecture name and the region used for aggregation.
PREFECTURE_CODES = {
    # Hokkaido
    'HO': {'iso': 'JP-01', 'name': 'Hokkaido', 'region': 'Hokkaido'},
    # Tohoku
    'AO': {'iso': 'JP-02', 'name': 'Aomori', 'region': 'Tohoku'},
    'IW': {'iso': 'JP-03', 'name': 'Iwate', 'region': 'Tohoku'},
    'MI': {'iso': 'JP-04', 'name': 'Miyagi', 'region': 'Tohoku'},
    'AK': {'iso': 'JP-05', 'name': 'Akita', 'region': 'Tohoku'},
    'YA': {'iso': 'JP-06', 'name': 'Yamagata', 'region': 'Tohoku'},
    'FU': {'iso': 'JP-07', 'name': 'Fukushima', 'region': 'Tohoku'},
    # Kanto
    'IB': {'iso': 'JP-08', 'name': 'Ibaraki', 'region': 'Kanto'},
    'TC': {'iso': 'JP-09', 'name': 'Tochigi', 'region': 'Kanto'},
    'GU': {'iso': 'JP-10', 'name': 'Gunma', 'region': 'Kanto'},
    'SA': {'iso': 'JP-11', 'name': 'Saitama', 'region': 'Kanto'},
    'CH': {'iso': 'JP-12', 'name': 'Chiba', 'region': 'Kanto'},
    'TO': {'iso': 'JP-13', 'name': 'Tokyo', 'region': 'Kanto'},
    'KN': {'iso': 'JP-14', 'name': 'Kanagawa', 'region': 'Kanto'},
    # Chubu
    'NI': {'iso': 'JP-15', 'name': 'Niigata', 'region': 'Chubu'},
    'TY': {'iso': 'JP-16', 'name': 'Toyama', 'region': 'Chubu'},
    'IS': {'iso': 'JP-17', 'name': 'Ishikawa', 'region': 'Chubu'},
    'FK': {'iso': 'JP-18', 'name': 'Fukui', 'region': 'Chubu'},
    'YM': {'iso': 'JP-19', 'name': 'Yamanashi', 'region': 'Chubu'},
    'NA': {'iso': 'JP-20', 'name': 'Nagano', 'region': 'Chubu'},
    'GI': {'iso': 'JP-21', 'name': 'Gifu', 'region': 'Chubu'},
    'SZ': {'iso': 'JP-22', 'name': 'Shizuoka', 'region': 'Chubu'},
    'AI': {'iso': 'JP-23', 'name': 'Aichi', 'region': 'Chubu'},
    # Kansai
    'ME': {'iso': 'JP-24', 'name': 'Mie', 'region': 'Kansai'},
    'SH': {'iso': 'JP-25', 'name': 'Shiga', 'region': 'Kansai'},
    'KY': {'iso': 'JP-26', 'name': 'Kyoto', 'region': 'Kansai'},
    'OS': {'iso': 'JP-27', 'name': 'Osaka', 'region': 'Kansai'},
    'HY': {'iso': 'JP-28', 'name': 'Hyogo', 'region': 'Kansai'},
    'NR': {'iso': 'JP-29', 'name': 'Nara', 'region': 'Kansai'},
    'WA': {'iso': 'JP-30', 'name': 'Wakayama', 'region': 'Kansai'},
    # Chugoku
    'TT': {'iso': 'JP-31', 'name': 'Tottori', 'region': 'Chugoku'},
    'SM': {'iso': 'JP-32', 'name': 'Shimane', 'region': 'Chugoku'},
    'OK': {'iso': 'JP-33', 'name': 'Okayama', 'region': 'Chugoku'},
    'HI': {'iso': 'JP-34', 'name': 'Hiroshima', 'region': 'Chugoku'},
    'YG': {'iso': 'JP-35', 'name': 'Yamaguchi', 'region': 'Chugoku'},
    # Shikoku
    'TK': {'iso': 'JP-36', 'name': 'Tokushima', 'region': 'Shikoku'},
    'KA': {'iso': 'JP-37', 'name': 'Kagawa', 'region': 'Shikoku'},
    'EH': {'iso': 'JP-38', 'name': 'Ehime', 'region': 'Shikoku'},
    'KO': {'iso': 'JP-39', 'name': 'Kochi', 'region': 'Shikoku'},
    # Kyushu
    'FO': {'iso': 'JP-40', 'name': 'Fukuoka', 'region': 'Kyushu'},
    'SG': {'iso': 'JP-41', 'name': 'Saga', 'region': 'Kyushu'},
    'NS': {'iso': 'JP-42', 'name': 'Nagasaki', 'region': 'Kyushu'},
    'KM': {'iso': 'JP-43', 'name': 'Kumamoto', 'region': 'Kyushu'},
    'OI': {'iso': 'JP-44', 'name': 'Oita', 'region': 'Kyushu'},
    'MZ': {'iso': 'JP-45', 'name': 'Miyazaki', 'region': 'Kyushu'},
    'KS': {'iso': 'JP-46', 'name': 'Kagoshima', 'region': 'Kyushu'},
    'ON': {'iso': 'JP-47', 'name': 'Okinawa', 'region': 'Kyushu'},
}


def _count_prefectures(records):
    """Tally institutions per prefecture code and per region.

    The prefecture code is read from the second hyphen-separated field of
    each record's ``ghcid`` value. Records that are not mappings, or whose
    ``ghcid`` is missing, empty, not a string, or has fewer than two fields,
    are skipped.

    Args:
        records: Iterable of institution records (mappings).

    Returns:
        A ``(prefecture_counts, region_counts)`` pair of ``Counter`` objects.
        ``prefecture_counts`` includes codes that are absent from
        ``PREFECTURE_CODES``; ``region_counts`` only counts known codes.
    """
    prefecture_counts = Counter()
    region_counts = Counter()

    for record in records:
        if not isinstance(record, dict):
            continue

        ghcid = record.get('ghcid', '')
        if not isinstance(ghcid, str) or not ghcid:
            continue

        parts = ghcid.split('-')
        if len(parts) < 2:
            continue

        pref_code = parts[1]
        prefecture_counts[pref_code] += 1

        # Only codes present in the mapping can be attributed to a region.
        pref_info = PREFECTURE_CODES.get(pref_code)
        if pref_info is not None:
            region_counts[pref_info['region']] += 1

    return prefecture_counts, region_counts


def analyze_prefecture_coverage(data_path=None, export_path=None):
    """Analyze prefecture coverage in Japan dataset.

    Loads the institution records, counts them per prefecture and region,
    prints a report to stdout and writes a JSON summary.

    Args:
        data_path: YAML file holding the list of institution records.
            Defaults to ``data/instances/japan/jp_institutions.yaml`` under
            the grandparent directory of this script.
        export_path: Destination of the JSON summary. Defaults to
            ``prefecture_analysis.json`` in the same default directory.

    Raises:
        OSError: If the input cannot be read or the export cannot be written.
        yaml.YAMLError: If the input is not valid YAML.
    """
    default_dir = Path(__file__).parent.parent / 'data' / 'instances' / 'japan'
    if data_path is None:
        data_path = default_dir / 'jp_institutions.yaml'
    if export_path is None:
        export_path = default_dir / 'prefecture_analysis.json'
    data_path = Path(data_path)
    export_path = Path(export_path)

    # Load dataset. safe_load returns None for an empty document, which is
    # treated as an empty dataset rather than crashing on len(None).
    with open(data_path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f) or []

    total = len(records)
    print(f"Analyzing {total} institutions...")
    print()

    # Extract prefecture codes from GHCIDs
    prefecture_counts, region_counts = _count_prefectures(records)

    # Split the codes seen in the data into recognised and unrecognised ones,
    # so that stray codes do not inflate the coverage figure.
    all_prefs = set(PREFECTURE_CODES)
    seen_codes = set(prefecture_counts)
    represented_prefs = seen_codes & all_prefs
    unknown_codes = seen_codes - all_prefs
    missing_prefs = all_prefs - represented_prefs

    # Print results
    print("=" * 80)
    print("PREFECTURE COVERAGE ANALYSIS")
    print("=" * 80)
    print()
    print(f"Total Prefectures in Dataset: {len(represented_prefs)}/{len(all_prefs)}")
    print(f"Missing Prefectures: {len(missing_prefs)}")
    if unknown_codes:
        print(f"Unrecognized Prefecture Codes: {len(unknown_codes)}")
    print()

    # By region (a non-zero count implies total > 0, so the division is safe)
    print("INSTITUTIONS BY REGION")
    print("-" * 80)
    for region, count in sorted(region_counts.items(), key=lambda x: -x[1]):
        percentage = (count / total * 100)
        print(f"  {region:15s} {count:6,} ({percentage:5.2f}%)")
    print()

    # Top 20 prefectures
    print("TOP 20 PREFECTURES")
    print("-" * 80)
    for pref_code, count in prefecture_counts.most_common(20):
        pref_info = PREFECTURE_CODES.get(pref_code, {'name': 'Unknown', 'iso': 'N/A'})
        percentage = (count / total * 100)
        print(f"  {pref_code} ({pref_info['iso']}) {pref_info['name']:15s} {count:6,} ({percentage:5.2f}%)")
    print()

    # Missing prefectures
    if missing_prefs:
        print("MISSING PREFECTURES")
        print("-" * 80)
        for pref_code in sorted(missing_prefs):
            pref_info = PREFECTURE_CODES[pref_code]
            print(f"  {pref_code} ({pref_info['iso']}) {pref_info['name']:15s} - {pref_info['region']}")
        print()
        print("Note: These prefectures have no institutions in the ISIL registry dataset.")
        print("This may indicate:")
        print("  - True absence of ISIL-registered institutions")
        print("  - Institutions not yet registered with National Diet Library")
        print("  - Data quality or coverage issues")
        print()

    # Export mapping. Code lists are sorted so repeated runs over the same
    # data produce an identical file.
    export_data = {
        'total_institutions': total,
        'represented_prefectures': len(represented_prefs),
        'missing_prefectures': len(missing_prefs),
        'prefecture_counts': dict(prefecture_counts),
        'region_counts': dict(region_counts),
        'prefecture_mapping': PREFECTURE_CODES,
        'missing_prefecture_codes': sorted(missing_prefs),
        'unknown_prefecture_codes': sorted(unknown_codes),
    }

    with open(export_path, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Prefecture analysis exported to: {export_path}")


if __name__ == '__main__':
    analyze_prefecture_coverage()