#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 (located in administrative territorial entity).

Uses Wikidata SPARQL to query the administrative hierarchy for each entity
and maps to ISO 3166-2 region codes where possible.

Following AGENTS.md Rule 5: Additive only - never delete existing data.
"""
import os
import sys
import yaml
import time
from datetime import datetime, timezone
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from typing import Optional, Dict, Any, List, Tuple

# ISO 3166-2 region code mappings for countries we frequently encounter
# Format: country_code: {wikidata_region_id: iso_region_code}
# These are manually curated for accuracy
REGION_MAPPINGS: Dict[str, Dict[str, str]] = {
    # France - Regions
    'FR': {
        'Q90': 'IDF',        # Paris (Île-de-France)
        'Q13917': 'IDF',     # Île-de-France
        'Q15104': 'PAC',     # Provence-Alpes-Côte d'Azur
        'Q18677983': 'ARA',  # Auvergne-Rhône-Alpes
        'Q18677767': 'NAQ',  # Nouvelle-Aquitaine
        'Q18677875': 'OCC',  # Occitanie
        'Q18677757': 'GES',  # Grand Est
        'Q18677902': 'HDF',  # Hauts-de-France
        'Q12130': 'NOR',     # Normandie
        'Q16994': 'BRE',     # Brittany
        'Q16993': 'PDL',     # Pays de la Loire
        'Q13947': 'CVL',     # Centre-Val de Loire
        'Q18678082': 'BFC',  # Bourgogne-Franche-Comté
        'Q15276': 'COR',     # Corsica
    },
    # South Korea - Provinces/Special Cities
    'KR': {
        'Q8684': 'SEO',      # Seoul
        'Q16520': 'BUS',     # Busan
        'Q41684': 'DAE',     # Daegu
        'Q41588': 'INC',     # Incheon
        'Q42622': 'GWJ',     # Gwangju
        'Q42833': 'DAJ',     # Daejeon
        'Q484637': 'ULS',    # Ulsan
        'Q20925': 'SEJ',     # Sejong
        'Q41076': 'GGI',     # Gyeonggi
        'Q41115': 'GAW',     # Gangwon
        'Q41392': 'CHB',     # North Chungcheong
        'Q41442': 'CHN',     # South Chungcheong
        'Q41213': 'JEO',     # North Jeolla
        'Q41283': 'JEN',     # South Jeolla
        'Q41171': 'GYB',     # North Gyeongsang
        'Q41312': 'GYN',     # South Gyeongsang
        'Q41872': 'JEJ',     # Jeju
    },
    # China - Provinces
    'CN': {
        'Q956': 'BEI',       # Beijing
        'Q8686': 'SHA',      # Shanghai
        'Q15174': 'GUD',     # Guangdong
        'Q16572': 'ZHE',     # Zhejiang
        'Q16963': 'JIS',     # Jiangsu
        'Q43684': 'SHX',     # Shaanxi
        'Q45761': 'SCH',     # Sichuan
        'Q41079': 'HUN',     # Hunan
        'Q46862': 'HUB',     # Hubei
        'Q46913': 'HEN',     # Henan
        'Q16952': 'SDG',     # Shandong
        'Q21208': 'FUJ',     # Fujian
        'Q43194': 'YUN',     # Yunnan
        'Q40285': 'GUX',     # Guangxi
        'Q46491': 'ANH',     # Anhui
        'Q47097': 'JIL',     # Jilin
        'Q19188': 'LIA',     # Liaoning
        'Q19206': 'HEI',     # Heilongjiang
        'Q57251': 'XIN',     # Xinjiang
        'Q17188': 'TIB',     # Tibet
        'Q41705': 'HAI',     # Hainan
        'Q15184': 'SHG',     # Shanxi
        'Q47165': 'GAN',     # Gansu
        'Q45646': 'GUI',     # Guizhou
        'Q46865': 'JIX',     # Jiangxi
        'Q46684': 'NMG',     # Inner Mongolia
        'Q57448': 'QIN',     # Qinghai
        'Q57958': 'NXA',     # Ningxia
        'Q8646': 'HKG',      # Hong Kong
        'Q14773': 'MAC',     # Macau
        'Q15175': 'TIJ',     # Tianjin
        'Q11725': 'CHQ',     # Chongqing
        'Q46863': 'HEB',     # Hebei
    },
    # Switzerland - Cantons
    # NOTE(review): the original dict literal contained duplicate QID keys; in
    # Python the last occurrence silently wins, so several canton entries were
    # dead code.  The shadowed entries are preserved as comments below — their
    # correct QIDs still need to be looked up on Wikidata.  Removing them here
    # does NOT change the resulting dict (it is identical to the original's).
    'CH': {
        'Q11911': 'ZH',      # Zürich
        'Q12079': 'BE',      # Bern
        # 'Q12094': 'LU',    # Luzern — shadowed by AI below; TODO find real QID
        'Q12433': 'UR',      # Uri
        'Q12592': 'SZ',      # Schwyz
        'Q12721': 'OW',      # Obwalden
        # 'Q12755': 'NW',    # Nidwalden — shadowed by JU below; TODO find real QID
        'Q11922': 'GL',      # Glarus
        'Q11933': 'ZG',      # Zug
        'Q834': 'FR',        # Fribourg
        'Q12746': 'SO',      # Solothurn
        'Q12172': 'BS',      # Basel-Stadt
        'Q12146': 'BL',      # Basel-Landschaft
        'Q12640': 'SH',      # Schaffhausen
        'Q12573': 'AR',      # Appenzell Ausserrhoden
        'Q12094': 'AI',      # Appenzell Innerrhoden
        # 'Q12738': 'SG',    # St. Gallen — shadowed by NE below; TODO find real QID
        'Q12697': 'GR',      # Graubünden
        'Q12724': 'AG',      # Aargau
        # 'Q12771': 'TG',    # Thurgau — shadowed by VS below; TODO find real QID
        'Q12713': 'TI',      # Ticino
        # 'Q12771': 'VD',    # Vaud — shadowed by VS below; TODO find real QID
        'Q12771': 'VS',      # Valais
        'Q12738': 'NE',      # Neuchâtel
        'Q11929': 'GE',      # Geneva
        'Q12755': 'JU',      # Jura
    },
    # United Kingdom - Countries/Regions
    'GB': {
        'Q21': 'ENG',        # England
        'Q22': 'SCT',        # Scotland
        'Q25': 'WLS',        # Wales
        'Q26': 'NIR',        # Northern Ireland
        'Q84': 'LND',        # London
        # Historic counties / regions
        'Q23436': 'KEN',     # Kent
        'Q23183': 'SRY',     # Surrey
        'Q180673': 'ESS',    # Essex
        'Q189299': 'MDX',    # Middlesex (historic)
        'Q23306': 'OXF',     # Oxfordshire
        'Q23169': 'CAM',     # Cambridgeshire
        'Q179528': 'YKS',    # Yorkshire
    },
    # United States - States
    'US': {
        'Q99': 'CA',         # California
        'Q1387': 'NY',       # New York
        'Q1439': 'TX',       # Texas
        'Q779': 'FL',        # Florida
        'Q797': 'IL',        # Illinois
        'Q1400': 'PA',       # Pennsylvania
        'Q1397': 'OH',       # Ohio
        'Q1428': 'GA',       # Georgia
        'Q1223': 'MI',       # Michigan
        'Q1537': 'NC',       # North Carolina
        'Q1558': 'NJ',       # New Jersey
        'Q1370': 'VA',       # Virginia
        'Q1509': 'WA',       # Washington
        'Q1588': 'AZ',       # Arizona
        'Q1581': 'MA',       # Massachusetts
        'Q61': 'DC',         # Washington, D.C.
        'Q1408': 'MD',       # Maryland
        'Q1603': 'CO',       # Colorado
        'Q1649': 'MN',       # Minnesota
        'Q1494': 'IN',       # Indiana
        'Q1612': 'MO',       # Missouri
    },
    # Germany - Bundesländer
    'DE': {
        'Q64': 'BE',         # Berlin
        'Q1055': 'HH',       # Hamburg
        'Q980': 'BY',        # Bavaria
        'Q985': 'BW',        # Baden-Württemberg
        'Q1198': 'NW',       # North Rhine-Westphalia
        'Q1194': 'NI',       # Lower Saxony
        'Q1196': 'HE',       # Hesse
        'Q1200': 'SN',       # Saxony
        'Q1208': 'RP',       # Rhineland-Palatinate
        'Q1199': 'ST',       # Saxony-Anhalt
        'Q1201': 'TH',       # Thuringia
        'Q1197': 'SH',       # Schleswig-Holstein
        'Q1202': 'MV',       # Mecklenburg-Vorpommern
        'Q1205': 'BB',       # Brandenburg
        'Q1221': 'SL',       # Saarland
        'Q1209': 'HB',       # Bremen
    },
    # Japan - Prefectures
    'JP': {
        'Q1490': 'TKY',      # Tokyo
        'Q35765': 'OSK',     # Osaka
        'Q130266': 'KYO',    # Kyoto
        'Q52946': 'HKD',     # Hokkaido
        'Q131287': 'AIC',    # Aichi
        'Q131299': 'FKO',    # Fukuoka
        'Q131265': 'KGW',    # Kanagawa
        'Q131317': 'SIT',    # Saitama
        'Q131296': 'CHB',    # Chiba
        'Q131302': 'HYG',    # Hyogo
        'Q131292': 'SZO',    # Shizuoka
        'Q160727': 'HIR',    # Hiroshima
        'Q132681': 'NGT',    # Niigata
        'Q132692': 'ISK',    # Ishikawa
        'Q165791': 'NAR',    # Nara
    },
    # Hungary - Counties/Regions
    'HU': {
        'Q1781': 'BUD',      # Budapest
        # 'Q193478': 'PE',   # Pest — shadowed by SOM below; TODO find real QID
        'Q204050': 'BAR',    # Baranya
        'Q165883': 'BCS',    # Bács-Kiskun
        'Q204055': 'BEK',    # Békés
        'Q204054': 'BOR',    # Borsod-Abaúj-Zemplén
        'Q203518': 'CSO',    # Csongrád
        'Q192503': 'FEJ',    # Fejér
        'Q165845': 'GYM',    # Győr-Moson-Sopron
        'Q165873': 'HAJ',    # Hajdú-Bihar
        'Q193491': 'HEV',    # Heves
        'Q204051': 'JNS',    # Jász-Nagykun-Szolnok
        'Q193505': 'KOM',    # Komárom-Esztergom
        'Q204053': 'NOG',    # Nógrád
        'Q193478': 'SOM',    # Somogy
        'Q193490': 'SZB',    # Szabolcs-Szatmár-Bereg
        'Q165875': 'TOL',    # Tolna
        'Q204048': 'VAS',    # Vas
        'Q204052': 'VES',    # Veszprém
        'Q165852': 'ZAL',    # Zala
    },
    # Iran - Provinces
    'IR': {
        'Q3616': 'THR',      # Tehran
        'Q131986': 'ISF',    # Isfahan
        'Q170042': 'FAR',    # Fars
        'Q170067': 'KHU',    # Khuzestan
        'Q181109': 'AZS',    # East Azerbaijan
        'Q180972': 'AZG',    # West Azerbaijan
        'Q181158': 'GIL',    # Gilan
        'Q181165': 'MAZ',    # Mazandaran
        'Q181177': 'KER',    # Kerman
        'Q181186': 'KHO',    # Khorasan Razavi
    },
    # India - States
    'IN': {
        'Q1353': 'DEL',      # Delhi
        'Q1191': 'MAH',      # Maharashtra
        'Q1165': 'KAR',      # Karnataka
        'Q1445': 'TN',       # Tamil Nadu
        'Q1498': 'WB',       # West Bengal
        'Q1159': 'GJ',       # Gujarat
        'Q1164': 'UP',       # Uttar Pradesh
        'Q1473': 'AP',       # Andhra Pradesh
        'Q1061': 'RJ',       # Rajasthan
        'Q677': 'KL',        # Kerala
        'Q1166': 'MP',       # Madhya Pradesh
        'Q1184': 'OR',       # Odisha
        'Q1478': 'TG',       # Telangana
    },
}


def query_wikidata_location(entity_id: str) -> Optional[Dict[str, Any]]:
    """Query Wikidata for P131 (located in administrative entity) hierarchy.

    PLACEHOLDER: the SPARQL query below documents the intended request but is
    never sent anywhere — this function currently always returns ``None``.
    A real implementation would dispatch it via the wikidata-authenticated
    MCP tool (or the public SPARQL endpoint).

    Args:
        entity_id: A Wikidata entity ID such as ``"Q90"``.

    Returns:
        None (always, until the SPARQL dispatch is implemented).
    """
    # Use the wikidata-authenticated MCP tool via subprocess
    # For now, we'll implement a simple SPARQL query
    sparql_query = f"""
    SELECT ?item ?itemLabel ?admin1 ?admin1Label ?admin2 ?admin2Label ?coords
    WHERE {{
      BIND(wd:{entity_id} AS ?item)
      OPTIONAL {{
        ?item wdt:P131 ?admin1.
        ?admin1 wdt:P31/wdt:P279* wd:Q10864048.  # first-order admin division
      }}
      OPTIONAL {{
        ?item wdt:P131 ?admin2.
        ?admin2 wdt:P31/wdt:P279* wd:Q13220204.  # second-order admin division
      }}
      OPTIONAL {{ ?item wdt:P625 ?coords. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    # This is a simplified placeholder - actual implementation would use the MCP tool
    return None


def resolve_region_from_wikidata_id(wikidata_id: str, country_code: str) -> Optional[str]:
    """Try to resolve region code from Wikidata ID using our mappings.

    Args:
        wikidata_id: Wikidata entity ID (e.g. ``"Q90"``).
        country_code: ISO 3166-1 alpha-2 country code (e.g. ``"FR"``).

    Returns:
        The curated ISO 3166-2-style region code, or None when the country
        has no mapping table or the ID is not in it.
    """
    if country_code not in REGION_MAPPINGS:
        return None

    mappings = REGION_MAPPINGS[country_code]

    # Check if this Wikidata ID directly maps to a region
    if wikidata_id in mappings:
        return mappings[wikidata_id]

    return None


def extract_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Extract Wikidata entity ID from custodian data.

    Checks, in priority order: ``wikidata_enrichment.wikidata_entity_id``,
    ``original_entry.wikidata_id``, then any ``identifiers`` entry whose
    ``identifier_scheme`` is ``"Wikidata"``.

    Returns:
        The first Wikidata ID found, or None.
    """
    # Try wikidata_enrichment first
    if 'wikidata_enrichment' in data:
        wd = data['wikidata_enrichment']
        if 'wikidata_entity_id' in wd:
            return wd['wikidata_entity_id']

    # Try original_entry
    if 'original_entry' in data:
        oe = data['original_entry']
        if 'wikidata_id' in oe:
            return oe['wikidata_id']

    # Try identifiers
    if 'identifiers' in data:
        for ident in data['identifiers']:
            if ident.get('identifier_scheme') == 'Wikidata':
                return ident.get('identifier_value')

    return None


def process_file(filepath: Path, dry_run: bool = True) -> Dict[str, Any]:
    """Process a single custodian file to resolve XX region code.

    Args:
        filepath: Path to a custodian YAML file.
        dry_run: Reserved for a future write-back mode; currently unused
                 (this scanner never modifies files).

    Returns:
        A result dict describing what was found; errors are reported via
        ``result['error']`` rather than raised.
    """
    result = {
        'filepath': str(filepath),
        'has_xx_region': False,
        'wikidata_id': None,
        'country_code': None,
        'resolved_region': None,
        'updated': False,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['error'] = f"Failed to read file: {e}"
        return result

    # safe_load returns None for empty files (and scalars for degenerate
    # YAML); without this guard, data.get(...) below would raise outside
    # the try block and break the error-reporting contract.
    if not isinstance(data, dict):
        result['error'] = "Failed to read file: not a YAML mapping"
        return result

    # Check if file has XX region code
    ghcid = data.get('ghcid', {})
    location_resolution = ghcid.get('location_resolution', {})

    if location_resolution.get('region_code') != 'XX':
        return result

    result['has_xx_region'] = True
    result['country_code'] = location_resolution.get('country_code')

    # Try to get Wikidata ID
    wikidata_id = extract_wikidata_id(data)
    result['wikidata_id'] = wikidata_id

    if not wikidata_id:
        result['error'] = "No Wikidata ID found"
        return result

    # For now, we can't resolve without a SPARQL query
    # This script identifies files that need resolution
    # Actual resolution would require Wikidata API calls
    return result


def scan_xx_region_files(custodian_dir: Path) -> List[Dict[str, Any]]:
    """Scan all custodian files for XX region codes.

    Args:
        custodian_dir: Directory containing ``*.yaml`` custodian files
                       (non-recursive glob).

    Returns:
        One :func:`process_file` result per file whose region code is 'XX'.
    """
    results = []
    for filepath in custodian_dir.glob('*.yaml'):
        result = process_file(filepath, dry_run=True)
        if result['has_xx_region']:
            results.append(result)
    return results


def main():
    """Main entry point: parse CLI args and run the XX-region scanner."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131'
    )
    parser.add_argument('--scan', action='store_true',
                        help='Scan files and report XX region codes by country')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country (e.g., FR, KR)')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    print("=" * 70)
    print("XX REGION CODE RESOLUTION SCANNER")
    print("=" * 70)

    if args.scan:
        results = scan_xx_region_files(custodian_dir)

        # Group results by country code (may include None when the file
        # records no country_code)
        by_country: Dict[Optional[str], List[Dict[str, Any]]] = {}
        for r in results:
            by_country.setdefault(r['country_code'], []).append(r)

        print(f"\nTotal files with XX region code: {len(results)}")
        print("\nBy country:")
        # Largest countries first; negating len keeps ties in insertion order
        for cc in sorted(by_country.keys(), key=lambda x: -len(by_country[x])):
            count = len(by_country[cc])
            has_mapping = cc in REGION_MAPPINGS
            mapping_status = "HAS MAPPING" if has_mapping else "NO MAPPING"
            print(f"  {cc}: {count:4d} files [{mapping_status}]")

        # Count files with/without Wikidata IDs
        with_wd = sum(1 for r in results if r['wikidata_id'])
        without_wd = len(results) - with_wd
        print(f"\nWith Wikidata ID: {with_wd}")
        print(f"Without Wikidata ID: {without_wd}")

        # Sample files for the requested country (first 10)
        if args.country:
            country_results = by_country.get(args.country, [])
            print(f"\n{args.country} files ({len(country_results)} total):")
            for r in country_results[:10]:
                wd = r['wikidata_id'] or 'NO_WIKIDATA'
                print(f"  {Path(r['filepath']).name}: {wd}")
    else:
        print("\nUsage:")
        print("  --scan              Scan and report XX region codes by country")
        print("  --country XX        Filter to specific country code")
        print("\nExample:")
        print("  python resolve_xx_regions.py --scan")
        print("  python resolve_xx_regions.py --scan --country FR")


if __name__ == '__main__':
    main()