# File: glam/scripts/resolve_xx_regions.py
# Last modified: 2025-12-07 00:26:01 +01:00
# 456 lines / 15 KiB / Python
#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 (located in administrative territorial entity).
Uses Wikidata SPARQL to query the administrative hierarchy for each entity
and maps to ISO 3166-2 region codes where possible.
Following AGENTS.md Rule 5: Additive only - never delete existing data.
"""
import os
import sys
import yaml
import time
from datetime import datetime, timezone
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from typing import Optional, Dict, Any, List, Tuple
# ISO 3166-2 region code mappings for countries we frequently encounter
# Format: country_code: {wikidata_region_id: iso_region_code}
# These are manually curated for accuracy
# NOTE(review): several of the 3-letter values below (e.g. KR 'SEO', CN 'BEI',
# JP 'TKY') are not the official ISO 3166-2 subdivision codes — presumably a
# project-internal scheme; confirm against the consumers of region_code before
# treating them as ISO values.
REGION_MAPPINGS = {
    # France - Regions
    'FR': {
        'Q90': 'IDF', # Paris (Île-de-France)
        'Q13917': 'IDF', # Île-de-France
        'Q15104': 'PAC', # Provence-Alpes-Côte d'Azur
        'Q18677983': 'ARA', # Auvergne-Rhône-Alpes
        'Q18677767': 'NAQ', # Nouvelle-Aquitaine
        'Q18677875': 'OCC', # Occitanie
        'Q18677757': 'GES', # Grand Est
        'Q18677902': 'HDF', # Hauts-de-France
        'Q12130': 'NOR', # Normandie
        'Q16994': 'BRE', # Brittany
        'Q16993': 'PDL', # Pays de la Loire
        'Q13947': 'CVL', # Centre-Val de Loire
        'Q18678082': 'BFC', # Bourgogne-Franche-Comté
        'Q15276': 'COR', # Corsica
    },
    # South Korea - Provinces/Special Cities
    'KR': {
        'Q8684': 'SEO', # Seoul
        'Q16520': 'BUS', # Busan
        'Q41684': 'DAE', # Daegu
        'Q41588': 'INC', # Incheon
        'Q42622': 'GWJ', # Gwangju
        'Q42833': 'DAJ', # Daejeon
        'Q484637': 'ULS', # Ulsan
        'Q20925': 'SEJ', # Sejong
        'Q41076': 'GGI', # Gyeonggi
        'Q41115': 'GAW', # Gangwon
        'Q41392': 'CHB', # North Chungcheong
        'Q41442': 'CHN', # South Chungcheong
        'Q41213': 'JEO', # North Jeolla
        'Q41283': 'JEN', # South Jeolla
        'Q41171': 'GYB', # North Gyeongsang
        'Q41312': 'GYN', # South Gyeongsang
        'Q41872': 'JEJ', # Jeju
    },
    # China - Provinces
    'CN': {
        'Q956': 'BEI', # Beijing
        'Q8686': 'SHA', # Shanghai
        'Q15174': 'GUD', # Guangdong
        'Q16572': 'ZHE', # Zhejiang
        'Q16963': 'JIS', # Jiangsu
        'Q43684': 'SHX', # Shaanxi
        'Q45761': 'SCH', # Sichuan
        'Q41079': 'HUN', # Hunan
        'Q46862': 'HUB', # Hubei
        'Q46913': 'HEN', # Henan
        'Q16952': 'SDG', # Shandong
        'Q21208': 'FUJ', # Fujian
        'Q43194': 'YUN', # Yunnan
        'Q40285': 'GUX', # Guangxi
        'Q46491': 'ANH', # Anhui
        'Q47097': 'JIL', # Jilin
        'Q19188': 'LIA', # Liaoning
        'Q19206': 'HEI', # Heilongjiang
        'Q57251': 'XIN', # Xinjiang
        'Q17188': 'TIB', # Tibet
        'Q41705': 'HAI', # Hainan
        'Q15184': 'SHG', # Shanxi
        'Q47165': 'GAN', # Gansu
        'Q45646': 'GUI', # Guizhou
        'Q46865': 'JIX', # Jiangxi
        'Q46684': 'NMG', # Inner Mongolia
        'Q57448': 'QIN', # Qinghai
        'Q57958': 'NXA', # Ningxia
        'Q8646': 'HKG', # Hong Kong
        'Q14773': 'MAC', # Macau
        'Q15175': 'TIJ', # Tianjin
        'Q11725': 'CHQ', # Chongqing
        'Q46863': 'HEB', # Hebei
    },
    # Switzerland - Cantons
    # FIXME: this table contains duplicate QID keys (Q12094, Q12738, Q12755,
    # Q12771). Python silently keeps only the LAST occurrence of a duplicate
    # dict key, so the earlier entries flagged below are dead and those
    # cantons are effectively unmapped. The canton QIDs need to be
    # re-verified against Wikidata before resolution is enabled for CH.
    'CH': {
        'Q11911': 'ZH', # Zürich
        'Q12079': 'BE', # Bern
        'Q12094': 'LU', # Luzern — FIXME: duplicate key; overwritten by the 'AI' entry below
        'Q12433': 'UR', # Uri
        'Q12592': 'SZ', # Schwyz
        'Q12721': 'OW', # Obwalden
        'Q12755': 'NW', # Nidwalden — FIXME: duplicate key; overwritten by the 'JU' entry below
        'Q11922': 'GL', # Glarus
        'Q11933': 'ZG', # Zug
        'Q834': 'FR', # Fribourg
        'Q12746': 'SO', # Solothurn
        'Q12172': 'BS', # Basel-Stadt
        'Q12146': 'BL', # Basel-Landschaft
        'Q12640': 'SH', # Schaffhausen
        'Q12573': 'AR', # Appenzell Ausserrhoden
        'Q12094': 'AI', # Appenzell Innerrhoden — FIXME: same QID as Luzern above; this entry wins
        'Q12738': 'SG', # St. Gallen — FIXME: duplicate key; overwritten by the 'NE' entry below
        'Q12697': 'GR', # Graubünden
        'Q12724': 'AG', # Aargau
        'Q12771': 'TG', # Thurgau — FIXME: duplicate key; overwritten by 'VS' below
        'Q12713': 'TI', # Ticino
        'Q12771': 'VD', # Vaud — FIXME: duplicate key; overwritten by 'VS' below
        'Q12771': 'VS', # Valais — FIXME: same QID as Thurgau/Vaud above; this entry wins
        'Q12738': 'NE', # Neuchâtel — FIXME: same QID as St. Gallen above; this entry wins
        'Q11929': 'GE', # Geneva
        'Q12755': 'JU', # Jura — FIXME: same QID as Nidwalden above; this entry wins
    },
    # United Kingdom - Countries/Regions
    'GB': {
        'Q21': 'ENG', # England
        'Q22': 'SCT', # Scotland
        'Q25': 'WLS', # Wales
        'Q26': 'NIR', # Northern Ireland
        'Q84': 'LND', # London
        # Historic counties / regions
        'Q23436': 'KEN', # Kent
        'Q23183': 'SRY', # Surrey
        'Q180673': 'ESS', # Essex
        'Q189299': 'MDX', # Middlesex (historic)
        'Q23306': 'OXF', # Oxfordshire
        'Q23169': 'CAM', # Cambridgeshire
        'Q179528': 'YKS', # Yorkshire
    },
    # United States - States
    'US': {
        'Q99': 'CA', # California
        'Q1387': 'NY', # New York
        'Q1439': 'TX', # Texas
        'Q779': 'FL', # Florida
        'Q797': 'IL', # Illinois
        'Q1400': 'PA', # Pennsylvania
        'Q1397': 'OH', # Ohio
        'Q1428': 'GA', # Georgia
        'Q1223': 'MI', # Michigan
        'Q1537': 'NC', # North Carolina
        'Q1558': 'NJ', # New Jersey
        'Q1370': 'VA', # Virginia
        'Q1509': 'WA', # Washington
        'Q1588': 'AZ', # Arizona
        'Q1581': 'MA', # Massachusetts
        'Q61': 'DC', # Washington, D.C.
        'Q1408': 'MD', # Maryland
        'Q1603': 'CO', # Colorado
        'Q1649': 'MN', # Minnesota
        'Q1494': 'IN', # Indiana
        'Q1612': 'MO', # Missouri
    },
    # Germany - Bundesländer
    'DE': {
        'Q64': 'BE', # Berlin
        'Q1055': 'HH', # Hamburg
        'Q980': 'BY', # Bavaria
        'Q985': 'BW', # Baden-Württemberg
        'Q1198': 'NW', # North Rhine-Westphalia
        'Q1194': 'NI', # Lower Saxony
        'Q1196': 'HE', # Hesse
        'Q1200': 'SN', # Saxony
        'Q1208': 'RP', # Rhineland-Palatinate
        'Q1199': 'ST', # Saxony-Anhalt
        'Q1201': 'TH', # Thuringia
        'Q1197': 'SH', # Schleswig-Holstein
        'Q1202': 'MV', # Mecklenburg-Vorpommern
        'Q1205': 'BB', # Brandenburg
        'Q1221': 'SL', # Saarland
        'Q1209': 'HB', # Bremen
    },
    # Japan - Prefectures
    'JP': {
        'Q1490': 'TKY', # Tokyo
        'Q35765': 'OSK', # Osaka
        'Q130266': 'KYO', # Kyoto
        'Q52946': 'HKD', # Hokkaido
        'Q131287': 'AIC', # Aichi
        'Q131299': 'FKO', # Fukuoka
        'Q131265': 'KGW', # Kanagawa
        'Q131317': 'SIT', # Saitama
        'Q131296': 'CHB', # Chiba
        'Q131302': 'HYG', # Hyogo
        'Q131292': 'SZO', # Shizuoka
        'Q160727': 'HIR', # Hiroshima
        'Q132681': 'NGT', # Niigata
        'Q132692': 'ISK', # Ishikawa
        'Q165791': 'NAR', # Nara
    },
    # Hungary - Counties/Regions
    # FIXME: Q193478 appears twice below (PE and SOM); only the 'SOM' entry
    # survives. Verify the real QIDs for Pest and Somogy counties.
    'HU': {
        'Q1781': 'BUD', # Budapest
        'Q193478': 'PE', # Pest — FIXME: duplicate key; overwritten by the 'SOM' entry below
        'Q204050': 'BAR', # Baranya
        'Q165883': 'BCS', # Bács-Kiskun
        'Q204055': 'BEK', # Békés
        'Q204054': 'BOR', # Borsod-Abaúj-Zemplén
        'Q203518': 'CSO', # Csongrád
        'Q192503': 'FEJ', # Fejér
        'Q165845': 'GYM', # Győr-Moson-Sopron
        'Q165873': 'HAJ', # Hajdú-Bihar
        'Q193491': 'HEV', # Heves
        'Q204051': 'JNS', # Jász-Nagykun-Szolnok
        'Q193505': 'KOM', # Komárom-Esztergom
        'Q204053': 'NOG', # Nógrád
        'Q193478': 'SOM', # Somogy — FIXME: same QID as Pest above; this entry wins
        'Q193490': 'SZB', # Szabolcs-Szatmár-Bereg
        'Q165875': 'TOL', # Tolna
        'Q204048': 'VAS', # Vas
        'Q204052': 'VES', # Veszprém
        'Q165852': 'ZAL', # Zala
    },
    # Iran - Provinces
    'IR': {
        'Q3616': 'THR', # Tehran
        'Q131986': 'ISF', # Isfahan
        'Q170042': 'FAR', # Fars
        'Q170067': 'KHU', # Khuzestan
        'Q181109': 'AZS', # East Azerbaijan
        'Q180972': 'AZG', # West Azerbaijan
        'Q181158': 'GIL', # Gilan
        'Q181165': 'MAZ', # Mazandaran
        'Q181177': 'KER', # Kerman
        'Q181186': 'KHO', # Khorasan Razavi
    },
    # India - States
    'IN': {
        'Q1353': 'DEL', # Delhi
        'Q1191': 'MAH', # Maharashtra
        'Q1165': 'KAR', # Karnataka
        'Q1445': 'TN', # Tamil Nadu
        'Q1498': 'WB', # West Bengal
        'Q1159': 'GJ', # Gujarat
        'Q1164': 'UP', # Uttar Pradesh
        'Q1473': 'AP', # Andhra Pradesh
        'Q1061': 'RJ', # Rajasthan
        'Q677': 'KL', # Kerala
        'Q1166': 'MP', # Madhya Pradesh
        'Q1184': 'OR', # Odisha
        'Q1478': 'TG', # Telangana
    },
}
def query_wikidata_location(entity_id: str) -> Optional[Dict[str, Any]]:
    """Query Wikidata for P131 (located in administrative entity) hierarchy.

    Placeholder implementation: it composes the SPARQL query for the given
    entity but performs no network call, so it currently always returns None.
    """
    query = f"""
SELECT ?item ?itemLabel ?admin1 ?admin1Label ?admin2 ?admin2Label ?coords WHERE {{
BIND(wd:{entity_id} AS ?item)
OPTIONAL {{
?item wdt:P131 ?admin1.
?admin1 wdt:P31/wdt:P279* wd:Q10864048. # first-order admin division
}}
OPTIONAL {{
?item wdt:P131 ?admin2.
?admin2 wdt:P31/wdt:P279* wd:Q13220204. # second-order admin division
}}
OPTIONAL {{
?item wdt:P625 ?coords.
}}
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
}}
LIMIT 1
"""
    # TODO: submit `query` to the Wikidata SPARQL endpoint (or the
    # wikidata-authenticated MCP tool) and parse the bindings. Until then
    # callers must treat the None return as "could not resolve".
    return None
def resolve_region_from_wikidata_id(wikidata_id: str, country_code: str) -> Optional[str]:
    """Try to resolve region code from Wikidata ID using our mappings.

    Returns the curated region code when both the country and the Wikidata
    QID appear in REGION_MAPPINGS, otherwise None.
    """
    country_map = REGION_MAPPINGS.get(country_code)
    if country_map is None:
        return None
    # A direct hit means the entity itself is one of the mapped regions.
    return country_map.get(wikidata_id)
def extract_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Extract Wikidata entity ID from custodian data.

    Sources are checked in priority order: the wikidata_enrichment section,
    then original_entry, and finally the identifiers list. Returns None when
    no source carries an ID.
    """
    # Direct section/key lookups, highest-priority source first.
    for section, key in (('wikidata_enrichment', 'wikidata_entity_id'),
                         ('original_entry', 'wikidata_id')):
        if section in data and key in data[section]:
            return data[section][key]
    # Fall back to scanning the identifiers list for a Wikidata scheme entry.
    if 'identifiers' in data:
        for ident in data['identifiers']:
            if ident.get('identifier_scheme') == 'Wikidata':
                return ident.get('identifier_value')
    return None
def process_file(filepath: Path, dry_run: bool = True) -> Dict[str, Any]:
    """Process a single custodian file to resolve XX region code.

    Args:
        filepath: Path to the custodian YAML file.
        dry_run: Reserved for a future write mode; currently unused —
            this script only reports and never modifies files.

    Returns:
        A report dict with keys: filepath, has_xx_region, wikidata_id,
        country_code, resolved_region, updated, error. Errors are recorded
        in the 'error' field rather than raised.
    """
    result = {
        'filepath': str(filepath),
        'has_xx_region': False,
        'wikidata_id': None,
        'country_code': None,
        'resolved_region': None,
        'updated': False,
        'error': None
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['error'] = f"Failed to read file: {e}"
        return result
    # BUG FIX: yaml.safe_load returns None for an empty document and a
    # scalar/list for non-mapping documents; the old code crashed with an
    # uncaught AttributeError on data.get(). Record an error instead.
    if not isinstance(data, dict):
        result['error'] = "File is empty or not a YAML mapping"
        return result
    # Check if file has XX region code; tolerate missing or non-mapping
    # 'ghcid'/'location_resolution' sections (a None value used to crash).
    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        ghcid = {}
    location_resolution = ghcid.get('location_resolution')
    if not isinstance(location_resolution, dict):
        location_resolution = {}
    if location_resolution.get('region_code') != 'XX':
        return result
    result['has_xx_region'] = True
    result['country_code'] = location_resolution.get('country_code')
    # Try to get Wikidata ID
    wikidata_id = extract_wikidata_id(data)
    result['wikidata_id'] = wikidata_id
    if not wikidata_id:
        result['error'] = "No Wikidata ID found"
        return result
    # For now, we can't resolve without a SPARQL query.
    # This script identifies files that need resolution; actual resolution
    # would require Wikidata API calls (see query_wikidata_location).
    return result
def scan_xx_region_files(custodian_dir: Path) -> List[Dict[str, Any]]:
    """Scan all custodian files for XX region codes.

    Runs process_file over every *.yaml file in custodian_dir and returns
    only the reports whose region code is the 'XX' placeholder.
    """
    reports = (process_file(path, dry_run=True)
               for path in custodian_dir.glob('*.yaml'))
    return [report for report in reports if report['has_xx_region']]
def main():
    """Main entry point.

    Parses CLI flags and, with --scan, reports how many custodian files
    still carry the placeholder 'XX' region code, grouped by country.
    Without --scan it prints usage examples. Exits with status 1 when the
    custodian directory does not exist.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131'
    )
    parser.add_argument('--scan', action='store_true',
                        help='Scan files and report XX region codes by country')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country (e.g., FR, KR)')
    args = parser.parse_args()
    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)
    print("=" * 70)
    print("XX REGION CODE RESOLUTION SCANNER")
    print("=" * 70)
    if args.scan:
        results = scan_xx_region_files(custodian_dir)
        # Group the XX reports by their country code (may include None
        # when a file has no country_code).
        by_country = {}
        for r in results:
            cc = r['country_code']
            if cc not in by_country:
                by_country[cc] = []
            by_country[cc].append(r)
        print(f"\nTotal files with XX region code: {len(results)}")
        print(f"\nBy country:")
        # Countries with the most unresolved files first.
        for cc in sorted(by_country.keys(), key=lambda x: -len(by_country[x])):
            count = len(by_country[cc])
            # Whether resolution is even possible once Wikidata IDs exist.
            has_mapping = cc in REGION_MAPPINGS
            mapping_status = "HAS MAPPING" if has_mapping else "NO MAPPING"
            print(f" {cc}: {count:4d} files [{mapping_status}]")
        # Count files with/without Wikidata IDs — files without one cannot
        # be resolved at all and need enrichment first.
        with_wd = sum(1 for r in results if r['wikidata_id'])
        without_wd = len(results) - with_wd
        print(f"\nWith Wikidata ID: {with_wd}")
        print(f"Without Wikidata ID: {without_wd}")
        # Optionally show up to 10 sample files for one country.
        if args.country:
            country_results = by_country.get(args.country, [])
            print(f"\n{args.country} files ({len(country_results)} total):")
            for r in country_results[:10]:
                wd = r['wikidata_id'] or 'NO_WIKIDATA'
                print(f" {Path(r['filepath']).name}: {wd}")
    else:
        # No --scan flag: print usage help instead of doing any work.
        print("\nUsage:")
        print(" --scan Scan and report XX region codes by country")
        print(" --country XX Filter to specific country code")
        print("\nExample:")
        print(" python resolve_xx_regions.py --scan")
        print(" python resolve_xx_regions.py --scan --country FR")
if __name__ == '__main__':
    main()