#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 (located in administrative territorial entity).

Uses Wikidata SPARQL to query the administrative hierarchy for each entity
and maps to ISO 3166-2 region codes where possible.

Following AGENTS.md Rule 5: Additive only - never delete existing data.
"""
import os
import sys
import yaml  # third-party (PyYAML); used to parse the custodian .yaml files
import time
from datetime import datetime, timezone
from pathlib import Path

# Add parent directory to path for imports
# (lets sibling repo packages resolve when the script is run directly).
sys.path.insert(0, str(Path(__file__).parent.parent))

from typing import Optional, Dict, Any, List, Tuple

# NOTE(review): os, time, datetime/timezone and Tuple are not referenced in
# the code visible in this file — presumably used by code elsewhere or
# leftovers; verify before removing.
|
|
|
|
|
|
# ISO 3166-2 region code mappings for countries we frequently encounter
# Format: country_code: {wikidata_region_id: iso_region_code}
# These are manually curated for accuracy
#
# NOTE(review): several sections use codes that do not look like official
# ISO 3166-2 subdivision codes (ISO 3166-2:KR uses numeric codes such as
# KR-11, ISO 3166-2:CN uses 2-letter codes such as CN-BJ, and GB has no
# 'MDX'/'YKS' subdivisions). Confirm whether a project-internal scheme is
# intended before treating these values as ISO codes.

REGION_MAPPINGS = {
    # France - Regions
    'FR': {
        'Q90': 'IDF',        # Paris (Île-de-France)
        'Q13917': 'IDF',     # Île-de-France
        'Q15104': 'PAC',     # Provence-Alpes-Côte d'Azur
        'Q18677983': 'ARA',  # Auvergne-Rhône-Alpes
        'Q18677767': 'NAQ',  # Nouvelle-Aquitaine
        'Q18677875': 'OCC',  # Occitanie
        'Q18677757': 'GES',  # Grand Est
        'Q18677902': 'HDF',  # Hauts-de-France
        'Q12130': 'NOR',     # Normandie
        'Q16994': 'BRE',     # Brittany
        'Q16993': 'PDL',     # Pays de la Loire
        'Q13947': 'CVL',     # Centre-Val de Loire
        'Q18678082': 'BFC',  # Bourgogne-Franche-Comté
        'Q15276': 'COR',     # Corsica
    },
    # South Korea - Provinces/Special Cities
    'KR': {
        'Q8684': 'SEO',      # Seoul
        'Q16520': 'BUS',     # Busan
        'Q41684': 'DAE',     # Daegu
        'Q41588': 'INC',     # Incheon
        'Q42622': 'GWJ',     # Gwangju
        'Q42833': 'DAJ',     # Daejeon
        'Q484637': 'ULS',    # Ulsan
        'Q20925': 'SEJ',     # Sejong
        'Q41076': 'GGI',     # Gyeonggi
        'Q41115': 'GAW',     # Gangwon
        'Q41392': 'CHB',     # North Chungcheong
        'Q41442': 'CHN',     # South Chungcheong
        'Q41213': 'JEO',     # North Jeolla
        'Q41283': 'JEN',     # South Jeolla
        'Q41171': 'GYB',     # North Gyeongsang
        'Q41312': 'GYN',     # South Gyeongsang
        'Q41872': 'JEJ',     # Jeju
    },
    # China - Provinces
    'CN': {
        'Q956': 'BEI',       # Beijing
        'Q8686': 'SHA',      # Shanghai
        'Q15174': 'GUD',     # Guangdong
        'Q16572': 'ZHE',     # Zhejiang
        'Q16963': 'JIS',     # Jiangsu
        'Q43684': 'SHX',     # Shaanxi
        'Q45761': 'SCH',     # Sichuan
        'Q41079': 'HUN',     # Hunan
        'Q46862': 'HUB',     # Hubei
        'Q46913': 'HEN',     # Henan
        'Q16952': 'SDG',     # Shandong
        'Q21208': 'FUJ',     # Fujian
        'Q43194': 'YUN',     # Yunnan
        'Q40285': 'GUX',     # Guangxi
        'Q46491': 'ANH',     # Anhui
        'Q47097': 'JIL',     # Jilin
        'Q19188': 'LIA',     # Liaoning
        'Q19206': 'HEI',     # Heilongjiang
        'Q57251': 'XIN',     # Xinjiang
        'Q17188': 'TIB',     # Tibet
        'Q41705': 'HAI',     # Hainan
        'Q15184': 'SHG',     # Shanxi
        'Q47165': 'GAN',     # Gansu
        'Q45646': 'GUI',     # Guizhou
        'Q46865': 'JIX',     # Jiangxi
        'Q46684': 'NMG',     # Inner Mongolia
        'Q57448': 'QIN',     # Qinghai
        'Q57958': 'NXA',     # Ningxia
        'Q8646': 'HKG',      # Hong Kong
        'Q14773': 'MAC',     # Macau
        'Q15175': 'TIJ',     # Tianjin
        'Q11725': 'CHQ',     # Chongqing
        'Q46863': 'HEB',     # Hebei
    },
    # Switzerland - Cantons
    #
    # NOTE(review): this section contains DUPLICATE dict keys — Q12094
    # (LU/AI), Q12738 (SG/NE), Q12755 (NW/JU) and Q12771 (TG/VD/VS).
    # Python keeps only the LAST occurrence of each key, so the earlier
    # mappings (LU, SG, NW, TG, VD) are silently lost at runtime. At least
    # one QID in each duplicate pair must be wrong — verify every QID
    # against Wikidata before relying on this table.
    'CH': {
        'Q11911': 'ZH',      # Zürich
        'Q12079': 'BE',      # Bern
        'Q12094': 'LU',      # Luzern  (overridden by the 'AI' entry below)
        'Q12433': 'UR',      # Uri
        'Q12592': 'SZ',      # Schwyz
        'Q12721': 'OW',      # Obwalden
        'Q12755': 'NW',      # Nidwalden  (overridden by the 'JU' entry below)
        'Q11922': 'GL',      # Glarus
        'Q11933': 'ZG',      # Zug
        'Q834': 'FR',        # Fribourg
        'Q12746': 'SO',      # Solothurn
        'Q12172': 'BS',      # Basel-Stadt
        'Q12146': 'BL',      # Basel-Landschaft
        'Q12640': 'SH',      # Schaffhausen
        'Q12573': 'AR',      # Appenzell Ausserrhoden
        'Q12094': 'AI',      # Appenzell Innerrhoden  (duplicate key — see note)
        'Q12738': 'SG',      # St. Gallen  (overridden by the 'NE' entry below)
        'Q12697': 'GR',      # Graubünden
        'Q12724': 'AG',      # Aargau
        'Q12771': 'TG',      # Thurgau  (overridden twice below)
        'Q12713': 'TI',      # Ticino
        'Q12771': 'VD',      # Vaud  (duplicate key — see note)
        'Q12771': 'VS',      # Valais  (duplicate key — see note)
        'Q12738': 'NE',      # Neuchâtel  (duplicate key — see note)
        'Q11929': 'GE',      # Geneva
        'Q12755': 'JU',      # Jura  (duplicate key — see note)
    },
    # United Kingdom - Countries/Regions
    'GB': {
        'Q21': 'ENG',        # England
        'Q22': 'SCT',        # Scotland
        'Q25': 'WLS',        # Wales
        'Q26': 'NIR',        # Northern Ireland
        'Q84': 'LND',        # London
        # Historic counties / regions
        'Q23436': 'KEN',     # Kent
        'Q23183': 'SRY',     # Surrey
        'Q180673': 'ESS',    # Essex
        'Q189299': 'MDX',    # Middlesex (historic)
        'Q23306': 'OXF',     # Oxfordshire
        'Q23169': 'CAM',     # Cambridgeshire
        'Q179528': 'YKS',    # Yorkshire
    },
    # United States - States
    'US': {
        'Q99': 'CA',         # California
        'Q1387': 'NY',       # New York
        'Q1439': 'TX',       # Texas
        'Q779': 'FL',        # Florida
        'Q797': 'IL',        # Illinois
        'Q1400': 'PA',       # Pennsylvania
        'Q1397': 'OH',       # Ohio
        'Q1428': 'GA',       # Georgia
        'Q1223': 'MI',       # Michigan
        'Q1537': 'NC',       # North Carolina
        'Q1558': 'NJ',       # New Jersey
        'Q1370': 'VA',       # Virginia
        'Q1509': 'WA',       # Washington
        'Q1588': 'AZ',       # Arizona
        'Q1581': 'MA',       # Massachusetts
        'Q61': 'DC',         # Washington, D.C.
        'Q1408': 'MD',       # Maryland
        'Q1603': 'CO',       # Colorado
        'Q1649': 'MN',       # Minnesota
        'Q1494': 'IN',       # Indiana
        'Q1612': 'MO',       # Missouri
    },
    # Germany - Bundesländer
    'DE': {
        'Q64': 'BE',         # Berlin
        'Q1055': 'HH',       # Hamburg
        'Q980': 'BY',        # Bavaria
        'Q985': 'BW',        # Baden-Württemberg
        'Q1198': 'NW',       # North Rhine-Westphalia
        'Q1194': 'NI',       # Lower Saxony
        'Q1196': 'HE',       # Hesse
        'Q1200': 'SN',       # Saxony
        'Q1208': 'RP',       # Rhineland-Palatinate
        'Q1199': 'ST',       # Saxony-Anhalt
        'Q1201': 'TH',       # Thuringia
        'Q1197': 'SH',       # Schleswig-Holstein
        'Q1202': 'MV',       # Mecklenburg-Vorpommern
        'Q1205': 'BB',       # Brandenburg
        'Q1221': 'SL',       # Saarland
        'Q1209': 'HB',       # Bremen
    },
    # Japan - Prefectures
    'JP': {
        'Q1490': 'TKY',      # Tokyo
        'Q35765': 'OSK',     # Osaka
        'Q130266': 'KYO',    # Kyoto
        'Q52946': 'HKD',     # Hokkaido
        'Q131287': 'AIC',    # Aichi
        'Q131299': 'FKO',    # Fukuoka
        'Q131265': 'KGW',    # Kanagawa
        'Q131317': 'SIT',    # Saitama
        'Q131296': 'CHB',    # Chiba
        'Q131302': 'HYG',    # Hyogo
        'Q131292': 'SZO',    # Shizuoka
        'Q160727': 'HIR',    # Hiroshima
        'Q132681': 'NGT',    # Niigata
        'Q132692': 'ISK',    # Ishikawa
        'Q165791': 'NAR',    # Nara
    },
    # Hungary - Counties/Regions
    #
    # NOTE(review): duplicate key 'Q193478' (PE and SOM) — Python keeps the
    # later 'SOM' entry, so the Pest mapping is silently lost. One of the
    # two QIDs must be wrong; verify against Wikidata.
    'HU': {
        'Q1781': 'BUD',      # Budapest
        'Q193478': 'PE',     # Pest  (overridden by the 'SOM' entry below)
        'Q204050': 'BAR',    # Baranya
        'Q165883': 'BCS',    # Bács-Kiskun
        'Q204055': 'BEK',    # Békés
        'Q204054': 'BOR',    # Borsod-Abaúj-Zemplén
        'Q203518': 'CSO',    # Csongrád
        'Q192503': 'FEJ',    # Fejér
        'Q165845': 'GYM',    # Győr-Moson-Sopron
        'Q165873': 'HAJ',    # Hajdú-Bihar
        'Q193491': 'HEV',    # Heves
        'Q204051': 'JNS',    # Jász-Nagykun-Szolnok
        'Q193505': 'KOM',    # Komárom-Esztergom
        'Q204053': 'NOG',    # Nógrád
        'Q193478': 'SOM',    # Somogy  (duplicate key — see note)
        'Q193490': 'SZB',    # Szabolcs-Szatmár-Bereg
        'Q165875': 'TOL',    # Tolna
        'Q204048': 'VAS',    # Vas
        'Q204052': 'VES',    # Veszprém
        'Q165852': 'ZAL',    # Zala
    },
    # Iran - Provinces
    'IR': {
        'Q3616': 'THR',      # Tehran
        'Q131986': 'ISF',    # Isfahan
        'Q170042': 'FAR',    # Fars
        'Q170067': 'KHU',    # Khuzestan
        'Q181109': 'AZS',    # East Azerbaijan
        'Q180972': 'AZG',    # West Azerbaijan
        'Q181158': 'GIL',    # Gilan
        'Q181165': 'MAZ',    # Mazandaran
        'Q181177': 'KER',    # Kerman
        'Q181186': 'KHO',    # Khorasan Razavi
    },
    # India - States
    'IN': {
        'Q1353': 'DEL',      # Delhi
        'Q1191': 'MAH',      # Maharashtra
        'Q1165': 'KAR',      # Karnataka
        'Q1445': 'TN',       # Tamil Nadu
        'Q1498': 'WB',       # West Bengal
        'Q1159': 'GJ',       # Gujarat
        'Q1164': 'UP',       # Uttar Pradesh
        'Q1473': 'AP',       # Andhra Pradesh
        'Q1061': 'RJ',       # Rajasthan
        'Q677': 'KL',        # Kerala
        'Q1166': 'MP',       # Madhya Pradesh
        'Q1184': 'OR',       # Odisha
        'Q1478': 'TG',       # Telangana
    },
}
|
|
|
|
|
|
def query_wikidata_location(entity_id: str) -> Optional[Dict[str, Any]]:
    """Query Wikidata for the P131 (located in administrative entity) hierarchy.

    Currently a stub: the SPARQL text below documents the intended request,
    but no network call is made and None is always returned.
    """
    # Intended query: first/second-order admin divisions plus coordinates
    # for the entity, with English labels from the label service.
    query = f"""
    SELECT ?item ?itemLabel ?admin1 ?admin1Label ?admin2 ?admin2Label ?coords WHERE {{
      BIND(wd:{entity_id} AS ?item)

      OPTIONAL {{
        ?item wdt:P131 ?admin1.
        ?admin1 wdt:P31/wdt:P279* wd:Q10864048. # first-order admin division
      }}

      OPTIONAL {{
        ?item wdt:P131 ?admin2.
        ?admin2 wdt:P31/wdt:P279* wd:Q13220204. # second-order admin division
      }}

      OPTIONAL {{
        ?item wdt:P625 ?coords.
      }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """

    # Placeholder - a real implementation would submit `query` to the
    # Wikidata SPARQL endpoint (or the wikidata-authenticated MCP tool)
    # and translate the result bindings into a dict.
    return None
|
|
|
|
|
|
def resolve_region_from_wikidata_id(wikidata_id: str, country_code: str) -> Optional[str]:
    """Map a Wikidata entity ID to a region code via the curated tables.

    Returns None when the country has no mapping table, or when the ID is
    not a known region for that country.
    """
    country_table = REGION_MAPPINGS.get(country_code)
    if country_table is None:
        return None

    # Direct lookup only — no hierarchy traversal is attempted here.
    return country_table.get(wikidata_id)
|
|
|
|
|
|
def extract_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Extract a Wikidata entity ID from custodian data.

    Sources are consulted in priority order: the wikidata_enrichment
    section, then original_entry, then the identifiers list. Returns None
    when no source carries an ID.
    """
    # Priority 1: the enrichment section.
    if 'wikidata_enrichment' in data:
        enrichment = data['wikidata_enrichment']
        if 'wikidata_entity_id' in enrichment:
            return enrichment['wikidata_entity_id']

    # Priority 2: the record as originally ingested.
    if 'original_entry' in data:
        original = data['original_entry']
        if 'wikidata_id' in original:
            return original['wikidata_id']

    # Priority 3: the first identifier entry using the Wikidata scheme.
    if 'identifiers' in data:
        return next(
            (entry.get('identifier_value')
             for entry in data['identifiers']
             if entry.get('identifier_scheme') == 'Wikidata'),
            None,
        )

    return None
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = True) -> Dict[str, Any]:
    """Process a single custodian file to resolve XX region code.

    Args:
        filepath: Path to the custodian YAML file.
        dry_run: Currently unused; kept for interface compatibility (this
            function never writes back to the file yet).

    Returns:
        A result dict with: 'filepath', 'has_xx_region', 'wikidata_id',
        'country_code', 'resolved_region', 'updated', and 'error' (a
        message string when the file could not be processed, else None).
    """
    result = {
        'filepath': str(filepath),
        'has_xx_region': False,
        'wikidata_id': None,
        'country_code': None,
        'resolved_region': None,
        'updated': False,
        'error': None
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['error'] = f"Failed to read file: {e}"
        return result

    # Fix: yaml.safe_load returns None for an empty file (and a scalar or
    # list for non-mapping content); the .get() calls below would then raise
    # AttributeError outside the try block. Report it as an error instead.
    if not isinstance(data, dict):
        result['error'] = "File is empty or not a YAML mapping"
        return result

    # Only files whose resolved region is the 'XX' placeholder need work.
    ghcid = data.get('ghcid', {})
    location_resolution = ghcid.get('location_resolution', {}) if isinstance(ghcid, dict) else {}

    if location_resolution.get('region_code') != 'XX':
        return result

    result['has_xx_region'] = True
    result['country_code'] = location_resolution.get('country_code')

    # Try to get Wikidata ID from any of the known locations in the record.
    wikidata_id = extract_wikidata_id(data)
    result['wikidata_id'] = wikidata_id

    if not wikidata_id:
        result['error'] = "No Wikidata ID found"
        return result

    # For now, we can't resolve without a SPARQL query: this script only
    # identifies files that need resolution; actual resolution would
    # require Wikidata API calls.
    return result
|
|
|
|
|
|
def scan_xx_region_files(custodian_dir: Path) -> List[Dict[str, Any]]:
    """Scan all custodian files and return results for those with XX regions.

    Only results whose 'has_xx_region' flag is set are kept.
    """
    # Lazily process every .yaml file in the directory, then keep the hits.
    processed = (process_file(candidate, dry_run=True)
                 for candidate in custodian_dir.glob('*.yaml'))
    return [outcome for outcome in processed if outcome['has_xx_region']]
|
|
|
|
|
|
def main():
    """CLI entry point: report custodian files whose region code is 'XX'."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131'
    )
    parser.add_argument('--scan', action='store_true',
                        help='Scan files and report XX region codes by country')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country (e.g., FR, KR)')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    print("=" * 70)
    print("XX REGION CODE RESOLUTION SCANNER")
    print("=" * 70)

    if not args.scan:
        # No --scan flag: just explain how to use the tool.
        print("\nUsage:")
        print("  --scan Scan and report XX region codes by country")
        print("  --country XX Filter to specific country code")
        print("\nExample:")
        print("  python resolve_xx_regions.py --scan")
        print("  python resolve_xx_regions.py --scan --country FR")
        return

    results = scan_xx_region_files(custodian_dir)

    # Bucket the hits per country code so the report can be grouped.
    # NOTE(review): country_code may be None for some files; sorted() below
    # would raise if None and str keys both appear — confirm with real data.
    by_country = {}
    for hit in results:
        by_country.setdefault(hit['country_code'], []).append(hit)

    print(f"\nTotal files with XX region code: {len(results)}")
    print("\nBy country:")
    # Largest buckets first; flag whether a curated mapping table exists.
    for code in sorted(by_country.keys(), key=lambda c: -len(by_country[c])):
        status = "HAS MAPPING" if code in REGION_MAPPINGS else "NO MAPPING"
        print(f"  {code}: {len(by_country[code]):4d} files [{status}]")

    # Break down by presence of a Wikidata ID (required for resolution).
    with_id = sum(1 for hit in results if hit['wikidata_id'])
    print(f"\nWith Wikidata ID: {with_id}")
    print(f"Without Wikidata ID: {len(results) - with_id}")

    # Optional per-country sample (first 10 files).
    if args.country:
        subset = by_country.get(args.country, [])
        print(f"\n{args.country} files ({len(subset)} total):")
        for hit in subset[:10]:
            wikidata = hit['wikidata_id'] or 'NO_WIKIDATA'
            print(f"  {Path(hit['filepath']).name}: {wikidata}")
|
|
|
|
|
|
# Script entry point: delegate to main() only when run directly.
if __name__ == '__main__':
    main()
|