#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 hierarchy.

This script handles files that lack coordinates by:
1. Querying Wikidata P131 (located in administrative entity) chain
2. Following the chain until finding an entity with P300 (ISO 3166-2 code)
3. Using hardcoded mappings for entities without P300

Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
- GHCID settlement standardization: Use proper settlements only

IMPORTANT: This script only resolves REGION codes (XX -> proper region).
For city/settlement resolution, use resolve_locations_geonames.py which
requires coordinates.
"""

import argparse
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml

# Wikidata Query Service endpoint and the headers sent with every request.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
SPARQL_HEADERS = {
    'Accept': 'application/sparql-results+json',
    'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
}

# A bare Wikidata item identifier ("Q" followed by a positive integer).
_QID_PATTERN = re.compile(r'^Q[1-9]\d*$')

# Direct mapping of Wikidata admin entities to ISO 3166-2 codes
# This is for entities that don't have P300 but we know the mapping
#
# NOTE(review): a Python dict literal silently keeps only the LAST value for a
# repeated key, so every key below must be unique. Entries marked "QID fixed"
# replace keys that were duplicated or truncated; the remaining IDs have not
# been audited against Wikidata and should be spot-checked.
WIKIDATA_TO_ISO = {
    # Australian states
    'Q3258': 'AU-NSW',  # New South Wales
    'Q36687': 'AU-VIC',  # Victoria
    'Q36074': 'AU-QLD',  # Queensland
    'Q3206': 'AU-WA',  # Western Australia
    'Q35715': 'AU-SA',  # South Australia
    'Q34366': 'AU-TAS',  # Tasmania
    'Q3235': 'AU-ACT',  # Australian Capital Territory
    'Q3373': 'AU-NT',  # Northern Territory

    # Swiss cantons
    'Q11943': 'CH-ZH',  # Zurich
    'Q11911': 'CH-BE',  # Bern
    'Q12146': 'CH-LU',  # Lucerne
    'Q12172': 'CH-UR',  # Uri
    'Q12174': 'CH-SZ',  # Schwyz
    'Q12193': 'CH-OW',  # Obwalden
    'Q12191': 'CH-NW',  # Nidwalden
    'Q12262': 'CH-GL',  # Glarus
    'Q12226': 'CH-ZG',  # Zug
    'Q12640': 'CH-FR',  # Fribourg
    'Q12433': 'CH-SO',  # Solothurn
    'Q12503': 'CH-BS',  # Basel-Stadt
    'Q12536': 'CH-BL',  # Basel-Landschaft
    'Q12079': 'CH-SH',  # Schaffhausen
    'Q12094': 'CH-AR',  # Appenzell Ausserrhoden
    'Q12106': 'CH-AI',  # Appenzell Innerrhoden
    'Q12121': 'CH-SG',  # St. Gallen
    'Q12697': 'CH-GR',  # Graubunden
    'Q12738': 'CH-AG',  # Aargau
    'Q12755': 'CH-TG',  # Thurgau
    'Q12713': 'CH-TI',  # Ticino
    'Q12771': 'CH-VD',  # Vaud
    'Q12800': 'CH-VS',  # Valais
    'Q12592': 'CH-NE',  # Neuchatel
    'Q12573': 'CH-GE',  # Geneva
    'Q12596': 'CH-JU',  # Jura

    # Argentine provinces
    'Q1486': 'AR-C',  # Buenos Aires (city)
    'Q44754': 'AR-C',  # Autonomous City of Buenos Aires
    'Q44757': 'AR-B',  # Buenos Aires Province
    'Q44758': 'AR-K',  # Catamarca
    'Q44759': 'AR-H',  # Chaco
    'Q44760': 'AR-U',  # Chubut
    'Q44761': 'AR-X',  # Cordoba
    'Q44762': 'AR-W',  # Corrientes
    'Q44763': 'AR-E',  # Entre Rios
    'Q44764': 'AR-P',  # Formosa
    'Q44765': 'AR-Y',  # Jujuy
    'Q44766': 'AR-L',  # La Pampa
    'Q44767': 'AR-F',  # La Rioja
    'Q44768': 'AR-M',  # Mendoza
    'Q44769': 'AR-N',  # Misiones
    'Q44770': 'AR-Q',  # Neuquen
    'Q44771': 'AR-R',  # Rio Negro
    'Q44772': 'AR-A',  # Salta
    'Q44773': 'AR-J',  # San Juan
    'Q44774': 'AR-D',  # San Luis
    'Q44775': 'AR-Z',  # Santa Cruz
    'Q44776': 'AR-S',  # Santa Fe
    'Q44777': 'AR-G',  # Santiago del Estero
    'Q44778': 'AR-V',  # Tierra del Fuego
    'Q44779': 'AR-T',  # Tucuman

    # Bangladesh divisions
    'Q240042': 'BD-A',  # Barisal
    'Q331265': 'BD-B',  # Chittagong
    'Q309068': 'BD-C',  # Dhaka
    'Q321140': 'BD-D',  # Khulna
    'Q326015': 'BD-H',  # Mymensingh
    'Q326004': 'BD-E',  # Rajshahi
    'Q326088': 'BD-F',  # Rangpur
    'Q331258': 'BD-G',  # Sylhet

    # Bolivian departments
    'Q334620': 'BO-C',  # Cochabamba
    'Q334632': 'BO-H',  # Chuquisaca
    'Q334649': 'BO-L',  # La Paz
    'Q334665': 'BO-O',  # Oruro
    'Q334678': 'BO-P',  # Potosi
    'Q334699': 'BO-S',  # Santa Cruz
    'Q334711': 'BO-T',  # Tarija
    'Q334724': 'BO-B',  # Beni
    'Q334735': 'BO-N',  # Pando

    # Singapore (city-state - no subdivisions)
    'Q334': 'SG-SG',  # Singapore

    # Sint Maarten
    'Q26273': 'SX-SX',  # Sint Maarten

    # UK countries/nations
    'Q21': 'GB-ENG',  # England
    'Q22': 'GB-SCT',  # Scotland
    'Q25': 'GB-WLS',  # Wales
    'Q26': 'GB-NIR',  # Northern Ireland

    # South Korean special cities and provinces
    'Q8684': 'KR-11',  # Seoul
    'Q16520': 'KR-26',  # Busan
    'Q41848': 'KR-27',  # Daegu
    'Q40674': 'KR-28',  # Incheon
    'Q41295': 'KR-29',  # Gwangju
    'Q42622': 'KR-30',  # Daejeon
    'Q42420': 'KR-31',  # Ulsan
    'Q20960': 'KR-41',  # Gyeonggi
    'Q41079': 'KR-42',  # Gangwon
    'Q41392': 'KR-43',  # North Chungcheong
    'Q41394': 'KR-44',  # South Chungcheong
    'Q41585': 'KR-45',  # North Jeolla
    'Q41587': 'KR-46',  # South Jeolla
    'Q41171': 'KR-47',  # North Gyeongsang
    'Q41158': 'KR-48',  # South Gyeongsang
    'Q28227': 'KR-49',  # Jeju
    'Q483134': 'KR-50',  # Sejong

    # Estonia counties
    'Q189539': 'EE-37',  # Harju County (Tallinn)
    'Q192611': 'EE-39',  # Hiiu County
    'Q180297': 'EE-44',  # Ida-Viru County
    'Q188808': 'EE-49',  # Jõgeva County
    'Q190093': 'EE-51',  # Järva County
    'Q190086': 'EE-57',  # Lääne County
    'Q190085': 'EE-59',  # Lääne-Viru County
    'Q189537': 'EE-65',  # Põlva County
    'Q189544': 'EE-67',  # Pärnu County
    'Q189542': 'EE-70',  # Rapla County
    'Q189553': 'EE-74',  # Saare County
    'Q189530': 'EE-78',  # Tartu County
    'Q189554': 'EE-82',  # Valga County
    'Q189556': 'EE-84',  # Viljandi County
    'Q189538': 'EE-86',  # Võru County

    # Thai regions/provinces
    'Q464862': 'TH-10',  # Bangkok (Krung Thep Maha Nakhon)

    # Indian states
    'Q1159': 'IN-AP',  # Andhra Pradesh
    'Q1508': 'IN-AR',  # Arunachal Pradesh
    'Q1164': 'IN-AS',  # Assam
    'Q1165': 'IN-BR',  # Bihar
    'Q1168': 'IN-CT',  # Chhattisgarh
    'Q1171': 'IN-GA',  # Goa
    'Q1061': 'IN-GJ',  # Gujarat
    'Q1174': 'IN-HR',  # Haryana
    'Q1177': 'IN-HP',  # Himachal Pradesh
    'Q1180': 'IN-JH',  # Jharkhand
    'Q1185': 'IN-KA',  # Karnataka
    'Q1186': 'IN-KL',  # Kerala
    'Q1188': 'IN-MP',  # Madhya Pradesh (QID fixed: was a duplicate of Q1191)
    'Q1191': 'IN-MH',  # Maharashtra
    'Q1193': 'IN-MN',  # Manipur
    'Q1195': 'IN-ML',  # Meghalaya
    'Q1502': 'IN-MZ',  # Mizoram
    'Q1497': 'IN-NL',  # Nagaland
    'Q22048': 'IN-OR',  # Odisha
    'Q22424': 'IN-PB',  # Punjab
    'Q1437': 'IN-RJ',  # Rajasthan
    'Q1505': 'IN-SK',  # Sikkim
    'Q1445': 'IN-TN',  # Tamil Nadu
    'Q677037': 'IN-TG',  # Telangana
    'Q1344': 'IN-TR',  # Tripura
    'Q1498': 'IN-UP',  # Uttar Pradesh
    'Q1499': 'IN-UT',  # Uttarakhand
    'Q1356': 'IN-WB',  # West Bengal

    # Mexican states
    'Q30965': 'MX-AGU',  # Aguascalientes
    'Q30967': 'MX-BCN',  # Baja California
    'Q46508': 'MX-BCS',  # Baja California Sur
    'Q58731': 'MX-CAM',  # Campeche
    'Q61076': 'MX-COA',  # Coahuila
    'Q61077': 'MX-COL',  # Colima
    'Q61079': 'MX-CHP',  # Chiapas
    'Q61080': 'MX-CHH',  # Chihuahua
    'Q1489': 'MX-CMX',  # Mexico City (CDMX)
    'Q61083': 'MX-DUR',  # Durango
    'Q61084': 'MX-GUA',  # Guanajuato
    'Q61085': 'MX-GRO',  # Guerrero
    'Q61086': 'MX-HID',  # Hidalgo
    'Q61087': 'MX-JAL',  # Jalisco
    'Q61088': 'MX-MEX',  # State of Mexico
    'Q61089': 'MX-MIC',  # Michoacan
    'Q61090': 'MX-MOR',  # Morelos
    'Q61091': 'MX-NAY',  # Nayarit
    'Q61092': 'MX-NLE',  # Nuevo Leon
    'Q61093': 'MX-OAX',  # Oaxaca
    'Q61094': 'MX-PUE',  # Puebla
    'Q61095': 'MX-QUE',  # Queretaro
    'Q61096': 'MX-ROO',  # Quintana Roo
    'Q61097': 'MX-SLP',  # San Luis Potosi
    'Q61098': 'MX-SIN',  # Sinaloa
    'Q61099': 'MX-SON',  # Sonora
    'Q61100': 'MX-TAB',  # Tabasco
    'Q61101': 'MX-TAM',  # Tamaulipas
    'Q61102': 'MX-TLA',  # Tlaxcala
    'Q61103': 'MX-VER',  # Veracruz
    'Q61104': 'MX-YUC',  # Yucatan
    'Q61105': 'MX-ZAC',  # Zacatecas

    # Egyptian governorates
    'Q85': 'EG-C',  # Cairo
    'Q87': 'EG-ALX',  # Alexandria
    'Q204060': 'EG-GZ',  # Giza

    # Dominican Republic provinces
    'Q18393': 'DO-01',  # Distrito Nacional (Santo Domingo)

    # Jamaica parishes
    'Q3534362': 'JM-01',  # Kingston
    # Jamaican capital
    'Q34692': 'JM-01',  # Kingston city

    # Ukrainian oblasts
    'Q1899': 'UA-30',  # Kyiv
    'Q7525': 'UA-05',  # Vinnytsia Oblast
    'Q7526': 'UA-07',  # Volyn Oblast
    'Q7528': 'UA-12',  # Dnipropetrovsk Oblast
    'Q7530': 'UA-14',  # Donetsk Oblast
    'Q7531': 'UA-18',  # Zhytomyr Oblast
    'Q7532': 'UA-21',  # Zakarpattia Oblast
    'Q7533': 'UA-23',  # Zaporizhzhia Oblast
    'Q7534': 'UA-26',  # Ivano-Frankivsk Oblast
    'Q7535': 'UA-32',  # Kyiv Oblast
    'Q7536': 'UA-35',  # Kirovohrad Oblast
    'Q7537': 'UA-09',  # Luhansk Oblast
    'Q7538': 'UA-46',  # Lviv Oblast
    'Q7539': 'UA-48',  # Mykolaiv Oblast
    'Q7540': 'UA-51',  # Odesa Oblast
    'Q7541': 'UA-53',  # Poltava Oblast
    'Q7542': 'UA-56',  # Rivne Oblast
    'Q7543': 'UA-59',  # Sumy Oblast
    'Q7544': 'UA-61',  # Ternopil Oblast
    'Q7545': 'UA-63',  # Kharkiv Oblast
    'Q7546': 'UA-65',  # Kherson Oblast
    'Q7547': 'UA-68',  # Khmelnytskyi Oblast
    'Q7548': 'UA-71',  # Cherkasy Oblast
    'Q7549': 'UA-74',  # Chernivtsi Oblast
    'Q7550': 'UA-77',  # Chernihiv Oblast

    # Iranian provinces
    'Q160766': 'IR-30',  # Razavi Khorasan (Mashhad)
    'Q170416': 'IR-23',  # Tehran

    # Mozambique provinces
    'Q182329': 'MZ-MPM',  # Maputo Province
    'Q182323': 'MZ-L',  # Maputo City

    # Czech regions (kraje)
    'Q1085': 'CZ-10',  # Prague (capital city)
    'Q193702': 'CZ-10',  # Prague (region)
    'Q18473': 'CZ-20',  # Central Bohemian Region (Středočeský kraj)
    'Q18475': 'CZ-31',  # South Bohemian Region (Jihočeský kraj)
    'Q18471': 'CZ-32',  # Plzeň Region (Plzeňský kraj)
    'Q18461': 'CZ-41',  # Karlovy Vary Region (Karlovarský kraj)
    'Q18476': 'CZ-42',  # Ústí nad Labem Region (Ústecký kraj)
    'Q18465': 'CZ-51',  # Liberec Region (Liberecký kraj)
    'Q18463': 'CZ-52',  # Hradec Králové Region (Královéhradecký kraj)
    'Q18468': 'CZ-53',  # Pardubice Region (Pardubický kraj)
    'Q18478': 'CZ-63',  # Vysočina Region
    'Q18460': 'CZ-64',  # South Moravian Region (Jihomoravský kraj)
    'Q18467': 'CZ-71',  # Olomouc Region (Olomoucký kraj)
    'Q18479': 'CZ-72',  # Zlín Region (Zlínský kraj)
    'Q18466': 'CZ-80',  # Moravian-Silesian Region (Moravskoslezský kraj)

    # Czech major cities (to their regions)
    'Q14960': 'CZ-64',  # Brno -> South Moravian
    'Q81137': 'CZ-80',  # Ostrava -> Moravian-Silesian
    'Q157311': 'CZ-32',  # Plzeň -> Plzeň Region
    'Q81938': 'CZ-51',  # Liberec -> Liberec Region
    'Q81979': 'CZ-71',  # Olomouc -> Olomouc Region
    'Q80284': 'CZ-31',  # České Budějovice -> South Bohemian
    'Q82057': 'CZ-52',  # Hradec Králové -> Hradec Králové Region
    'Q82197': 'CZ-42',  # Ústí nad Labem -> Ústí nad Labem Region
    'Q82463': 'CZ-53',  # Pardubice -> Pardubice Region

    # Belgian regions and provinces
    # QIDs fixed: Q31 is Belgium itself (the country), and because P17 values
    # are followed it would have resolved every Belgian record to Flanders.
    'Q234': 'BE-VLG',  # Flanders
    'Q231': 'BE-WAL',  # Wallonia
    'Q240': 'BE-BRU',  # Brussels-Capital Region
    # Flemish provinces
    'Q1112': 'BE-VAN',  # Antwerp
    'Q1114': 'BE-VLI',  # Limburg (Belgium)
    'Q1116': 'BE-VBR',  # Flemish Brabant
    'Q1117': 'BE-VOV',  # East Flanders
    'Q1118': 'BE-VWV',  # West Flanders
    # Walloon provinces
    'Q1127': 'BE-WBR',  # Walloon Brabant
    'Q1128': 'BE-WHT',  # Hainaut
    'Q1130': 'BE-WLG',  # Liège
    'Q1131': 'BE-WLX',  # Luxembourg (Belgium)
    'Q1132': 'BE-WNA',  # Namur
    # Belgian major cities (to their provinces)
    'Q12988': 'BE-BRU',  # Brussels -> Brussels-Capital
    'Q12892': 'BE-VAN',  # Antwerp city -> Antwerp province
    'Q12996': 'BE-VOV',  # Ghent -> East Flanders
    'Q12994': 'BE-VWV',  # Bruges -> West Flanders
    'Q118958': 'BE-WLG',  # Liège city -> Liège province
    'Q162022': 'BE-WHT',  # Charleroi -> Hainaut
    'Q162163': 'BE-VLI',  # Hasselt -> Limburg
    'Q12990': 'BE-VBR',  # Leuven -> Flemish Brabant
    'Q162176': 'BE-WNA',  # Namur city -> Namur province

    # Bulgarian oblasts (provinces)
    'Q7921': 'BG-22',  # Sofia City
    'Q188812': 'BG-23',  # Sofia Province
    'Q215072': 'BG-01',  # Blagoevgrad
    'Q215129': 'BG-02',  # Burgas
    'Q215165': 'BG-08',  # Dobrich
    'Q215196': 'BG-07',  # Gabrovo
    'Q215235': 'BG-26',  # Haskovo
    'Q215270': 'BG-09',  # Kardzhali
    'Q215303': 'BG-10',  # Kyustendil
    'Q215340': 'BG-11',  # Lovech
    'Q215378': 'BG-12',  # Montana
    'Q215407': 'BG-13',  # Pazardzhik
    'Q215446': 'BG-14',  # Pernik
    'Q215475': 'BG-15',  # Pleven
    'Q215504': 'BG-16',  # Plovdiv
    'Q215538': 'BG-17',  # Razgrad
    'Q215565': 'BG-18',  # Ruse
    'Q215605': 'BG-27',  # Shumen
    'Q215636': 'BG-19',  # Silistra
    'Q215666': 'BG-20',  # Sliven
    'Q215696': 'BG-21',  # Smolyan
    'Q215727': 'BG-24',  # Stara Zagora
    'Q215758': 'BG-25',  # Targovishte
    'Q215787': 'BG-03',  # Varna
    'Q215820': 'BG-04',  # Veliko Tarnovo
    'Q215856': 'BG-05',  # Vidin
    'Q215882': 'BG-06',  # Vratsa
    'Q215917': 'BG-28',  # Yambol
    # Bulgarian major cities
    'Q472': 'BG-22',  # Sofia city -> Sofia City
    'Q35825': 'BG-16',  # Plovdiv city -> Plovdiv
    'Q36367': 'BG-03',  # Varna city -> Varna
    'Q37106': 'BG-02',  # Burgas city -> Burgas
    'Q37252': 'BG-18',  # Ruse city -> Ruse

    # Philippine regions/NCR
    'Q13580': 'PH-00',  # Metro Manila (NCR)
    'Q13586': 'PH-05',  # Bicol Region

    # Oman governorates
    'Q193076': 'OM-MA',  # Muscat

    # Uzbekistan regions
    'Q269': 'UZ-TK',  # Tashkent

    # Denmark regions
    'Q26073': 'DK-84',  # Capital Region of Denmark

    # Netherlands provinces (for completeness)
    'Q694': 'NL-NH',  # North Holland
    'Q695': 'NL-ZH',  # South Holland
    'Q696': 'NL-UT',  # Utrecht
    'Q772': 'NL-GE',  # Gelderland
    'Q775': 'NL-LI',  # Limburg
    'Q776': 'NL-NB',  # North Brabant
    'Q777': 'NL-OV',  # Overijssel
    'Q778': 'NL-FR',  # Friesland
    'Q779': 'NL-GR',  # Groningen
    'Q780': 'NL-DR',  # Drenthe
    'Q781': 'NL-FL',  # Flevoland
    'Q782': 'NL-ZE',  # Zeeland

    # French regions (new 2016 regions)
    # QIDs fixed: the keys here were truncated ('Q18677', 'Q18578') and
    # repeated, so only the last entry per key survived.
    'Q13917': 'FR-IDF',  # Île-de-France
    'Q13947': 'FR-CVL',  # Centre-Val de Loire
    'Q18578267': 'FR-BFC',  # Bourgogne-Franche-Comté
    'Q18677875': 'FR-NOR',  # Normandy
    'Q18677767': 'FR-HDF',  # Hauts-de-France
    'Q18677983': 'FR-GES',  # Grand Est
    'Q16994': 'FR-PDL',  # Pays de la Loire
    'Q12130': 'FR-BRE',  # Brittany
    'Q18678082': 'FR-NAQ',  # Nouvelle-Aquitaine
    'Q18678265': 'FR-OCC',  # Occitanie
    'Q18338206': 'FR-ARA',  # Auvergne-Rhône-Alpes
    'Q15104': 'FR-PAC',  # Provence-Alpes-Côte d'Azur
    'Q14112': 'FR-COR',  # Corsica
    'Q90': 'FR-IDF',  # Paris -> Île-de-France
}


def _normalize_qid(value: Any) -> Optional[str]:
    """Return a bare QID ("Q123") for *value*, or None if it is not one.

    Accepts either a bare identifier or an entity URI ending in the QID.
    Validating here keeps arbitrary file content out of the SPARQL text.
    """
    if not isinstance(value, str):
        return None
    candidate = value.strip().rsplit('/', 1)[-1]
    return candidate if _QID_PATTERN.match(candidate) else None


def _run_sparql(query: str) -> List[Dict[str, Any]]:
    """POST *query* to the Wikidata endpoint and return its result bindings.

    Network, HTTP and JSON errors propagate to the caller.
    """
    payload = urllib.parse.urlencode({'query': query}).encode('utf-8')
    request = urllib.request.Request(
        SPARQL_ENDPOINT, data=payload, headers=SPARQL_HEADERS
    )
    with urllib.request.urlopen(request, timeout=60) as response:
        result = json.loads(response.read().decode('utf-8'))
    return result.get('results', {}).get('bindings', [])


def query_p131_chain_for_entity(qid: str) -> Optional[str]:
    """
    Query the P131 chain for a specific Wikidata entity to find its
    ISO 3166-2 region code.

    The hardcoded WIKIDATA_TO_ISO table is consulted first (no network).
    Otherwise the entity's transitive P131 ancestors are fetched and the
    first row carrying a P300 code, or present in the table, wins.

    Returns the ISO code if found, None otherwise (including when *qid* is
    not a valid Wikidata ID or the query fails).
    """
    entity = _normalize_qid(qid)
    if entity is None:
        return None

    # First check if this entity itself is in our mapping
    if entity in WIKIDATA_TO_ISO:
        return WIKIDATA_TO_ISO[entity]

    # Query P131 chain with P300 codes. There is no ORDER BY, so rows arrive
    # in whatever order the endpoint returns them.
    query = f"""
    SELECT ?admin ?adminLabel ?iso_code WHERE {{
      wd:{entity} wdt:P131* ?admin.
      OPTIONAL {{ ?admin wdt:P300 ?iso_code. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 30
    """

    try:
        bindings = _run_sparql(query)
    except Exception as e:  # best-effort lookup: report and move on
        print(f" P131 chain query error for {qid}: {e}")
        return None

    # Look for ISO code in results
    for row in bindings:
        # Check P300 ISO code
        iso_code = row.get('iso_code', {}).get('value', '')
        if iso_code and '-' in iso_code:
            return iso_code

        # Check our hardcoded mapping
        admin_uri = row.get('admin', {}).get('value', '')
        if admin_uri:
            admin_qid = admin_uri.split('/')[-1]
            if admin_qid in WIKIDATA_TO_ISO:
                return WIKIDATA_TO_ISO[admin_qid]

    return None


def get_location_entities(qid: str) -> List[str]:
    """
    Get location-related entities for a Wikidata entity.

    Checks P131 (located in), P159 (headquarters), P276 (location),
    P17 (country).

    Returns list of QIDs to check for P131 chain, de-duplicated in the order
    returned by the endpoint. Returns an empty list when *qid* is not a valid
    Wikidata ID or the query fails.
    """
    entity = _normalize_qid(qid)
    if entity is None:
        return []

    query = f"""
    SELECT ?prop ?value WHERE {{
      VALUES ?prop {{ wdt:P131 wdt:P159 wdt:P276 wdt:P17 }}
      wd:{entity} ?prop ?value.
    }}
    """

    try:
        bindings = _run_sparql(query)
    except Exception as e:  # best-effort lookup: report and move on
        print(f" Location entities query error for {qid}: {e}")
        return []

    entities: List[str] = []
    for row in bindings:
        value_qid = _normalize_qid(row.get('value', {}).get('value', ''))
        if value_qid and value_qid not in entities:
            entities.append(value_qid)

    return entities


def query_p131_chain(qid: str) -> Optional[str]:
    """
    Query the P131 chain for a Wikidata entity to find its ISO 3166-2
    region code.

    Tries multiple strategies:
    1. Entity's own P131 chain (including the hardcoded mapping)
    2. The P131 chain of each entity returned by get_location_entities()

    Returns the ISO code if found, None otherwise.
    """
    entity = _normalize_qid(qid)
    if entity is None:
        return None

    # Try entity's own mapping / P131 chain
    iso_code = query_p131_chain_for_entity(entity)
    if iso_code:
        return iso_code

    # Try each location entity's P131 chain
    for loc_qid in get_location_entities(entity):
        iso_code = query_p131_chain_for_entity(loc_qid)
        if iso_code:
            return iso_code

    return None


def update_file_with_region(filepath: Path, iso_code: str, admin_label: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with resolved region code.

    Args:
        filepath: YAML custodian file whose region code is still 'XX'.
        iso_code: ISO 3166-2 code such as "AR-B"; a value without a hyphen
            is used as the region code as-is.
        admin_label: Human-readable label recorded alongside the code.
        dry_run: When True nothing is written or renamed.

    Returns:
        (success, new_path). new_path is the renamed (or, in a dry run,
        to-be-renamed) file, or None when the filename does not change.
        (False, None) is returned when the file cannot be read, is not a
        mapping with a 'ghcid' section and country code, is already resolved,
        the ISO country disagrees with the file, or the rename target exists.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # An empty YAML document loads as None; anything but a mapping is unusable.
    if not isinstance(data, dict):
        return False, None
    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        return False, None
    loc_res = ghcid.get('location_resolution')
    if not isinstance(loc_res, dict):
        return False, None

    country_code = loc_res.get('country_code', '')
    if not country_code:
        return False, None

    old_region = loc_res.get('region_code', 'XX')
    if old_region != 'XX':
        return False, None

    # Extract region part from ISO code (e.g., "AR-B" -> "B", "CH-GE" -> "GE")
    iso_country, separator, region_code = iso_code.partition('-')
    if separator:
        # Verify country matches
        if iso_country != country_code:
            print(f" Warning: ISO country {iso_country} != file country {country_code}")
            return False, None
    else:
        region_code = iso_code
    if not region_code:
        return False, None

    # Determine new filename
    xx_marker = f'{country_code}-XX-'
    resolved_marker = f'{country_code}-{region_code}-'
    new_filename = filepath.name.replace(xx_marker, resolved_marker)
    new_filepath = filepath.parent / new_filename
    will_rename = new_filepath != filepath

    # Refuse before touching anything: never overwrite another record, and
    # never leave resolved content behind under an -XX- filename.
    if will_rename and new_filepath.exists():
        print(f" Warning: target {new_filepath.name} already exists; skipping")
        return False, None

    # One timestamp so every field written by this update agrees.
    now = datetime.now(timezone.utc)

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = admin_label
    loc_res['method'] = 'WIKIDATA_P131'
    loc_res['iso_code_source'] = iso_code
    loc_res['resolution_timestamp'] = now.isoformat()

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(xx_marker, resolved_marker)

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now.isoformat(),
            'reason': f"Region resolved via Wikidata P131: XX->{region_code} ({admin_label})"
        })

    # Add provenance note
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Region resolved {now.strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via Wikidata P131 ({admin_label})"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                      sort_keys=False)

        if will_rename:
            filepath.rename(new_filepath)

    return True, new_filepath if will_rename else None


def _extract_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Return the first Wikidata ID recorded in a custodian record, if any."""
    wikidata_id = None
    enrichment = data.get('wikidata_enrichment')
    if isinstance(enrichment, dict):
        wikidata_id = enrichment.get('wikidata_entity_id')

    original = data.get('original_entry')
    if not wikidata_id and isinstance(original, dict):
        wikidata_id = original.get('wikidata_id')

        # Also check identifiers list
        if not wikidata_id:
            for ident in original.get('identifiers') or []:
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Wikidata':
                    wikidata_id = ident.get('identifier_value')
                    break

    return wikidata_id


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131 hierarchy'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("REGION RESOLUTION VIA WIKIDATA P131 HIERARCHY")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()
    print("NOTE: This script only resolves region codes (XX).")
    print(" For city/settlement resolution, use resolve_locations_geonames.py")
    print()

    # Find files with XX region codes. Sorted so --limit selects the same
    # files on every run (glob order is filesystem-dependent).
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))

    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract Wikidata IDs
    file_data = []
    for filepath in files_to_process[:args.limit]:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Error loading {filepath}: not a YAML mapping")
            continue

        # Get country code
        ghcid = data.get('ghcid')
        loc_res = ghcid.get('location_resolution') if isinstance(ghcid, dict) else None
        country = loc_res.get('country_code') if isinstance(loc_res, dict) else None

        if not country:
            continue

        if args.country and country != args.country:
            continue

        # Get Wikidata ID from various locations
        raw_id = _extract_wikidata_id(data)
        if not raw_id:
            continue

        wikidata_id = _normalize_qid(raw_id)
        if wikidata_id is None:
            print(f"Skipping {filepath.name}: invalid Wikidata ID {raw_id!r}")
            continue

        file_data.append({
            'filepath': filepath,
            'data': data,
            'country': country,
            'wikidata_id': wikidata_id
        })

    print(f"Processing {len(file_data)} files with Wikidata IDs")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    failed = 0

    for entry in file_data:
        filepath = entry['filepath']
        qid = entry['wikidata_id']

        print(f"Processing {filepath.name} ({qid})...")

        # Query P131 chain for ISO code
        iso_code = query_p131_chain(qid)

        if not iso_code:
            print(" No ISO code found")
            failed += 1
            time.sleep(0.5)  # Rate limiting
            continue

        # Use ISO code as label since we don't have a name
        admin_label = iso_code

        # Update file
        success, new_path = update_file_with_region(
            filepath, iso_code, admin_label, dry_run=dry_run
        )

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name} ({iso_code})")
            else:
                print(f" Updated: {filepath.name} ({iso_code})")
        else:
            failed += 1
            print(" Failed to update")

        time.sleep(0.5)  # Rate limiting

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"Failed: {failed}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()