glam/scripts/resolve_locations_p131.py
kempersc 63a6bccd9b fix: remove custodian files with invalid GHCID special characters
Remove 229 custodian YAML files containing invalid characters in GHCIDs:
- Ampersand (&) in abbreviations (e.g., BM&HS, UNL&AG, DR&IMSM)
- Parentheses in abbreviations (e.g., WHO(RA, VK(, SL()
- Unicode characters in filenames (Ö, Ä, Å, É, İ, Ż, etc.)

These files are replaced with corrected versions using alphabetic-only
abbreviations per AGENTS.md Rule 8 (Special Characters MUST Be Excluded).

Related scripts updated for location resolution.
2025-12-07 14:23:50 +01:00

780 lines
27 KiB
Python

#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 hierarchy.
This script handles files that lack coordinates by:
1. Querying Wikidata P131 (located in administrative entity) chain
2. Following the chain until finding an entity with P300 (ISO 3166-2 code)
3. Using hardcoded mappings for entities without P300
Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
- GHCID settlement standardization: Use proper settlements only
IMPORTANT: This script only resolves REGION codes (XX -> proper region).
For city/settlement resolution, use resolve_locations_geonames.py which requires coordinates.
"""
import os
import sys
import yaml
import json
import re
import urllib.request
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
# Direct mapping of Wikidata admin entities to ISO 3166-2 codes.
# Used for entities that have no P300 statement but whose code is known.
#
# Keys are Wikidata item IDs and values are full ISO 3166-2 codes
# ("<country>-<subdivision>"). Keys must be unique: in a dict literal a
# repeated key silently overrides the earlier entry, which previously dropped
# IN-MP and most of the French regions from this table.
#
# NOTE(review): the QIDs in this table have not been verified end-to-end
# against Wikidata; entries carrying a NOTE(review) below need confirmation.
WIKIDATA_TO_ISO = {
    # Australian states
    'Q3258': 'AU-NSW',  # New South Wales
    'Q36687': 'AU-VIC',  # Victoria
    'Q36074': 'AU-QLD',  # Queensland
    'Q35850': 'AU-WA',  # Western Australia
    'Q35715': 'AU-SA',  # South Australia
    'Q34366': 'AU-TAS',  # Tasmania
    'Q3235': 'AU-ACT',  # Australian Capital Territory
    'Q3373': 'AU-NT',  # Northern Territory
    # Swiss cantons
    'Q11943': 'CH-ZH',  # Zurich
    'Q11911': 'CH-BE',  # Bern
    'Q12146': 'CH-LU',  # Lucerne
    'Q12172': 'CH-UR',  # Uri
    'Q12174': 'CH-SZ',  # Schwyz
    'Q12193': 'CH-OW',  # Obwalden
    'Q12191': 'CH-NW',  # Nidwalden
    'Q12262': 'CH-GL',  # Glarus
    'Q12226': 'CH-ZG',  # Zug
    'Q12640': 'CH-FR',  # Fribourg
    'Q12433': 'CH-SO',  # Solothurn
    'Q12503': 'CH-BS',  # Basel-Stadt
    'Q12536': 'CH-BL',  # Basel-Landschaft
    'Q12079': 'CH-SH',  # Schaffhausen
    'Q12094': 'CH-AR',  # Appenzell Ausserrhoden
    'Q12106': 'CH-AI',  # Appenzell Innerrhoden
    'Q12121': 'CH-SG',  # St. Gallen
    'Q12697': 'CH-GR',  # Graubunden
    'Q12738': 'CH-AG',  # Aargau
    'Q12755': 'CH-TG',  # Thurgau
    'Q12713': 'CH-TI',  # Ticino
    'Q12771': 'CH-VD',  # Vaud
    'Q12800': 'CH-VS',  # Valais
    'Q12592': 'CH-NE',  # Neuchatel
    'Q12573': 'CH-GE',  # Geneva
    'Q12596': 'CH-JU',  # Jura
    # Argentine provinces
    'Q1486': 'AR-C',  # Buenos Aires (city)
    'Q44754': 'AR-C',  # Autonomous City of Buenos Aires
    'Q44757': 'AR-B',  # Buenos Aires Province
    'Q44758': 'AR-K',  # Catamarca
    'Q44759': 'AR-H',  # Chaco
    'Q44760': 'AR-U',  # Chubut
    'Q44761': 'AR-X',  # Cordoba
    'Q44762': 'AR-W',  # Corrientes
    'Q44763': 'AR-E',  # Entre Rios
    'Q44764': 'AR-P',  # Formosa
    'Q44765': 'AR-Y',  # Jujuy
    'Q44766': 'AR-L',  # La Pampa
    'Q44767': 'AR-F',  # La Rioja
    'Q44768': 'AR-M',  # Mendoza
    'Q44769': 'AR-N',  # Misiones
    'Q44770': 'AR-Q',  # Neuquen
    'Q44771': 'AR-R',  # Rio Negro
    'Q44772': 'AR-A',  # Salta
    'Q44773': 'AR-J',  # San Juan
    'Q44774': 'AR-D',  # San Luis
    'Q44775': 'AR-Z',  # Santa Cruz
    'Q44776': 'AR-S',  # Santa Fe
    'Q44777': 'AR-G',  # Santiago del Estero
    'Q44778': 'AR-V',  # Tierra del Fuego
    'Q44779': 'AR-T',  # Tucuman
    # Bangladesh divisions
    'Q240042': 'BD-A',  # Barisal
    'Q331265': 'BD-B',  # Chittagong
    'Q309068': 'BD-C',  # Dhaka
    'Q321140': 'BD-D',  # Khulna
    'Q326015': 'BD-H',  # Mymensingh
    'Q326004': 'BD-E',  # Rajshahi
    'Q326088': 'BD-F',  # Rangpur
    'Q331258': 'BD-G',  # Sylhet
    # Bolivian departments
    'Q334620': 'BO-C',  # Cochabamba
    'Q334632': 'BO-H',  # Chuquisaca
    'Q334649': 'BO-L',  # La Paz
    'Q334665': 'BO-O',  # Oruro
    'Q334678': 'BO-P',  # Potosi
    'Q334699': 'BO-S',  # Santa Cruz
    'Q334711': 'BO-T',  # Tarija
    'Q334724': 'BO-B',  # Beni
    'Q334735': 'BO-N',  # Pando
    # Singapore (city-state - no subdivisions)
    'Q334': 'SG-SG',  # Singapore
    # Sint Maarten
    'Q26273': 'SX-SX',  # Sint Maarten
    # UK countries/nations
    'Q21': 'GB-ENG',  # England
    'Q22': 'GB-SCT',  # Scotland
    'Q25': 'GB-WLS',  # Wales
    'Q26': 'GB-NIR',  # Northern Ireland
    # South Korean special cities and provinces
    'Q8684': 'KR-11',  # Seoul
    'Q16520': 'KR-26',  # Busan
    'Q41848': 'KR-27',  # Daegu
    'Q40674': 'KR-28',  # Incheon
    'Q41295': 'KR-29',  # Gwangju
    'Q42622': 'KR-30',  # Daejeon
    'Q42420': 'KR-31',  # Ulsan
    'Q20960': 'KR-41',  # Gyeonggi
    'Q41079': 'KR-42',  # Gangwon
    'Q41392': 'KR-43',  # North Chungcheong
    'Q41394': 'KR-44',  # South Chungcheong
    'Q41585': 'KR-45',  # North Jeolla
    'Q41587': 'KR-46',  # South Jeolla
    'Q41171': 'KR-47',  # North Gyeongsang
    'Q41158': 'KR-48',  # South Gyeongsang
    'Q28227': 'KR-49',  # Jeju
    'Q483134': 'KR-50',  # Sejong
    # Estonia counties
    'Q189539': 'EE-37',  # Harju County (Tallinn)
    'Q192611': 'EE-39',  # Hiiu County
    'Q180297': 'EE-44',  # Ida-Viru County
    'Q188808': 'EE-49',  # Jõgeva County
    'Q190093': 'EE-51',  # Järva County
    'Q190086': 'EE-57',  # Lääne County
    'Q190085': 'EE-59',  # Lääne-Viru County
    'Q189537': 'EE-65',  # Põlva County
    'Q189544': 'EE-67',  # Pärnu County
    'Q189542': 'EE-70',  # Rapla County
    'Q189553': 'EE-74',  # Saare County
    'Q189530': 'EE-78',  # Tartu County
    'Q189554': 'EE-82',  # Valga County
    'Q189556': 'EE-84',  # Viljandi County
    'Q189538': 'EE-86',  # Võru County
    # Thai regions/provinces
    'Q464862': 'TH-10',  # Bangkok (Krung Thep Maha Nakhon)
    # Indian states
    'Q1159': 'IN-AP',  # Andhra Pradesh
    'Q1508': 'IN-AR',  # Arunachal Pradesh
    'Q1164': 'IN-AS',  # Assam
    'Q1165': 'IN-BR',  # Bihar
    'Q1168': 'IN-CT',  # Chhattisgarh
    'Q1171': 'IN-GA',  # Goa
    'Q1061': 'IN-GJ',  # Gujarat
    'Q1174': 'IN-HR',  # Haryana
    'Q1177': 'IN-HP',  # Himachal Pradesh
    'Q1180': 'IN-JH',  # Jharkhand
    'Q1185': 'IN-KA',  # Karnataka
    'Q1186': 'IN-KL',  # Kerala
    # NOTE(review): this entry previously reused key 'Q1191' and was silently
    # overridden by Maharashtra below; Q1188 is believed to be the item for
    # Madhya Pradesh - confirm.
    'Q1188': 'IN-MP',  # Madhya Pradesh
    'Q1191': 'IN-MH',  # Maharashtra
    'Q1193': 'IN-MN',  # Manipur
    'Q1195': 'IN-ML',  # Meghalaya
    'Q1502': 'IN-MZ',  # Mizoram
    'Q1497': 'IN-NL',  # Nagaland
    'Q22048': 'IN-OR',  # Odisha
    'Q22424': 'IN-PB',  # Punjab
    'Q1437': 'IN-RJ',  # Rajasthan
    'Q1505': 'IN-SK',  # Sikkim
    'Q1445': 'IN-TN',  # Tamil Nadu
    'Q677037': 'IN-TG',  # Telangana
    'Q1344': 'IN-TR',  # Tripura
    'Q1498': 'IN-UP',  # Uttar Pradesh
    'Q1499': 'IN-UT',  # Uttarakhand
    'Q1356': 'IN-WB',  # West Bengal
    # Mexican states
    'Q30965': 'MX-AGU',  # Aguascalientes
    'Q30967': 'MX-BCN',  # Baja California
    'Q46508': 'MX-BCS',  # Baja California Sur
    'Q58731': 'MX-CAM',  # Campeche
    'Q61076': 'MX-COA',  # Coahuila
    'Q61077': 'MX-COL',  # Colima
    'Q61079': 'MX-CHP',  # Chiapas
    'Q61080': 'MX-CHH',  # Chihuahua
    'Q1489': 'MX-CMX',  # Mexico City (CDMX)
    'Q61083': 'MX-DUR',  # Durango
    'Q61084': 'MX-GUA',  # Guanajuato
    'Q61085': 'MX-GRO',  # Guerrero
    'Q61086': 'MX-HID',  # Hidalgo
    'Q61087': 'MX-JAL',  # Jalisco
    'Q61088': 'MX-MEX',  # State of Mexico
    'Q61089': 'MX-MIC',  # Michoacan
    'Q61090': 'MX-MOR',  # Morelos
    'Q61091': 'MX-NAY',  # Nayarit
    'Q61092': 'MX-NLE',  # Nuevo Leon
    'Q61093': 'MX-OAX',  # Oaxaca
    'Q61094': 'MX-PUE',  # Puebla
    'Q61095': 'MX-QUE',  # Queretaro
    'Q61096': 'MX-ROO',  # Quintana Roo
    'Q61097': 'MX-SLP',  # San Luis Potosi
    'Q61098': 'MX-SIN',  # Sinaloa
    'Q61099': 'MX-SON',  # Sonora
    'Q61100': 'MX-TAB',  # Tabasco
    'Q61101': 'MX-TAM',  # Tamaulipas
    'Q61102': 'MX-TLA',  # Tlaxcala
    'Q61103': 'MX-VER',  # Veracruz
    'Q61104': 'MX-YUC',  # Yucatan
    'Q61105': 'MX-ZAC',  # Zacatecas
    # Egyptian governorates
    'Q85': 'EG-C',  # Cairo
    'Q87': 'EG-ALX',  # Alexandria
    'Q204060': 'EG-GZ',  # Giza
    # Dominican Republic provinces
    'Q18393': 'DO-01',  # Distrito Nacional (Santo Domingo)
    # Jamaica parishes
    'Q3534362': 'JM-01',  # Kingston
    # Jamaican capital
    'Q34692': 'JM-01',  # Kingston city
    # Ukrainian oblasts
    'Q1899': 'UA-30',  # Kyiv
    'Q7525': 'UA-05',  # Vinnytsia Oblast
    'Q7526': 'UA-07',  # Volyn Oblast
    'Q7528': 'UA-12',  # Dnipropetrovsk Oblast
    'Q7530': 'UA-14',  # Donetsk Oblast
    'Q7531': 'UA-18',  # Zhytomyr Oblast
    'Q7532': 'UA-21',  # Zakarpattia Oblast
    'Q7533': 'UA-23',  # Zaporizhzhia Oblast
    'Q7534': 'UA-26',  # Ivano-Frankivsk Oblast
    'Q7535': 'UA-32',  # Kyiv Oblast
    'Q7536': 'UA-35',  # Kirovohrad Oblast
    'Q7537': 'UA-09',  # Luhansk Oblast
    'Q7538': 'UA-46',  # Lviv Oblast
    'Q7539': 'UA-48',  # Mykolaiv Oblast
    'Q7540': 'UA-51',  # Odesa Oblast
    'Q7541': 'UA-53',  # Poltava Oblast
    'Q7542': 'UA-56',  # Rivne Oblast
    'Q7543': 'UA-59',  # Sumy Oblast
    'Q7544': 'UA-61',  # Ternopil Oblast
    'Q7545': 'UA-63',  # Kharkiv Oblast
    'Q7546': 'UA-65',  # Kherson Oblast
    'Q7547': 'UA-68',  # Khmelnytskyi Oblast
    'Q7548': 'UA-71',  # Cherkasy Oblast
    'Q7549': 'UA-74',  # Chernivtsi Oblast
    'Q7550': 'UA-77',  # Chernihiv Oblast
    # Iranian provinces
    'Q160766': 'IR-30',  # Razavi Khorasan (Mashhad)
    'Q170416': 'IR-23',  # Tehran
    # Mozambique provinces
    'Q182329': 'MZ-MPM',  # Maputo Province
    'Q182323': 'MZ-L',  # Maputo City
    # Czech regions (kraje)
    'Q1085': 'CZ-10',  # Prague (capital city)
    'Q193702': 'CZ-10',  # Prague (region)
    'Q18473': 'CZ-20',  # Central Bohemian Region (Středočeský kraj)
    'Q18475': 'CZ-31',  # South Bohemian Region (Jihočeský kraj)
    'Q18471': 'CZ-32',  # Plzeň Region (Plzeňský kraj)
    'Q18461': 'CZ-41',  # Karlovy Vary Region (Karlovarský kraj)
    'Q18476': 'CZ-42',  # Ústí nad Labem Region (Ústecký kraj)
    'Q18465': 'CZ-51',  # Liberec Region (Liberecký kraj)
    'Q18463': 'CZ-52',  # Hradec Králové Region (Královéhradecký kraj)
    'Q18468': 'CZ-53',  # Pardubice Region (Pardubický kraj)
    'Q18478': 'CZ-63',  # Vysočina Region
    'Q18460': 'CZ-64',  # South Moravian Region (Jihomoravský kraj)
    'Q18467': 'CZ-71',  # Olomouc Region (Olomoucký kraj)
    'Q18479': 'CZ-72',  # Zlín Region (Zlínský kraj)
    'Q18466': 'CZ-80',  # Moravian-Silesian Region (Moravskoslezský kraj)
    # Czech major cities (to their regions)
    'Q14960': 'CZ-64',  # Brno -> South Moravian
    'Q81137': 'CZ-80',  # Ostrava -> Moravian-Silesian
    'Q157311': 'CZ-32',  # Plzeň -> Plzeň Region
    'Q81938': 'CZ-51',  # Liberec -> Liberec Region
    'Q81979': 'CZ-71',  # Olomouc -> Olomouc Region
    'Q80284': 'CZ-31',  # České Budějovice -> South Bohemian
    'Q82057': 'CZ-52',  # Hradec Králové -> Hradec Králové Region
    'Q82197': 'CZ-42',  # Ústí nad Labem -> Ústí nad Labem Region
    'Q82463': 'CZ-53',  # Pardubice -> Pardubice Region
    # Belgian regions and provinces
    # NOTE(review): Q31 is believed to be the item for Belgium (the country),
    # not Flanders. Country (P17) values are looked up in this table by the
    # resolver, so a wrong key here would assign every unresolved Belgian
    # record to BE-VLG - confirm the three region QIDs below.
    'Q31': 'BE-VLG',  # Flanders
    'Q234': 'BE-WAL',  # Wallonia
    'Q240': 'BE-BRU',  # Brussels-Capital Region
    # Flemish provinces
    'Q1112': 'BE-VAN',  # Antwerp
    'Q1114': 'BE-VLI',  # Limburg (Belgium)
    'Q1116': 'BE-VBR',  # Flemish Brabant
    'Q1117': 'BE-VOV',  # East Flanders
    'Q1118': 'BE-VWV',  # West Flanders
    # Walloon provinces
    'Q1127': 'BE-WBR',  # Walloon Brabant
    'Q1128': 'BE-WHT',  # Hainaut
    'Q1130': 'BE-WLG',  # Liège
    'Q1131': 'BE-WLX',  # Luxembourg (Belgium)
    'Q1132': 'BE-WNA',  # Namur
    # Belgian major cities (to their provinces)
    'Q12988': 'BE-BRU',  # Brussels -> Brussels-Capital
    'Q12892': 'BE-VAN',  # Antwerp city -> Antwerp province
    'Q12996': 'BE-VOV',  # Ghent -> East Flanders
    'Q12994': 'BE-VWV',  # Bruges -> West Flanders
    'Q118958': 'BE-WLG',  # Liège city -> Liège province
    'Q162022': 'BE-WHT',  # Charleroi -> Hainaut
    'Q162163': 'BE-VLI',  # Hasselt -> Limburg
    'Q12990': 'BE-VBR',  # Leuven -> Flemish Brabant
    'Q162176': 'BE-WNA',  # Namur city -> Namur province
    # Bulgarian oblasts (provinces)
    'Q7921': 'BG-22',  # Sofia City
    'Q188812': 'BG-23',  # Sofia Province
    'Q215072': 'BG-01',  # Blagoevgrad
    'Q215129': 'BG-02',  # Burgas
    'Q215165': 'BG-08',  # Dobrich
    'Q215196': 'BG-07',  # Gabrovo
    'Q215235': 'BG-26',  # Haskovo
    'Q215270': 'BG-09',  # Kardzhali
    'Q215303': 'BG-10',  # Kyustendil
    'Q215340': 'BG-11',  # Lovech
    'Q215378': 'BG-12',  # Montana
    'Q215407': 'BG-13',  # Pazardzhik
    'Q215446': 'BG-14',  # Pernik
    'Q215475': 'BG-15',  # Pleven
    'Q215504': 'BG-16',  # Plovdiv
    'Q215538': 'BG-17',  # Razgrad
    'Q215565': 'BG-18',  # Ruse
    'Q215605': 'BG-27',  # Shumen
    'Q215636': 'BG-19',  # Silistra
    'Q215666': 'BG-20',  # Sliven
    'Q215696': 'BG-21',  # Smolyan
    'Q215727': 'BG-24',  # Stara Zagora
    'Q215758': 'BG-25',  # Targovishte
    'Q215787': 'BG-03',  # Varna
    'Q215820': 'BG-04',  # Veliko Tarnovo
    'Q215856': 'BG-05',  # Vidin
    'Q215882': 'BG-06',  # Vratsa
    'Q215917': 'BG-28',  # Yambol
    # Bulgarian major cities
    'Q472': 'BG-22',  # Sofia city -> Sofia City
    'Q35825': 'BG-16',  # Plovdiv city -> Plovdiv
    'Q36367': 'BG-03',  # Varna city -> Varna
    'Q37106': 'BG-02',  # Burgas city -> Burgas
    'Q37252': 'BG-18',  # Ruse city -> Ruse
    # Philippine regions/NCR
    'Q13580': 'PH-00',  # Metro Manila (NCR)
    'Q13586': 'PH-05',  # Bicol Region
    # Oman governorates
    'Q193076': 'OM-MA',  # Muscat
    # Uzbekistan regions
    'Q269': 'UZ-TK',  # Tashkent
    # Denmark regions
    'Q26073': 'DK-84',  # Capital Region of Denmark
    # Netherlands provinces (for completeness)
    'Q694': 'NL-NH',  # North Holland
    'Q695': 'NL-ZH',  # South Holland
    'Q696': 'NL-UT',  # Utrecht
    'Q772': 'NL-GE',  # Gelderland
    'Q775': 'NL-LI',  # Limburg
    'Q776': 'NL-NB',  # North Brabant
    'Q777': 'NL-OV',  # Overijssel
    'Q778': 'NL-FR',  # Friesland
    'Q779': 'NL-GR',  # Groningen
    'Q780': 'NL-DR',  # Drenthe
    'Q781': 'NL-FL',  # Flevoland
    'Q782': 'NL-ZE',  # Zeeland
    # French regions (new 2016 regions)
    # NOTE(review): most of these entries previously shared the truncated keys
    # 'Q18677' / 'Q12130' / 'Q18578', so only the last duplicate survived. The
    # full QIDs below are believed correct - confirm each against Wikidata.
    'Q13917': 'FR-IDF',  # Île-de-France
    'Q13947': 'FR-CVL',  # Centre-Val de Loire
    'Q18578267': 'FR-BFC',  # Bourgogne-Franche-Comté
    'Q18677875': 'FR-NOR',  # Normandy
    'Q18677767': 'FR-HDF',  # Hauts-de-France
    'Q18677983': 'FR-GES',  # Grand Est
    'Q16994': 'FR-PDL',  # Pays de la Loire
    'Q12130': 'FR-BRE',  # Brittany
    'Q18678082': 'FR-NAQ',  # Nouvelle-Aquitaine
    'Q18678265': 'FR-OCC',  # Occitanie
    'Q18338206': 'FR-ARA',  # Auvergne-Rhône-Alpes
    'Q15104': 'FR-PAC',  # Provence-Alpes-Côte d'Azur
    'Q14112': 'FR-COR',  # Corsica
    'Q90': 'FR-IDF',  # Paris -> Île-de-France
}
def query_p131_chain_for_entity(qid: str) -> Optional[str]:
    """
    Find the ISO 3166-2 region code for one Wikidata entity via its P131 chain.

    Lookup order:
    1. The hardcoded WIKIDATA_TO_ISO mapping for ``qid`` itself.
    2. A SPARQL query over ``wdt:P131*`` (zero or more "located in" hops, so
       the entity itself is included). The first returned row that has a P300
       value containing a hyphen, or whose QID is in WIKIDATA_TO_ISO, wins.

    Args:
        qid: Wikidata item ID such as ``"Q90"``.

    Returns:
        The ISO 3166-2 code (e.g. ``"FR-IDF"``), or None when ``qid`` is not a
        well-formed item ID, the request fails, or no code is found.
    """
    # The ID is interpolated into the SPARQL text below; reject anything that
    # is not a plain item ID rather than sending a malformed or injected query.
    if not isinstance(qid, str) or not re.fullmatch(r'Q[0-9]+', qid):
        print(f" Skipping invalid Wikidata ID: {qid!r}")
        return None

    # First check if this entity itself is in our mapping
    if qid in WIKIDATA_TO_ISO:
        return WIKIDATA_TO_ISO[qid]

    # Query P131 chain with P300 codes
    query = f"""
    SELECT ?admin ?adminLabel ?iso_code WHERE {{
      wd:{qid} wdt:P131* ?admin.
      OPTIONAL {{ ?admin wdt:P300 ?iso_code. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 30
    """
    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    # Passing a body makes urllib issue a POST with the form-encoded query.
    data = urllib.parse.urlencode({'query': query}).encode('utf-8')
    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        # Best effort: any network or decoding problem means "not resolved".
        print(f" P131 chain query error for {qid}: {e}")
        return None

    # The query has no ORDER BY, so rows are scanned in endpoint order and the
    # first usable one is returned.
    for row in bindings:
        # Check P300 ISO code; subdivision codes always contain a hyphen.
        iso_code = row.get('iso_code', {}).get('value', '')
        if iso_code and '-' in iso_code:
            return iso_code
        # Check our hardcoded mapping
        admin_uri = row.get('admin', {}).get('value', '')
        if admin_uri:
            admin_qid = admin_uri.split('/')[-1]
            if admin_qid in WIKIDATA_TO_ISO:
                return WIKIDATA_TO_ISO[admin_qid]
    return None
def get_location_entities(qid: str) -> List[str]:
    """
    Get location-related entities for a Wikidata entity.

    Reads the direct values of P131 (located in), P159 (headquarters),
    P276 (location) and P17 (country) in a single SPARQL query.

    Args:
        qid: Wikidata item ID such as ``"Q90"``.

    Returns:
        De-duplicated list of item IDs to check for a P131 chain, ordered by
        property priority (P131, P159, P276, then P17) so the coarse country
        fallback is always tried last. Empty when ``qid`` is not a well-formed
        item ID or the request fails.
    """
    qid_pattern = re.compile(r'Q[0-9]+')
    # The ID is interpolated into the SPARQL text below; reject anything that
    # is not a plain item ID rather than sending a malformed or injected query.
    if not isinstance(qid, str) or not qid_pattern.fullmatch(qid):
        print(f" Skipping invalid Wikidata ID: {qid!r}")
        return []

    query = f"""
    SELECT ?prop ?value WHERE {{
      VALUES ?prop {{ wdt:P131 wdt:P159 wdt:P276 wdt:P17 }}
      wd:{qid} ?prop ?value.
    }}
    """
    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    data = urllib.parse.urlencode({'query': query}).encode('utf-8')
    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        # Best effort: any network or decoding problem means "no candidates".
        print(f" Location entities query error for {qid}: {e}")
        return []

    # The query has no ORDER BY, so impose a deterministic order ourselves:
    # most specific property first, country last, unknown properties after.
    priority = {'P131': 0, 'P159': 1, 'P276': 2, 'P17': 3}

    def rank(row: Dict[str, Any]) -> int:
        prop_id = row.get('prop', {}).get('value', '').rsplit('/', 1)[-1]
        return priority.get(prop_id, len(priority))

    candidates = []
    for row in sorted(bindings, key=rank):  # sorted() is stable
        value_qid = row.get('value', {}).get('value', '').split('/')[-1]
        # Keep only item IDs; anything else cannot be queried as wd:<id>.
        if qid_pattern.fullmatch(value_qid):
            candidates.append(value_qid)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(candidates))
def query_p131_chain(qid: str) -> Optional[str]:
    """
    Resolve a Wikidata entity to an ISO 3166-2 region code.

    Strategies, tried in order until one yields a code:
    1. The hardcoded WIKIDATA_TO_ISO mapping for the entity itself.
    2. The entity's own P131 chain.
    3. The P131 chain of each related location entity reported by
       get_location_entities(), in the order that function returns them.

    Returns the ISO code if found, None otherwise.
    """
    # Shortcut: the entity itself may be a known administrative unit.
    try:
        return WIKIDATA_TO_ISO[qid]
    except KeyError:
        pass

    # Walk the entity's own "located in" chain.
    own_code = query_p131_chain_for_entity(qid)
    if own_code:
        return own_code

    # Fall back to related location entities; map() is lazy, so lookups stop
    # at the first candidate that resolves.
    resolved_codes = map(query_p131_chain_for_entity, get_location_entities(qid))
    return next((code for code in resolved_codes if code), None)
def update_file_with_region(filepath: Path, iso_code: str, admin_label: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """
    Update a custodian file with a resolved region code.

    The file is only touched when its ``ghcid.location_resolution`` has a
    country code and its region code is still ``XX`` (or missing). On success
    the region fields, the GHCID string, the GHCID history and a provenance
    note are updated, and the file is renamed from ``<CC>-XX-...`` to
    ``<CC>-<region>-...``.

    Args:
        filepath: Path of the custodian YAML file.
        iso_code: ISO 3166-2 code such as ``"AR-B"``; a value without a hyphen
            is used as the region code as-is.
        admin_label: Human-readable label stored as ``region_name`` and quoted
            in the history/provenance entries.
        dry_run: When True (default) nothing is written or renamed.

    Returns:
        ``(updated, new_path)``. ``updated`` is False when the file cannot be
        read, does not qualify, the ISO country does not match the file, or
        the rename target already exists. ``new_path`` is the renamed (or, in
        a dry run, would-be) path, or None when the name does not change.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # Empty or non-mapping documents (and null sections) have nothing to update.
    if not isinstance(data, dict):
        return False, None
    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        return False, None
    loc_res = ghcid.get('location_resolution')
    if not isinstance(loc_res, dict):
        return False, None

    country_code = loc_res.get('country_code', '')
    if not country_code:
        return False, None
    # Only unresolved regions are filled in; existing values are never replaced.
    if loc_res.get('region_code', 'XX') != 'XX':
        return False, None

    # Extract region part from ISO code (e.g., "AR-B" -> "B", "CH-GE" -> "GE")
    if '-' in iso_code:
        iso_country, region_code = iso_code.split('-')[:2]
        # Verify country matches
        if iso_country != country_code:
            print(f" Warning: ISO country {iso_country} != file country {country_code}")
            return False, None
    else:
        region_code = iso_code
    # An empty region (e.g. "AR-") would produce a malformed "AR--..." GHCID.
    if not region_code:
        print(f" Warning: no region part in ISO code {iso_code!r}")
        return False, None

    unresolved_prefix = f'{country_code}-XX-'
    resolved_prefix = f'{country_code}-{region_code}-'

    # Determine the new filename up front: if it is already taken, bail out
    # before changing anything so the file content and its name cannot diverge.
    new_filepath = filepath.parent / filepath.name.replace(unresolved_prefix, resolved_prefix)
    will_rename = new_filepath != filepath
    if will_rename and new_filepath.exists():
        print(f" Warning: target file {new_filepath.name} already exists, skipping")
        return False, None

    # One timestamp for every field written by this update.
    now = datetime.now(timezone.utc)

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = admin_label
    loc_res['method'] = 'WIKIDATA_P131'
    loc_res['iso_code_source'] = iso_code
    loc_res['resolution_timestamp'] = now.isoformat()

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(unresolved_prefix, resolved_prefix)
    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid
        if ghcid.get('ghcid_history') is None:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now.isoformat(),
            'reason': f"Region resolved via Wikidata P131: XX->{region_code} ({admin_label})"
        })

    # Add provenance note, normalising a missing/null/scalar "notes" to a list
    if data.get('provenance') is None:
        data['provenance'] = {}
    provenance = data['provenance']
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    provenance['notes'] = notes
    notes.append(
        f"Region resolved {now.strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via Wikidata P131 ({admin_label})"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        if will_rename:
            filepath.rename(new_filepath)
    return True, new_filepath if will_rename else None
def _find_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Return the first Wikidata ID recorded in a custodian document, if any."""
    # Sections may be present but null in the YAML, hence the "or {}" guards.
    enrichment = data.get('wikidata_enrichment') or {}
    original = data.get('original_entry') or {}
    wikidata_id = enrichment.get('wikidata_entity_id')
    if not wikidata_id:
        wikidata_id = original.get('wikidata_id')
    # Also check identifiers list
    if not wikidata_id:
        for ident in original.get('identifiers') or []:
            if ident.get('identifier_scheme') == 'Wikidata':
                wikidata_id = ident.get('identifier_value')
                break
    return wikidata_id


def main():
    """
    Command-line entry point.

    Scans the custodian directory for ``*-XX-*.yaml`` files, keeps those that
    have a country code (optionally matching ``--country``) and a Wikidata ID,
    resolves each one through query_p131_chain() and updates the file via
    update_file_with_region(). Runs as a dry run unless ``--apply`` is given.
    Exits with status 1 when the directory does not exist.
    """
    import argparse
    import time

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131 hierarchy'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply
    print("=" * 70)
    print("REGION RESOLUTION VIA WIKIDATA P131 HIERARCHY")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()
    print("NOTE: This script only resolves region codes (XX).")
    print(" For city/settlement resolution, use resolve_locations_geonames.py")
    print()

    # Find files with XX region codes; sorted so repeated runs see the same order.
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))
    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract Wikidata IDs. The limit counts files that pass
    # the filters, so --country still yields results when matching files are
    # not among the first --limit entries of the directory listing.
    file_data = []
    for filepath in files_to_process:
        if len(file_data) >= args.limit:
            break
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not isinstance(data, dict):
                continue
            # Get country code
            location = (data.get('ghcid') or {}).get('location_resolution') or {}
            country = location.get('country_code')
            if not country:
                continue
            if args.country and country != args.country:
                continue
            # Get Wikidata ID from various locations
            wikidata_id = _find_wikidata_id(data)
            if not wikidata_id:
                continue
            file_data.append({
                'filepath': filepath,
                'country': country,
                'wikidata_id': wikidata_id
            })
        except Exception as e:
            # Best effort: a broken file is reported and skipped.
            print(f"Error loading {filepath}: {e}")
    print(f"Processing {len(file_data)} files with Wikidata IDs")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    failed = 0
    for entry in file_data:
        filepath = entry['filepath']
        qid = entry['wikidata_id']
        print(f"Processing {filepath.name} ({qid})...")
        # Query P131 chain for ISO code
        iso_code = query_p131_chain(qid)
        if not iso_code:
            print(" No ISO code found")
            failed += 1
            time.sleep(0.5)  # Rate limiting
            continue
        # No admin name is available here, so the ISO code doubles as the label.
        admin_label = iso_code
        # Update file
        success, new_path = update_file_with_region(filepath, iso_code, admin_label, dry_run=dry_run)
        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name} ({iso_code})")
            else:
                print(f" Updated: {filepath.name} ({iso_code})")
        else:
            failed += 1
            print(" Failed to update")
        time.sleep(0.5)  # Rate limiting

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"Failed: {failed}")
    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()