glam/scripts/resolve_locations_p131.py
2025-12-07 00:26:01 +01:00

613 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 hierarchy.
This script handles files that lack coordinates by:
1. Querying Wikidata P131 (located in administrative entity) chain
2. Following the chain until finding an entity with P300 (ISO 3166-2 code)
3. Using hardcoded mappings for entities without P300
Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
- GHCID settlement standardization: Use proper settlements only
IMPORTANT: This script only resolves REGION codes (XX -> proper region).
For city/settlement resolution, use resolve_locations_geonames.py which requires coordinates.
"""
import os
import sys
import yaml
import json
import re
import urllib.request
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
# Direct mapping of Wikidata admin entities to ISO 3166-2 codes.
# Used for entities that lack a P300 statement but whose subdivision is known.
#
# NOTE: every key must be unique. Python keeps only the LAST value for a key
# repeated inside a dict literal, so a duplicated QID silently drops the
# earlier entries.
# NOTE(review): several strictly sequential QID runs below (e.g. Q44757-Q44779,
# Q61076-Q61105, Q7525-Q7550) have not been cross-checked against Wikidata --
# verify before relying on them.
WIKIDATA_TO_ISO = {
    # Australian states
    'Q3258': 'AU-NSW',  # New South Wales
    'Q36687': 'AU-VIC',  # Victoria
    'Q36074': 'AU-QLD',  # Queensland
    'Q35850': 'AU-WA',  # Western Australia
    'Q35715': 'AU-SA',  # South Australia
    'Q34366': 'AU-TAS',  # Tasmania
    'Q3235': 'AU-ACT',  # Australian Capital Territory
    'Q3373': 'AU-NT',  # Northern Territory
    # Swiss cantons
    'Q11943': 'CH-ZH',  # Zurich
    'Q11911': 'CH-BE',  # Bern
    'Q12146': 'CH-LU',  # Lucerne
    'Q12172': 'CH-UR',  # Uri
    'Q12174': 'CH-SZ',  # Schwyz
    'Q12193': 'CH-OW',  # Obwalden
    'Q12191': 'CH-NW',  # Nidwalden
    'Q12262': 'CH-GL',  # Glarus
    'Q12226': 'CH-ZG',  # Zug
    'Q12640': 'CH-FR',  # Fribourg
    'Q12433': 'CH-SO',  # Solothurn
    'Q12503': 'CH-BS',  # Basel-Stadt
    'Q12536': 'CH-BL',  # Basel-Landschaft
    'Q12079': 'CH-SH',  # Schaffhausen
    'Q12094': 'CH-AR',  # Appenzell Ausserrhoden
    'Q12106': 'CH-AI',  # Appenzell Innerrhoden
    'Q12121': 'CH-SG',  # St. Gallen
    'Q12697': 'CH-GR',  # Graubunden
    'Q12738': 'CH-AG',  # Aargau
    'Q12755': 'CH-TG',  # Thurgau
    'Q12713': 'CH-TI',  # Ticino
    'Q12771': 'CH-VD',  # Vaud
    'Q12800': 'CH-VS',  # Valais
    'Q12592': 'CH-NE',  # Neuchatel
    'Q12573': 'CH-GE',  # Geneva
    'Q12596': 'CH-JU',  # Jura
    # Argentine provinces
    'Q1486': 'AR-C',  # Buenos Aires (city)
    'Q44754': 'AR-C',  # Autonomous City of Buenos Aires
    'Q44757': 'AR-B',  # Buenos Aires Province
    'Q44758': 'AR-K',  # Catamarca
    'Q44759': 'AR-H',  # Chaco
    'Q44760': 'AR-U',  # Chubut
    'Q44761': 'AR-X',  # Cordoba
    'Q44762': 'AR-W',  # Corrientes
    'Q44763': 'AR-E',  # Entre Rios
    'Q44764': 'AR-P',  # Formosa
    'Q44765': 'AR-Y',  # Jujuy
    'Q44766': 'AR-L',  # La Pampa
    'Q44767': 'AR-F',  # La Rioja
    'Q44768': 'AR-M',  # Mendoza
    'Q44769': 'AR-N',  # Misiones
    'Q44770': 'AR-Q',  # Neuquen
    'Q44771': 'AR-R',  # Rio Negro
    'Q44772': 'AR-A',  # Salta
    'Q44773': 'AR-J',  # San Juan
    'Q44774': 'AR-D',  # San Luis
    'Q44775': 'AR-Z',  # Santa Cruz
    'Q44776': 'AR-S',  # Santa Fe
    'Q44777': 'AR-G',  # Santiago del Estero
    'Q44778': 'AR-V',  # Tierra del Fuego
    'Q44779': 'AR-T',  # Tucuman
    # Bangladesh divisions
    'Q240042': 'BD-A',  # Barisal
    'Q331265': 'BD-B',  # Chittagong
    'Q309068': 'BD-C',  # Dhaka
    'Q321140': 'BD-D',  # Khulna
    'Q326015': 'BD-H',  # Mymensingh
    'Q326004': 'BD-E',  # Rajshahi
    'Q326088': 'BD-F',  # Rangpur
    'Q331258': 'BD-G',  # Sylhet
    # Bolivian departments
    'Q334620': 'BO-C',  # Cochabamba
    'Q334632': 'BO-H',  # Chuquisaca
    'Q334649': 'BO-L',  # La Paz
    'Q334665': 'BO-O',  # Oruro
    'Q334678': 'BO-P',  # Potosi
    'Q334699': 'BO-S',  # Santa Cruz
    'Q334711': 'BO-T',  # Tarija
    'Q334724': 'BO-B',  # Beni
    'Q334735': 'BO-N',  # Pando
    # Singapore (city-state - no subdivisions)
    'Q334': 'SG-SG',  # Singapore
    # Sint Maarten
    'Q26273': 'SX-SX',  # Sint Maarten
    # UK countries/nations
    'Q21': 'GB-ENG',  # England
    'Q22': 'GB-SCT',  # Scotland
    'Q25': 'GB-WLS',  # Wales
    'Q26': 'GB-NIR',  # Northern Ireland
    # South Korean special cities and provinces
    'Q8684': 'KR-11',  # Seoul
    'Q16520': 'KR-26',  # Busan
    'Q41848': 'KR-27',  # Daegu
    'Q40674': 'KR-28',  # Incheon
    'Q41295': 'KR-29',  # Gwangju
    'Q42622': 'KR-30',  # Daejeon
    'Q42420': 'KR-31',  # Ulsan
    'Q20960': 'KR-41',  # Gyeonggi
    'Q41079': 'KR-42',  # Gangwon
    'Q41392': 'KR-43',  # North Chungcheong
    'Q41394': 'KR-44',  # South Chungcheong
    'Q41585': 'KR-45',  # North Jeolla
    'Q41587': 'KR-46',  # South Jeolla
    'Q41171': 'KR-47',  # North Gyeongsang
    'Q41158': 'KR-48',  # South Gyeongsang
    'Q28227': 'KR-49',  # Jeju
    'Q483134': 'KR-50',  # Sejong
    # Estonia counties
    'Q189539': 'EE-37',  # Harju County (Tallinn)
    'Q192611': 'EE-39',  # Hiiu County
    'Q180297': 'EE-44',  # Ida-Viru County
    'Q188808': 'EE-49',  # Jõgeva County
    'Q190093': 'EE-51',  # Järva County
    'Q190086': 'EE-57',  # Lääne County
    'Q190085': 'EE-59',  # Lääne-Viru County
    'Q189537': 'EE-65',  # Põlva County
    'Q189544': 'EE-67',  # Pärnu County
    'Q189542': 'EE-70',  # Rapla County
    'Q189553': 'EE-74',  # Saare County
    'Q189530': 'EE-78',  # Tartu County
    'Q189554': 'EE-82',  # Valga County
    'Q189556': 'EE-84',  # Viljandi County
    'Q189538': 'EE-86',  # Võru County
    # Thai regions/provinces
    'Q464862': 'TH-10',  # Bangkok (Krung Thep Maha Nakhon)
    # Indian states
    'Q1159': 'IN-AP',  # Andhra Pradesh
    'Q1508': 'IN-AR',  # Arunachal Pradesh
    'Q1164': 'IN-AS',  # Assam
    'Q1165': 'IN-BR',  # Bihar
    'Q1168': 'IN-CT',  # Chhattisgarh
    'Q1171': 'IN-GA',  # Goa
    'Q1061': 'IN-GJ',  # Gujarat
    'Q1174': 'IN-HR',  # Haryana
    'Q1177': 'IN-HP',  # Himachal Pradesh
    'Q1180': 'IN-JH',  # Jharkhand
    'Q1185': 'IN-KA',  # Karnataka
    'Q1186': 'IN-KL',  # Kerala
    # Was keyed 'Q1191', colliding with Maharashtra below, so this entry was
    # silently overwritten. TODO confirm Q1188 against Wikidata.
    'Q1188': 'IN-MP',  # Madhya Pradesh
    'Q1191': 'IN-MH',  # Maharashtra
    'Q1193': 'IN-MN',  # Manipur
    'Q1195': 'IN-ML',  # Meghalaya
    'Q1502': 'IN-MZ',  # Mizoram
    'Q1497': 'IN-NL',  # Nagaland
    'Q22048': 'IN-OR',  # Odisha
    'Q22424': 'IN-PB',  # Punjab
    'Q1437': 'IN-RJ',  # Rajasthan
    'Q1505': 'IN-SK',  # Sikkim
    'Q1445': 'IN-TN',  # Tamil Nadu
    'Q677037': 'IN-TG',  # Telangana
    'Q1344': 'IN-TR',  # Tripura
    'Q1498': 'IN-UP',  # Uttar Pradesh
    'Q1499': 'IN-UT',  # Uttarakhand
    'Q1356': 'IN-WB',  # West Bengal
    # Mexican states
    'Q30965': 'MX-AGU',  # Aguascalientes
    'Q30967': 'MX-BCN',  # Baja California
    'Q46508': 'MX-BCS',  # Baja California Sur
    'Q58731': 'MX-CAM',  # Campeche
    'Q61076': 'MX-COA',  # Coahuila
    'Q61077': 'MX-COL',  # Colima
    'Q61079': 'MX-CHP',  # Chiapas
    'Q61080': 'MX-CHH',  # Chihuahua
    'Q1489': 'MX-CMX',  # Mexico City (CDMX)
    'Q61083': 'MX-DUR',  # Durango
    'Q61084': 'MX-GUA',  # Guanajuato
    'Q61085': 'MX-GRO',  # Guerrero
    'Q61086': 'MX-HID',  # Hidalgo
    'Q61087': 'MX-JAL',  # Jalisco
    'Q61088': 'MX-MEX',  # State of Mexico
    'Q61089': 'MX-MIC',  # Michoacan
    'Q61090': 'MX-MOR',  # Morelos
    'Q61091': 'MX-NAY',  # Nayarit
    'Q61092': 'MX-NLE',  # Nuevo Leon
    'Q61093': 'MX-OAX',  # Oaxaca
    'Q61094': 'MX-PUE',  # Puebla
    'Q61095': 'MX-QUE',  # Queretaro
    'Q61096': 'MX-ROO',  # Quintana Roo
    'Q61097': 'MX-SLP',  # San Luis Potosi
    'Q61098': 'MX-SIN',  # Sinaloa
    'Q61099': 'MX-SON',  # Sonora
    'Q61100': 'MX-TAB',  # Tabasco
    'Q61101': 'MX-TAM',  # Tamaulipas
    'Q61102': 'MX-TLA',  # Tlaxcala
    'Q61103': 'MX-VER',  # Veracruz
    'Q61104': 'MX-YUC',  # Yucatan
    'Q61105': 'MX-ZAC',  # Zacatecas
    # Egyptian governorates
    'Q85': 'EG-C',  # Cairo
    'Q87': 'EG-ALX',  # Alexandria
    'Q204060': 'EG-GZ',  # Giza
    # Dominican Republic provinces
    'Q18393': 'DO-01',  # Distrito Nacional (Santo Domingo)
    # Jamaica parishes
    'Q3534362': 'JM-01',  # Kingston
    # Jamaican capital
    'Q34692': 'JM-01',  # Kingston city
    # Ukrainian oblasts
    'Q1899': 'UA-30',  # Kyiv
    'Q7525': 'UA-05',  # Vinnytsia Oblast
    'Q7526': 'UA-07',  # Volyn Oblast
    'Q7528': 'UA-12',  # Dnipropetrovsk Oblast
    'Q7530': 'UA-14',  # Donetsk Oblast
    'Q7531': 'UA-18',  # Zhytomyr Oblast
    'Q7532': 'UA-21',  # Zakarpattia Oblast
    'Q7533': 'UA-23',  # Zaporizhzhia Oblast
    'Q7534': 'UA-26',  # Ivano-Frankivsk Oblast
    'Q7535': 'UA-32',  # Kyiv Oblast
    'Q7536': 'UA-35',  # Kirovohrad Oblast
    'Q7537': 'UA-09',  # Luhansk Oblast
    'Q7538': 'UA-46',  # Lviv Oblast
    'Q7539': 'UA-48',  # Mykolaiv Oblast
    'Q7540': 'UA-51',  # Odesa Oblast
    'Q7541': 'UA-53',  # Poltava Oblast
    'Q7542': 'UA-56',  # Rivne Oblast
    'Q7543': 'UA-59',  # Sumy Oblast
    'Q7544': 'UA-61',  # Ternopil Oblast
    'Q7545': 'UA-63',  # Kharkiv Oblast
    'Q7546': 'UA-65',  # Kherson Oblast
    'Q7547': 'UA-68',  # Khmelnytskyi Oblast
    'Q7548': 'UA-71',  # Cherkasy Oblast
    'Q7549': 'UA-74',  # Chernivtsi Oblast
    'Q7550': 'UA-77',  # Chernihiv Oblast
    # Iranian provinces
    'Q160766': 'IR-30',  # Razavi Khorasan (Mashhad)
    'Q170416': 'IR-23',  # Tehran
    # Mozambique provinces
    'Q182329': 'MZ-MPM',  # Maputo Province
    'Q182323': 'MZ-L',  # Maputo City
    # Czech regions
    'Q193702': 'CZ-PR',  # Prague
    # Philippine regions/NCR
    'Q13580': 'PH-00',  # Metro Manila (NCR)
    'Q13586': 'PH-05',  # Bicol Region
    # Oman governorates
    'Q193076': 'OM-MA',  # Muscat
    # Uzbekistan regions
    'Q269': 'UZ-TK',  # Tashkent
    # Denmark regions
    'Q26073': 'DK-84',  # Capital Region of Denmark
    # Netherlands provinces (for completeness)
    'Q694': 'NL-NH',  # North Holland
    'Q695': 'NL-ZH',  # South Holland
    'Q696': 'NL-UT',  # Utrecht
    'Q772': 'NL-GE',  # Gelderland
    'Q775': 'NL-LI',  # Limburg
    'Q776': 'NL-NB',  # North Brabant
    'Q777': 'NL-OV',  # Overijssel
    'Q778': 'NL-FR',  # Friesland
    'Q779': 'NL-GR',  # Groningen
    'Q780': 'NL-DR',  # Drenthe
    'Q781': 'NL-FL',  # Flevoland
    'Q782': 'NL-ZE',  # Zeeland
    # French regions (new 2016 regions)
    # The previous table reused 'Q18677' for seven regions and 'Q12130' for
    # two, so only the last entry of each survived; the keys looked like
    # truncated QIDs. Full QIDs are restored below -- TODO confirm each one
    # against Wikidata.
    'Q13917': 'FR-IDF',  # Île-de-France
    'Q13947': 'FR-CVL',  # Centre-Val de Loire
    'Q18578267': 'FR-BFC',  # Bourgogne-Franche-Comté
    'Q18677875': 'FR-NOR',  # Normandy
    'Q18677767': 'FR-HDF',  # Hauts-de-France
    'Q18677983': 'FR-GES',  # Grand Est
    'Q16994': 'FR-PDL',  # Pays de la Loire
    'Q12130': 'FR-BRE',  # Brittany
    'Q18678082': 'FR-NAQ',  # Nouvelle-Aquitaine
    'Q18678265': 'FR-OCC',  # Occitanie
    'Q18338206': 'FR-ARA',  # Auvergne-Rhône-Alpes
    'Q15104': 'FR-PAC',  # Provence-Alpes-Côte d'Azur
    'Q14112': 'FR-COR',  # Corsica
    'Q90': 'FR-IDF',  # Paris -> Île-de-France
}
def query_p131_chain(qid: str) -> Optional[str]:
    """
    Find the ISO 3166-2 region code for a Wikidata entity via its P131 chain.

    The entity is first looked up in WIKIDATA_TO_ISO. Otherwise the Wikidata
    SPARQL endpoint is asked for every entity reachable through zero or more
    P131 hops, and the first row carrying either a hyphenated P300 value or a
    QID present in WIKIDATA_TO_ISO wins.

    Args:
        qid: Wikidata entity ID such as "Q42". Surrounding whitespace is
            ignored; anything else that is not a plain QID is rejected.

    Returns:
        The ISO 3166-2 code (e.g. "FR-IDF"), or None when the ID is invalid,
        the query fails, or no ancestor yields a code.
    """
    # The ID is interpolated into the SPARQL text below, so only a plain QID
    # is accepted. This blocks query injection from malformed YAML values and
    # avoids a pointless HTTP round trip for IDs that can never resolve.
    if not isinstance(qid, str) or not re.fullmatch(r'Q[1-9][0-9]*', qid.strip()):
        print(f" Invalid Wikidata ID {qid!r}; skipping P131 lookup")
        return None
    qid = qid.strip()

    # First check if this entity itself is in our mapping
    if qid in WIKIDATA_TO_ISO:
        return WIKIDATA_TO_ISO[qid]

    # Query P131 chain with P300 codes
    query = f"""
    SELECT ?admin ?adminLabel ?iso_code WHERE {{
      wd:{qid} wdt:P131* ?admin.
      OPTIONAL {{ ?admin wdt:P300 ?iso_code. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 30
    """
    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    # Supplying a body makes urllib issue a POST request.
    data = urllib.parse.urlencode({'query': query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup in a batch run, so
        # one network/HTTP/JSON failure must not abort the remaining files.
        print(f" P131 chain query error for {qid}: {e}")
        return None

    # NOTE(review): the query has no ORDER BY, so the rows are not guaranteed
    # to arrive nearest-ancestor first; when several ancestors carry a P300
    # value the one returned here may not be the most specific.
    for row in bindings:
        # Check P300 ISO code ('-' filters out values that are not subdivision codes)
        iso_code = row.get('iso_code', {}).get('value', '')
        if iso_code and '-' in iso_code:
            return iso_code

        # Check our hardcoded mapping
        admin_uri = row.get('admin', {}).get('value', '')
        if admin_uri:
            # Entity URIs end in the QID.
            admin_qid = admin_uri.split('/')[-1]
            if admin_qid in WIKIDATA_TO_ISO:
                return WIKIDATA_TO_ISO[admin_qid]

    return None
def update_file_with_region(filepath: Path, iso_code: str, admin_label: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with a resolved region code.

    The file is only touched when its ``ghcid.location_resolution`` has a
    country code and its region code is missing or still ``XX``. The region
    is written into the location resolution, the ``<CC>-XX-`` segment of the
    current GHCID and of the filename is replaced, and a history entry plus a
    provenance note are appended.

    Args:
        filepath: Path of the custodian YAML file.
        iso_code: ISO 3166-2 code (e.g. "AR-B"); a value without a hyphen is
            used as the region code as-is.
        admin_label: Human-readable name stored as ``region_name`` and quoted
            in the history/provenance text.
        dry_run: When True (default) nothing is written or renamed.

    Returns:
        ``(True, new_path)`` when resolved and the filename changes,
        ``(True, None)`` when resolved with an unchanged filename, and
        ``(False, None)`` when the file is skipped: unreadable, not eligible,
        country mismatch, empty region part, or the target filename exists.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Best-effort: an unreadable or unparsable file is reported and skipped.
        print(f" Error reading {filepath}: {e}")
        return False, None

    # An empty YAML document loads as None; anything but a mapping cannot
    # hold a GHCID section.
    if not isinstance(data, dict):
        return False, None
    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        return False, None

    # Without a location_resolution mapping there is no country code either.
    loc_res = ghcid.get('location_resolution')
    if not isinstance(loc_res, dict):
        return False, None
    country_code = loc_res.get('country_code', '')
    if not country_code:
        return False, None
    if loc_res.get('region_code', 'XX') != 'XX':
        return False, None  # already resolved - never overwrite

    # Extract region part from ISO code (e.g., "AR-B" -> "B", "CH-GE" -> "GE").
    # Split on the first hyphen only so the whole subdivision part is kept.
    iso_country, hyphen, region_code = iso_code.partition('-')
    if hyphen:
        # Verify country matches
        if iso_country != country_code:
            print(f" Warning: ISO country {iso_country} != file country {country_code}")
            return False, None
    else:
        region_code = iso_code
    if not region_code:
        # An empty region would produce a malformed "<CC>--" identifier.
        return False, None

    old_segment = f'{country_code}-XX-'
    new_segment = f'{country_code}-{region_code}-'

    # Work out the target filename BEFORE mutating anything. If another file
    # already owns that name, bail out instead of rewriting the content and
    # leaving a file whose name and GHCID disagree.
    new_filepath = filepath.parent / filepath.name.replace(old_segment, new_segment)
    needs_rename = new_filepath != filepath
    if needs_rename and new_filepath.exists():
        print(f" Warning: target file {new_filepath.name} already exists, skipping")
        return False, None

    # One timestamp for the whole update so all records agree.
    now = datetime.now(timezone.utc)

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = admin_label
    loc_res['method'] = 'WIKIDATA_P131'
    loc_res['iso_code_source'] = iso_code
    loc_res['resolution_timestamp'] = now.isoformat()

    # Update GHCID string ("or ''" tolerates an explicit null value)
    old_ghcid = ghcid.get('ghcid_current') or ''
    new_ghcid = old_ghcid.replace(old_segment, new_segment)
    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid
        if ghcid.get('ghcid_history') is None:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now.isoformat(),
            'reason': f"Region resolved via Wikidata P131: XX->{region_code} ({admin_label})"
        })

    # Add provenance note (existing notes are kept; a lone string becomes a list)
    if data.get('provenance') is None:
        data['provenance'] = {}
    provenance = data['provenance']
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    provenance['notes'] = notes
    notes.append(
        f"Region resolved {now.strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via Wikidata P131 ({admin_label})"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        if needs_rename:
            filepath.rename(new_filepath)

    return True, new_filepath if needs_rename else None
def main():
    """Main entry point.

    Parses the command line, collects ``*-XX-*.yaml`` custodian files that
    have both a country code and a Wikidata ID, resolves each one through
    query_p131_chain() and update_file_with_region(), and prints a summary.
    Runs as a dry run unless ``--apply`` is given.
    """
    import argparse
    import time

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131 hierarchy'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply
    print("=" * 70)
    print("REGION RESOLUTION VIA WIKIDATA P131 HIERARCHY")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()
    print("NOTE: This script only resolves region codes (XX).")
    print(" For city/settlement resolution, use resolve_locations_geonames.py")
    print()

    # Find files with XX region codes. Sorted so repeated runs (and the
    # --limit cut-off) see the files in the same order.
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))
    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract Wikidata IDs. The limit counts files that
    # actually qualify; slicing the raw list first could leave nothing to
    # process when --country (or a missing Wikidata ID) filters the slice.
    file_data = []
    for filepath in files_to_process:
        if len(file_data) >= args.limit:
            break
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                # An empty YAML document loads as None.
                data = yaml.safe_load(f) or {}

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')
            if not country:
                continue
            if args.country and country != args.country:
                continue

            # Get Wikidata ID (enrichment first, original entry as fallback)
            wikidata_id = None
            if 'wikidata_enrichment' in data:
                wikidata_id = data['wikidata_enrichment'].get('wikidata_entity_id')
            if not wikidata_id and 'original_entry' in data:
                wikidata_id = data['original_entry'].get('wikidata_id')
            if not wikidata_id:
                continue

            # Only the path and ID are needed later; the parsed YAML is not
            # kept because update_file_with_region() re-reads the file.
            file_data.append((filepath, wikidata_id))
        except Exception as e:
            # Best-effort: report the unreadable/odd file and keep going.
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files with Wikidata IDs")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    failed = 0
    for filepath, qid in file_data:
        print(f"Processing {filepath.name} ({qid})...")

        # Query P131 chain for ISO code
        iso_code = query_p131_chain(qid)
        if not iso_code:
            print(" No ISO code found")
            failed += 1
        else:
            # No region name is available here, so the ISO code doubles as label.
            admin_label = iso_code
            success, new_path = update_file_with_region(
                filepath, iso_code, admin_label, dry_run=dry_run
            )
            if success:
                resolved += 1
                if new_path:
                    renamed += 1
                    print(f" {filepath.name} -> {new_path.name} ({iso_code})")
                else:
                    print(f" Updated: {filepath.name} ({iso_code})")
            else:
                failed += 1
                print(" Failed to update")

        time.sleep(0.5)  # Rate limiting

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"Failed: {failed}")
    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()