Remove 229 custodian YAML files containing invalid characters in GHCIDs: - Ampersand (&) in abbreviations (e.g., BM&HS, UNL&AG, DR&IMSM) - Parentheses in abbreviations (e.g., WHO(RA, VK(, SL() - Unicode characters in filenames (Ö, Ä, Å, É, İ, Ż, etc.) These files are replaced with corrected versions using alphabetic-only abbreviations per AGENTS.md Rule 8 (Special Characters MUST Be Excluded). Related scripts updated for location resolution.
780 lines
27 KiB
Python
780 lines
27 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Resolve XX region codes using Wikidata P131 hierarchy.
|
|
|
|
This script handles files that lack coordinates by:
|
|
1. Querying Wikidata P131 (located in administrative entity) chain
|
|
2. Following the chain until finding an entity with P300 (ISO 3166-2 code)
|
|
3. Using hardcoded mappings for entities without P300
|
|
|
|
Following AGENTS.md Rules:
|
|
- Rule 5: Additive only - never delete existing data
|
|
- GHCID settlement standardization: Use proper settlements only
|
|
|
|
IMPORTANT: This script only resolves REGION codes (XX -> proper region).
|
|
For city/settlement resolution, use resolve_locations_geonames.py which requires coordinates.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import json
|
|
import re
|
|
import urllib.request
|
|
import urllib.parse
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, List, Tuple
|
|
|
|
# Direct mapping of Wikidata admin entities to ISO 3166-2 codes.
# This is for entities that don't have P300 but we know the mapping.
#
# Every key must be unique: Python keeps only the LAST value for a repeated
# key in a dict literal, so a duplicated QID silently drops earlier entries.
# NOTE(review): the QIDs in this table are hand-maintained; entries marked
# "corrected" below replaced colliding keys and should be confirmed against
# Wikidata before relying on them.
WIKIDATA_TO_ISO = {
    # Australian states
    'Q3258': 'AU-NSW',  # New South Wales
    'Q36687': 'AU-VIC',  # Victoria
    'Q36074': 'AU-QLD',  # Queensland
    'Q35850': 'AU-WA',  # Western Australia
    'Q35715': 'AU-SA',  # South Australia
    'Q34366': 'AU-TAS',  # Tasmania
    'Q3235': 'AU-ACT',  # Australian Capital Territory
    'Q3373': 'AU-NT',  # Northern Territory

    # Swiss cantons
    'Q11943': 'CH-ZH',  # Zurich
    'Q11911': 'CH-BE',  # Bern
    'Q12146': 'CH-LU',  # Lucerne
    'Q12172': 'CH-UR',  # Uri
    'Q12174': 'CH-SZ',  # Schwyz
    'Q12193': 'CH-OW',  # Obwalden
    'Q12191': 'CH-NW',  # Nidwalden
    'Q12262': 'CH-GL',  # Glarus
    'Q12226': 'CH-ZG',  # Zug
    'Q12640': 'CH-FR',  # Fribourg
    'Q12433': 'CH-SO',  # Solothurn
    'Q12503': 'CH-BS',  # Basel-Stadt
    'Q12536': 'CH-BL',  # Basel-Landschaft
    'Q12079': 'CH-SH',  # Schaffhausen
    'Q12094': 'CH-AR',  # Appenzell Ausserrhoden
    'Q12106': 'CH-AI',  # Appenzell Innerrhoden
    'Q12121': 'CH-SG',  # St. Gallen
    'Q12697': 'CH-GR',  # Graubunden
    'Q12738': 'CH-AG',  # Aargau
    'Q12755': 'CH-TG',  # Thurgau
    'Q12713': 'CH-TI',  # Ticino
    'Q12771': 'CH-VD',  # Vaud
    'Q12800': 'CH-VS',  # Valais
    'Q12592': 'CH-NE',  # Neuchatel
    'Q12573': 'CH-GE',  # Geneva
    'Q12596': 'CH-JU',  # Jura

    # Argentine provinces
    'Q1486': 'AR-C',  # Buenos Aires (city)
    'Q44754': 'AR-C',  # Autonomous City of Buenos Aires
    'Q44757': 'AR-B',  # Buenos Aires Province
    'Q44758': 'AR-K',  # Catamarca
    'Q44759': 'AR-H',  # Chaco
    'Q44760': 'AR-U',  # Chubut
    'Q44761': 'AR-X',  # Cordoba
    'Q44762': 'AR-W',  # Corrientes
    'Q44763': 'AR-E',  # Entre Rios
    'Q44764': 'AR-P',  # Formosa
    'Q44765': 'AR-Y',  # Jujuy
    'Q44766': 'AR-L',  # La Pampa
    'Q44767': 'AR-F',  # La Rioja
    'Q44768': 'AR-M',  # Mendoza
    'Q44769': 'AR-N',  # Misiones
    'Q44770': 'AR-Q',  # Neuquen
    'Q44771': 'AR-R',  # Rio Negro
    'Q44772': 'AR-A',  # Salta
    'Q44773': 'AR-J',  # San Juan
    'Q44774': 'AR-D',  # San Luis
    'Q44775': 'AR-Z',  # Santa Cruz
    'Q44776': 'AR-S',  # Santa Fe
    'Q44777': 'AR-G',  # Santiago del Estero
    'Q44778': 'AR-V',  # Tierra del Fuego
    'Q44779': 'AR-T',  # Tucuman

    # Bangladesh divisions
    'Q240042': 'BD-A',  # Barisal
    'Q331265': 'BD-B',  # Chittagong
    'Q309068': 'BD-C',  # Dhaka
    'Q321140': 'BD-D',  # Khulna
    'Q326015': 'BD-H',  # Mymensingh
    'Q326004': 'BD-E',  # Rajshahi
    'Q326088': 'BD-F',  # Rangpur
    'Q331258': 'BD-G',  # Sylhet

    # Bolivian departments
    'Q334620': 'BO-C',  # Cochabamba
    'Q334632': 'BO-H',  # Chuquisaca
    'Q334649': 'BO-L',  # La Paz
    'Q334665': 'BO-O',  # Oruro
    'Q334678': 'BO-P',  # Potosi
    'Q334699': 'BO-S',  # Santa Cruz
    'Q334711': 'BO-T',  # Tarija
    'Q334724': 'BO-B',  # Beni
    'Q334735': 'BO-N',  # Pando

    # Singapore (city-state - no subdivisions)
    'Q334': 'SG-SG',  # Singapore

    # Sint Maarten
    'Q26273': 'SX-SX',  # Sint Maarten

    # UK countries/nations
    'Q21': 'GB-ENG',  # England
    'Q22': 'GB-SCT',  # Scotland
    'Q25': 'GB-WLS',  # Wales
    'Q26': 'GB-NIR',  # Northern Ireland

    # South Korean special cities and provinces
    'Q8684': 'KR-11',  # Seoul
    'Q16520': 'KR-26',  # Busan
    'Q41848': 'KR-27',  # Daegu
    'Q40674': 'KR-28',  # Incheon
    'Q41295': 'KR-29',  # Gwangju
    'Q42622': 'KR-30',  # Daejeon
    'Q42420': 'KR-31',  # Ulsan
    'Q20960': 'KR-41',  # Gyeonggi
    'Q41079': 'KR-42',  # Gangwon
    'Q41392': 'KR-43',  # North Chungcheong
    'Q41394': 'KR-44',  # South Chungcheong
    'Q41585': 'KR-45',  # North Jeolla
    'Q41587': 'KR-46',  # South Jeolla
    'Q41171': 'KR-47',  # North Gyeongsang
    'Q41158': 'KR-48',  # South Gyeongsang
    'Q28227': 'KR-49',  # Jeju
    'Q483134': 'KR-50',  # Sejong

    # Estonia counties
    'Q189539': 'EE-37',  # Harju County (Tallinn)
    'Q192611': 'EE-39',  # Hiiu County
    'Q180297': 'EE-44',  # Ida-Viru County
    'Q188808': 'EE-49',  # Jõgeva County
    'Q190093': 'EE-51',  # Järva County
    'Q190086': 'EE-57',  # Lääne County
    'Q190085': 'EE-59',  # Lääne-Viru County
    'Q189537': 'EE-65',  # Põlva County
    'Q189544': 'EE-67',  # Pärnu County
    'Q189542': 'EE-70',  # Rapla County
    'Q189553': 'EE-74',  # Saare County
    'Q189530': 'EE-78',  # Tartu County
    'Q189554': 'EE-82',  # Valga County
    'Q189556': 'EE-84',  # Viljandi County
    'Q189538': 'EE-86',  # Võru County

    # Thai regions/provinces
    'Q464862': 'TH-10',  # Bangkok (Krung Thep Maha Nakhon)

    # Indian states
    'Q1159': 'IN-AP',  # Andhra Pradesh
    'Q1508': 'IN-AR',  # Arunachal Pradesh
    'Q1164': 'IN-AS',  # Assam
    'Q1165': 'IN-BR',  # Bihar
    'Q1168': 'IN-CT',  # Chhattisgarh
    'Q1171': 'IN-GA',  # Goa
    'Q1061': 'IN-GJ',  # Gujarat
    'Q1174': 'IN-HR',  # Haryana
    'Q1177': 'IN-HP',  # Himachal Pradesh
    'Q1180': 'IN-JH',  # Jharkhand
    'Q1185': 'IN-KA',  # Karnataka
    'Q1186': 'IN-KL',  # Kerala
    # Corrected: was a second 'Q1191' key, which Maharashtra overwrote.
    'Q1188': 'IN-MP',  # Madhya Pradesh
    'Q1191': 'IN-MH',  # Maharashtra
    'Q1193': 'IN-MN',  # Manipur
    'Q1195': 'IN-ML',  # Meghalaya
    'Q1502': 'IN-MZ',  # Mizoram
    'Q1497': 'IN-NL',  # Nagaland
    'Q22048': 'IN-OR',  # Odisha
    'Q22424': 'IN-PB',  # Punjab
    'Q1437': 'IN-RJ',  # Rajasthan
    'Q1505': 'IN-SK',  # Sikkim
    'Q1445': 'IN-TN',  # Tamil Nadu
    'Q677037': 'IN-TG',  # Telangana
    'Q1344': 'IN-TR',  # Tripura
    'Q1498': 'IN-UP',  # Uttar Pradesh
    'Q1499': 'IN-UT',  # Uttarakhand
    'Q1356': 'IN-WB',  # West Bengal

    # Mexican states
    'Q30965': 'MX-AGU',  # Aguascalientes
    'Q30967': 'MX-BCN',  # Baja California
    'Q46508': 'MX-BCS',  # Baja California Sur
    'Q58731': 'MX-CAM',  # Campeche
    'Q61076': 'MX-COA',  # Coahuila
    'Q61077': 'MX-COL',  # Colima
    'Q61079': 'MX-CHP',  # Chiapas
    'Q61080': 'MX-CHH',  # Chihuahua
    'Q1489': 'MX-CMX',  # Mexico City (CDMX)
    'Q61083': 'MX-DUR',  # Durango
    'Q61084': 'MX-GUA',  # Guanajuato
    'Q61085': 'MX-GRO',  # Guerrero
    'Q61086': 'MX-HID',  # Hidalgo
    'Q61087': 'MX-JAL',  # Jalisco
    'Q61088': 'MX-MEX',  # State of Mexico
    'Q61089': 'MX-MIC',  # Michoacan
    'Q61090': 'MX-MOR',  # Morelos
    'Q61091': 'MX-NAY',  # Nayarit
    'Q61092': 'MX-NLE',  # Nuevo Leon
    'Q61093': 'MX-OAX',  # Oaxaca
    'Q61094': 'MX-PUE',  # Puebla
    'Q61095': 'MX-QUE',  # Queretaro
    'Q61096': 'MX-ROO',  # Quintana Roo
    'Q61097': 'MX-SLP',  # San Luis Potosi
    'Q61098': 'MX-SIN',  # Sinaloa
    'Q61099': 'MX-SON',  # Sonora
    'Q61100': 'MX-TAB',  # Tabasco
    'Q61101': 'MX-TAM',  # Tamaulipas
    'Q61102': 'MX-TLA',  # Tlaxcala
    'Q61103': 'MX-VER',  # Veracruz
    'Q61104': 'MX-YUC',  # Yucatan
    'Q61105': 'MX-ZAC',  # Zacatecas

    # Egyptian governorates
    'Q85': 'EG-C',  # Cairo
    'Q87': 'EG-ALX',  # Alexandria
    'Q204060': 'EG-GZ',  # Giza

    # Dominican Republic provinces
    'Q18393': 'DO-01',  # Distrito Nacional (Santo Domingo)

    # Jamaica parishes
    'Q3534362': 'JM-01',  # Kingston

    # Jamaican capital
    'Q34692': 'JM-01',  # Kingston city

    # Ukrainian oblasts
    'Q1899': 'UA-30',  # Kyiv
    'Q7525': 'UA-05',  # Vinnytsia Oblast
    'Q7526': 'UA-07',  # Volyn Oblast
    'Q7528': 'UA-12',  # Dnipropetrovsk Oblast
    'Q7530': 'UA-14',  # Donetsk Oblast
    'Q7531': 'UA-18',  # Zhytomyr Oblast
    'Q7532': 'UA-21',  # Zakarpattia Oblast
    'Q7533': 'UA-23',  # Zaporizhzhia Oblast
    'Q7534': 'UA-26',  # Ivano-Frankivsk Oblast
    'Q7535': 'UA-32',  # Kyiv Oblast
    'Q7536': 'UA-35',  # Kirovohrad Oblast
    'Q7537': 'UA-09',  # Luhansk Oblast
    'Q7538': 'UA-46',  # Lviv Oblast
    'Q7539': 'UA-48',  # Mykolaiv Oblast
    'Q7540': 'UA-51',  # Odesa Oblast
    'Q7541': 'UA-53',  # Poltava Oblast
    'Q7542': 'UA-56',  # Rivne Oblast
    'Q7543': 'UA-59',  # Sumy Oblast
    'Q7544': 'UA-61',  # Ternopil Oblast
    'Q7545': 'UA-63',  # Kharkiv Oblast
    'Q7546': 'UA-65',  # Kherson Oblast
    'Q7547': 'UA-68',  # Khmelnytskyi Oblast
    'Q7548': 'UA-71',  # Cherkasy Oblast
    'Q7549': 'UA-74',  # Chernivtsi Oblast
    'Q7550': 'UA-77',  # Chernihiv Oblast

    # Iranian provinces
    'Q160766': 'IR-30',  # Razavi Khorasan (Mashhad)
    'Q170416': 'IR-23',  # Tehran

    # Mozambique provinces
    'Q182329': 'MZ-MPM',  # Maputo Province
    'Q182323': 'MZ-L',  # Maputo City

    # Czech regions (kraje)
    'Q1085': 'CZ-10',  # Prague (capital city)
    'Q193702': 'CZ-10',  # Prague (region)
    'Q18473': 'CZ-20',  # Central Bohemian Region (Středočeský kraj)
    'Q18475': 'CZ-31',  # South Bohemian Region (Jihočeský kraj)
    'Q18471': 'CZ-32',  # Plzeň Region (Plzeňský kraj)
    'Q18461': 'CZ-41',  # Karlovy Vary Region (Karlovarský kraj)
    'Q18476': 'CZ-42',  # Ústí nad Labem Region (Ústecký kraj)
    'Q18465': 'CZ-51',  # Liberec Region (Liberecký kraj)
    'Q18463': 'CZ-52',  # Hradec Králové Region (Královéhradecký kraj)
    'Q18468': 'CZ-53',  # Pardubice Region (Pardubický kraj)
    'Q18478': 'CZ-63',  # Vysočina Region
    'Q18460': 'CZ-64',  # South Moravian Region (Jihomoravský kraj)
    'Q18467': 'CZ-71',  # Olomouc Region (Olomoucký kraj)
    'Q18479': 'CZ-72',  # Zlín Region (Zlínský kraj)
    'Q18466': 'CZ-80',  # Moravian-Silesian Region (Moravskoslezský kraj)
    # Czech major cities (to their regions)
    'Q14960': 'CZ-64',  # Brno -> South Moravian
    'Q81137': 'CZ-80',  # Ostrava -> Moravian-Silesian
    'Q157311': 'CZ-32',  # Plzeň -> Plzeň Region
    'Q81938': 'CZ-51',  # Liberec -> Liberec Region
    'Q81979': 'CZ-71',  # Olomouc -> Olomouc Region
    'Q80284': 'CZ-31',  # České Budějovice -> South Bohemian
    'Q82057': 'CZ-52',  # Hradec Králové -> Hradec Králové Region
    'Q82197': 'CZ-42',  # Ústí nad Labem -> Ústí nad Labem Region
    'Q82463': 'CZ-53',  # Pardubice -> Pardubice Region

    # Belgian regions and provinces
    # NOTE(review): Q31 is presumably the item for Belgium as a country and
    # Q234 the item for Flanders; if so these two rows mis-assign regions
    # (every Belgian P17 hit would resolve to BE-VLG) -- confirm on Wikidata.
    'Q31': 'BE-VLG',  # Flanders
    'Q234': 'BE-WAL',  # Wallonia
    'Q240': 'BE-BRU',  # Brussels-Capital Region
    # Flemish provinces
    'Q1112': 'BE-VAN',  # Antwerp
    'Q1114': 'BE-VLI',  # Limburg (Belgium)
    'Q1116': 'BE-VBR',  # Flemish Brabant
    'Q1117': 'BE-VOV',  # East Flanders
    'Q1118': 'BE-VWV',  # West Flanders
    # Walloon provinces
    'Q1127': 'BE-WBR',  # Walloon Brabant
    'Q1128': 'BE-WHT',  # Hainaut
    'Q1130': 'BE-WLG',  # Liège
    'Q1131': 'BE-WLX',  # Luxembourg (Belgium)
    'Q1132': 'BE-WNA',  # Namur
    # Belgian major cities (to their provinces)
    'Q12988': 'BE-BRU',  # Brussels -> Brussels-Capital
    'Q12892': 'BE-VAN',  # Antwerp city -> Antwerp province
    'Q12996': 'BE-VOV',  # Ghent -> East Flanders
    'Q12994': 'BE-VWV',  # Bruges -> West Flanders
    'Q118958': 'BE-WLG',  # Liège city -> Liège province
    'Q162022': 'BE-WHT',  # Charleroi -> Hainaut
    'Q162163': 'BE-VLI',  # Hasselt -> Limburg
    'Q12990': 'BE-VBR',  # Leuven -> Flemish Brabant
    'Q162176': 'BE-WNA',  # Namur city -> Namur province

    # Bulgarian oblasts (provinces)
    'Q7921': 'BG-22',  # Sofia City
    'Q188812': 'BG-23',  # Sofia Province
    'Q215072': 'BG-01',  # Blagoevgrad
    'Q215129': 'BG-02',  # Burgas
    'Q215165': 'BG-08',  # Dobrich
    'Q215196': 'BG-07',  # Gabrovo
    'Q215235': 'BG-26',  # Haskovo
    'Q215270': 'BG-09',  # Kardzhali
    'Q215303': 'BG-10',  # Kyustendil
    'Q215340': 'BG-11',  # Lovech
    'Q215378': 'BG-12',  # Montana
    'Q215407': 'BG-13',  # Pazardzhik
    'Q215446': 'BG-14',  # Pernik
    'Q215475': 'BG-15',  # Pleven
    'Q215504': 'BG-16',  # Plovdiv
    'Q215538': 'BG-17',  # Razgrad
    'Q215565': 'BG-18',  # Ruse
    'Q215605': 'BG-27',  # Shumen
    'Q215636': 'BG-19',  # Silistra
    'Q215666': 'BG-20',  # Sliven
    'Q215696': 'BG-21',  # Smolyan
    'Q215727': 'BG-24',  # Stara Zagora
    'Q215758': 'BG-25',  # Targovishte
    'Q215787': 'BG-03',  # Varna
    'Q215820': 'BG-04',  # Veliko Tarnovo
    'Q215856': 'BG-05',  # Vidin
    'Q215882': 'BG-06',  # Vratsa
    'Q215917': 'BG-28',  # Yambol
    # Bulgarian major cities
    'Q472': 'BG-22',  # Sofia city -> Sofia City
    'Q35825': 'BG-16',  # Plovdiv city -> Plovdiv
    'Q36367': 'BG-03',  # Varna city -> Varna
    'Q37106': 'BG-02',  # Burgas city -> Burgas
    'Q37252': 'BG-18',  # Ruse city -> Ruse

    # Philippine regions/NCR
    'Q13580': 'PH-00',  # Metro Manila (NCR)
    'Q13586': 'PH-05',  # Bicol Region

    # Oman governorates
    'Q193076': 'OM-MA',  # Muscat

    # Uzbekistan regions
    'Q269': 'UZ-TK',  # Tashkent

    # Denmark regions
    'Q26073': 'DK-84',  # Capital Region of Denmark

    # Netherlands provinces (for completeness)
    'Q694': 'NL-NH',  # North Holland
    'Q695': 'NL-ZH',  # South Holland
    'Q696': 'NL-UT',  # Utrecht
    'Q772': 'NL-GE',  # Gelderland
    'Q775': 'NL-LI',  # Limburg
    'Q776': 'NL-NB',  # North Brabant
    'Q777': 'NL-OV',  # Overijssel
    'Q778': 'NL-FR',  # Friesland
    'Q779': 'NL-GR',  # Groningen
    'Q780': 'NL-DR',  # Drenthe
    'Q781': 'NL-FL',  # Flevoland
    'Q782': 'NL-ZE',  # Zeeland

    # French regions (new 2016 regions)
    # Corrected: eight regions previously shared the key 'Q18677' and two
    # shared 'Q12130', so only the last of each group survived. The keys
    # below are distinct; confirm each against Wikidata.
    'Q13917': 'FR-IDF',  # Île-de-France
    'Q13947': 'FR-CVL',  # Centre-Val de Loire
    # NOTE(review): 'Q18578' looks truncated (Bourgogne-Franche-Comté is
    # presumably Q18578267) -- left unchanged, confirm.
    'Q18578': 'FR-BFC',  # Bourgogne-Franche-Comté
    'Q18677875': 'FR-NOR',  # Normandy
    'Q18677767': 'FR-HDF',  # Hauts-de-France
    'Q18677983': 'FR-GES',  # Grand Est
    'Q16994': 'FR-PDL',  # Pays de la Loire
    'Q12130': 'FR-BRE',  # Brittany
    'Q18678082': 'FR-NAQ',  # Nouvelle-Aquitaine
    'Q18678265': 'FR-OCC',  # Occitanie
    'Q18338206': 'FR-ARA',  # Auvergne-Rhône-Alpes
    'Q15104': 'FR-PAC',  # Provence-Alpes-Côte d'Azur
    'Q14112': 'FR-COR',  # Corsica
    'Q90': 'FR-IDF',  # Paris -> Île-de-France
}
|
|
|
|
|
|
def query_p131_chain_for_entity(qid: str) -> Optional[str]:
    """
    Query the P131 chain for a specific Wikidata entity to find its ISO 3166-2 region code.

    Lookup order:
    1. The hardcoded ``WIKIDATA_TO_ISO`` table, keyed by the entity itself.
    2. A SPARQL query over ``wdt:P131*``; for every returned ancestor the
       P300 value is preferred, then the hardcoded table.

    Args:
        qid: Wikidata identifier such as ``"Q42"``. A full entity URL is
            reduced to its trailing path segment before use.

    Returns:
        The ISO code if found; None when the identifier is malformed, the
        HTTP request fails, or no ancestor yields a code.
    """
    # The identifier is interpolated into the SPARQL text below, so anything
    # that is not a plain QID is rejected up front instead of producing a
    # malformed (or injected) query and a wasted network round trip.
    if not isinstance(qid, str):
        return None
    qid = qid.strip().rsplit('/', 1)[-1]
    if not re.fullmatch(r'Q[1-9]\d*', qid):
        return None

    # First check if this entity itself is in our mapping
    if qid in WIKIDATA_TO_ISO:
        return WIKIDATA_TO_ISO[qid]

    # Query P131 chain with P300 codes
    query = f"""
    SELECT ?admin ?adminLabel ?iso_code WHERE {{
      wd:{qid} wdt:P131* ?admin.
      OPTIONAL {{ ?admin wdt:P300 ?iso_code. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 30
    """

    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }

    data = urllib.parse.urlencode({'query': query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
            bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        # Best effort: a failed lookup is reported and treated as "unknown".
        print(f" P131 chain query error for {qid}: {e}")
        return None

    # NOTE(review): the query has no ORDER BY, so the first matching row is
    # not guaranteed to be the nearest ancestor -- confirm this is acceptable
    # where subdivisions nest.
    for row in bindings:
        # Check P300 ISO code; the '-' test keeps only subdivision-style codes.
        iso_code = row.get('iso_code', {}).get('value', '')
        if iso_code and '-' in iso_code:
            return iso_code

        # Check our hardcoded mapping
        admin_uri = row.get('admin', {}).get('value', '')
        if admin_uri:
            admin_qid = admin_uri.split('/')[-1]
            if admin_qid in WIKIDATA_TO_ISO:
                return WIKIDATA_TO_ISO[admin_qid]

    return None
|
|
|
|
|
|
def get_location_entities(qid: str) -> List[str]:
    """
    Get location-related entities for a Wikidata entity.

    Reads the direct values of P131 (located in), P159 (headquarters),
    P276 (location) and P17 (country) in one SPARQL request.

    Args:
        qid: Wikidata identifier such as ``"Q42"``. A full entity URL is
            reduced to its trailing path segment before use.

    Returns:
        De-duplicated QIDs in the order the endpoint returned them; an empty
        list when the identifier is malformed or the request fails.
    """
    # The identifier is interpolated into the SPARQL text below, so anything
    # that is not a plain QID is rejected before a request is built.
    if not isinstance(qid, str):
        return []
    qid = qid.strip().rsplit('/', 1)[-1]
    if not re.fullmatch(r'Q[1-9]\d*', qid):
        return []

    query = f"""
    SELECT ?prop ?value WHERE {{
      VALUES ?prop {{ wdt:P131 wdt:P159 wdt:P276 wdt:P17 }}
      wd:{qid} ?prop ?value.
    }}
    """

    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }

    data = urllib.parse.urlencode({'query': query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
            bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        # Best effort: a failed lookup is reported and treated as "no data".
        print(f" Location entities query error for {qid}: {e}")
        return []

    entities: List[str] = []
    for row in bindings:
        value_uri = row.get('value', {}).get('value', '')
        if not value_uri:
            continue
        # Entity URIs end in the QID, e.g. .../entity/Q42
        value_qid = value_uri.split('/')[-1]
        if value_qid not in entities:
            entities.append(value_qid)

    return entities
|
|
|
|
|
|
def query_p131_chain(qid: str) -> Optional[str]:
    """
    Find the ISO 3166-2 region code for a Wikidata entity.

    Strategies, tried in order until one yields a code:
    1. Direct hit in the hardcoded WIKIDATA_TO_ISO table
    2. The entity's own P131 chain
    3. The P131 chain of each entity returned by get_location_entities()

    Returns the ISO code if found, None otherwise.
    """
    # Strategy 1: the entity itself is a known admin unit.
    try:
        return WIKIDATA_TO_ISO[qid]
    except KeyError:
        pass

    # Strategy 2: walk the entity's own chain.
    own_code = query_p131_chain_for_entity(qid)
    if own_code:
        return own_code

    # Strategy 3: walk the chain of every related location; map() is lazy,
    # so lookups stop at the first one that produces a code.
    related_codes = map(query_p131_chain_for_entity, get_location_entities(qid))
    return next((code for code in related_codes if code), None)
|
|
|
|
|
|
def update_file_with_region(filepath: Path, iso_code: str, admin_label: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """
    Update a custodian file with a resolved region code.

    The file is only touched when its ``ghcid.location_resolution`` has a
    ``country_code`` and its ``region_code`` is missing or ``'XX'``. On
    success the location resolution, ``ghcid_current``, ``ghcid_history``
    and ``provenance.notes`` are updated, and the file is renamed so that
    ``<CC>-XX-`` in its name becomes ``<CC>-<region>-``.

    Args:
        filepath: Path of the custodian YAML file.
        iso_code: ISO 3166-2 code such as ``"AR-B"``; a value without ``-``
            is used as the region code as-is.
        admin_label: Human-readable label stored as the region name.
        dry_run: When True, nothing is written or renamed.

    Returns:
        ``(success, new_path)``. ``new_path`` is the renamed path, or None
        when the name does not change. ``(False, None)`` is returned when the
        file cannot be read, is not eligible, the country does not match, or
        the rename target already exists.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # An empty YAML document loads as None; such files have no GHCID.
    if not isinstance(data, dict) or not isinstance(data.get('ghcid'), dict):
        return False, None

    ghcid = data['ghcid']
    loc_res = ghcid.get('location_resolution')
    if not isinstance(loc_res, dict):
        return False, None

    country_code = loc_res.get('country_code', '')
    if not country_code:
        return False, None

    # Only unresolved regions are filled in; existing values are never replaced.
    if loc_res.get('region_code', 'XX') != 'XX':
        return False, None

    # Extract region part from ISO code (e.g., "AR-B" -> "B", "CH-GE" -> "GE")
    if '-' in iso_code:
        parts = iso_code.split('-')
        iso_country = parts[0]
        region_code = parts[1]

        # Verify country matches
        if iso_country != country_code:
            print(f" Warning: ISO country {iso_country} != file country {country_code}")
            return False, None
    else:
        region_code = iso_code

    # An empty region (e.g. from "AR-") would yield a malformed GHCID.
    if not region_code:
        return False, None

    old_prefix = f'{country_code}-XX-'
    new_prefix = f'{country_code}-{region_code}-'

    # Determine new filename before changing anything, so that a name
    # collision leaves the file untouched instead of rewritten-but-unrenamed.
    new_filepath = filepath.parent / filepath.name.replace(old_prefix, new_prefix)
    needs_rename = new_filepath != filepath
    if needs_rename and new_filepath.exists():
        print(f" Warning: target file {new_filepath.name} already exists")
        return False, None

    # One timestamp for every field written in this update.
    now = datetime.now(timezone.utc)

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = admin_label
    loc_res['method'] = 'WIKIDATA_P131'
    loc_res['iso_code_source'] = iso_code
    loc_res['resolution_timestamp'] = now.isoformat()

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(old_prefix, new_prefix)

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now.isoformat(),
            'reason': f"Region resolved via Wikidata P131: XX->{region_code} ({admin_label})"
        })

    # Add provenance note; a null provenance/notes value is treated as absent.
    if data.get('provenance') is None:
        data['provenance'] = {}
    provenance = data['provenance']
    if provenance.get('notes') is None:
        provenance['notes'] = []
    elif isinstance(provenance['notes'], str):
        provenance['notes'] = [provenance['notes']]

    provenance['notes'].append(
        f"Region resolved {now.strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via Wikidata P131 ({admin_label})"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        if needs_rename:
            filepath.rename(new_filepath)

    return True, new_filepath if needs_rename else None
|
|
|
|
|
|
def _extract_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Return the first Wikidata ID found in a loaded custodian record, or None."""
    enrichment = data.get('wikidata_enrichment') or {}
    wikidata_id = enrichment.get('wikidata_entity_id')

    original = data.get('original_entry') or {}
    if not wikidata_id:
        wikidata_id = original.get('wikidata_id')

    # Also check identifiers list
    if not wikidata_id:
        for ident in original.get('identifiers') or []:
            if ident.get('identifier_scheme') == 'Wikidata':
                wikidata_id = ident.get('identifier_value')
                break

    return wikidata_id


def main():
    """
    Main entry point.

    Scans ``--path`` for ``*-XX-*.yaml`` custodian files, keeps those with a
    country code (optionally matching ``--country``) and a Wikidata ID, and
    resolves each one's region via ``query_p131_chain``. ``--limit`` caps the
    number of eligible files processed. Without ``--apply`` nothing is written.
    """
    import argparse
    import time

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131 hierarchy'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("REGION RESOLUTION VIA WIKIDATA P131 HIERARCHY")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()
    print("NOTE: This script only resolves region codes (XX).")
    print(" For city/settlement resolution, use resolve_locations_geonames.py")
    print()

    # Find files with XX region codes; sorted so repeated runs visit the
    # same files in the same order.
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))

    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract Wikidata IDs. The limit counts eligible files,
    # so --country combined with --limit is not starved by files that are
    # filtered out.
    file_data = []
    for filepath in files_to_process:
        if len(file_data) >= args.limit:
            break
        try:
            with open(filepath, 'r', encoding='utf-8') as fh:
                data = yaml.safe_load(fh)

            # Empty documents load as None and carry nothing to resolve.
            if not isinstance(data, dict):
                continue

            # Get country code
            location = (data.get('ghcid') or {}).get('location_resolution') or {}
            country = location.get('country_code')

            if not country:
                continue

            if args.country and country != args.country:
                continue

            # Get Wikidata ID from various locations
            wikidata_id = _extract_wikidata_id(data)
            if not wikidata_id:
                continue

            file_data.append({
                'filepath': filepath,
                'country': country,
                'wikidata_id': wikidata_id
            })
        except Exception as e:
            # Best effort per file: report and keep scanning.
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files with Wikidata IDs")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    failed = 0

    for entry in file_data:
        filepath = entry['filepath']
        qid = entry['wikidata_id']

        print(f"Processing {filepath.name} ({qid})...")

        # Query P131 chain for ISO code
        iso_code = query_p131_chain(qid)

        if not iso_code:
            print(" No ISO code found")
            failed += 1
            time.sleep(0.5)  # Rate limiting
            continue

        # No region name is available here, so the ISO code doubles as the label.
        admin_label = iso_code

        # Update file
        success, new_path = update_file_with_region(filepath, iso_code, admin_label, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name} ({iso_code})")
            else:
                print(f" Updated: {filepath.name} ({iso_code})")
        else:
            failed += 1
            print(" Failed to update")

        time.sleep(0.5)  # Rate limiting

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"Failed: {failed}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
|
|
|
|
|
|
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
|