glam/scripts/resolve_ar_xx_regions.py
2025-12-21 00:01:54 +01:00

349 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Resolve AR-XX-* files by querying Wikidata P131 chain to find province.
Uses ISO 3166-2:AR codes for standardization.
"""
import json
import yaml
import os
import re
import time
import shutil
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
# Wikidata province QIDs to ISO 3166-2:AR codes
# Source: https://en.wikipedia.org/wiki/ISO_3166-2:AR
# Used by find_province_from_chain() for exact QID matches in a P131 chain.
WIKIDATA_PROVINCE_QIDS = {
    "Q44754": "A",  # Salta
    "Q44705": "B",  # Buenos Aires Province
    "Q1486": "C",   # Buenos Aires (city / CABA)
    "Q44926": "D",  # San Luis
    "Q44762": "E",  # Entre Ríos
    "Q44821": "F",  # La Rioja
    "Q44690": "G",  # Santiago del Estero
    "Q44770": "H",  # Chaco
    "Q44915": "J",  # San Juan
    "Q44838": "K",  # Catamarca
    "Q44789": "L",  # La Pampa
    "Q44937": "M",  # Mendoza
    "Q44745": "N",  # Misiones
    "Q44797": "P",  # Formosa
    "Q44859": "Q",  # Neuquén
    "Q44869": "R",  # Río Negro
    "Q44713": "S",  # Santa Fe
    "Q44881": "T",  # Tucumán
    "Q44895": "U",  # Chubut
    "Q44908": "V",  # Tierra del Fuego
    "Q44729": "W",  # Corrientes
    "Q44778": "X",  # Córdoba
    "Q45034": "Y",  # Jujuy
    "Q44922": "Z",  # Santa Cruz
}
# Province names to ISO codes (for label matching)
# Fallback used by find_province_from_chain() when a chain entry's QID is
# unknown.  Keys MUST be lowercase: lookups are done on label.lower().strip().
PROVINCE_NAMES_TO_ISO = {
    "salta": "A",
    "provincia de salta": "A",
    "buenos aires": "B",  # Province (not city)
    "provincia de buenos aires": "B",
    "ciudad autónoma de buenos aires": "C",
    "ciudad de buenos aires": "C",
    "caba": "C",
    "san luis": "D",
    "provincia de san luis": "D",
    "entre ríos": "E",
    "provincia de entre ríos": "E",
    "la rioja": "F",
    "provincia de la rioja": "F",
    "santiago del estero": "G",
    "provincia de santiago del estero": "G",
    "chaco": "H",
    "provincia del chaco": "H",
    "san juan": "J",
    "provincia de san juan": "J",
    "catamarca": "K",
    "provincia de catamarca": "K",
    "la pampa": "L",
    "provincia de la pampa": "L",
    "mendoza": "M",
    "provincia de mendoza": "M",
    "misiones": "N",
    "provincia de misiones": "N",
    "formosa": "P",
    "provincia de formosa": "P",
    "neuquén": "Q",
    "provincia del neuquén": "Q",
    "río negro": "R",
    "provincia de río negro": "R",
    "santa fe": "S",
    "provincia de santa fe": "S",
    "tucumán": "T",
    "provincia de tucumán": "T",
    "chubut": "U",
    "provincia del chubut": "U",
    "tierra del fuego": "V",
    "provincia de tierra del fuego, antártida e islas del atlántico sur": "V",
    "corrientes": "W",
    "provincia de corrientes": "W",
    "córdoba": "X",
    "provincia de córdoba": "X",
    "jujuy": "Y",
    "provincia de jujuy": "Y",
    "santa cruz": "Z",
    "provincia de santa cruz": "Z",
}
def query_wikidata_p131_chain(qid: str) -> list:
    """Fetch the P131 (located-in-admin-entity) ancestor chain for *qid*.

    Queries the public Wikidata SPARQL endpoint with ``wdt:P131*`` (the
    transitive closure, which includes the item itself) and returns a list
    of ``(qid, label)`` tuples.  Any failure — network, timeout, bad JSON —
    is reported to stdout and yields an empty list.
    """
    import urllib.request
    import urllib.parse

    sparql = f"""
    SELECT ?item ?itemLabel WHERE {{
    wd:{qid} wdt:P131* ?item .
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en". }}
    }}
    """
    endpoint = "https://query.wikidata.org/sparql"
    query_string = urllib.parse.urlencode({'query': sparql, 'format': 'json'})
    try:
        request = urllib.request.Request(
            f"{endpoint}?{query_string}",
            headers={'User-Agent': 'GLAM-Ontology-Bot/1.0'},
        )
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode())
        chain = []
        for binding in payload.get('results', {}).get('bindings', []):
            item_uri = binding.get('item', {}).get('value', '')
            item_label = binding.get('itemLabel', {}).get('value', '')
            if item_uri:
                # Entity URIs end in the QID: http://www.wikidata.org/entity/Qnnn
                chain.append((item_uri.split('/')[-1], item_label))
        return chain
    except Exception as e:
        print(f" ⚠️ Error querying {qid}: {e}")
        return []
def find_province_from_chain(chain: list) -> tuple:
    """Extract province and city from a P131 chain of ``(qid, label)`` pairs.

    Returns ``(iso_code, province_name, city_qid, city_name)``; each element
    is ``None`` when it cannot be determined.  The province is matched first
    by QID against WIKIDATA_PROVINCE_QIDS, then by lowercased label against
    PROVINCE_NAMES_TO_ISO.
    """
    province_code = province_name = None
    for entity_qid, entity_label in chain:
        # Exact QID match takes priority over label matching.
        if entity_qid in WIKIDATA_PROVINCE_QIDS:
            province_code = WIKIDATA_PROVINCE_QIDS[entity_qid]
            province_name = entity_label
            break
        key = entity_label.lower().strip()
        if key in PROVINCE_NAMES_TO_ISO:
            province_code = PROVINCE_NAMES_TO_ISO[key]
            province_name = entity_label
            break

    # City: first chain entry past position 0 (assumed to be the queried
    # item itself) that is neither a province nor the country.
    city_qid = city_name = None
    for entity_qid, entity_label in chain[1:]:
        if entity_qid in WIKIDATA_PROVINCE_QIDS:
            continue
        if 'argentina' in entity_label.lower():
            continue
        city_qid, city_name = entity_qid, entity_label
        break

    return province_code, province_name, city_qid, city_name
def normalize_to_ascii(text: str) -> str:
    """Strip diacritics: NFD-decompose, then drop combining marks ('Mn')."""
    return ''.join(
        char
        for char in unicodedata.normalize('NFD', text)
        if unicodedata.category(char) != 'Mn'
    )
def extract_city_code(city_name: str) -> str:
    """Derive a short (max 3 letters, uppercase) city code from a place name.

    Empty/None input yields the placeholder "XXX".  Single-word names use
    their first three letters; multi-word names use word initials.
    """
    if not city_name:
        return "XXX"

    name = normalize_to_ascii(city_name.strip())

    # Strip administrative prefixes; checked in order, so a later prefix can
    # still match after an earlier one has been removed.
    for prefix in ('Partido de ', 'Departamento ', 'Ciudad de ', 'Provincia de ', 'Partido '):
        if name.startswith(prefix):
            name = name[len(prefix):]

    words = [word for word in name.split() if word]
    if len(words) == 1:
        return name[:3].upper()

    # Multi-word: up to three initials, falling back to the first letters
    # of the whole name when no alphabetic initials exist.
    initials = ''.join(word[0] for word in words if word and word[0].isalpha())
    if initials:
        return initials[:3].upper()
    return name[:3].upper()
def generate_new_ghcid(old_ghcid: str, new_region: str, new_city_code: str) -> str:
    """Rewrite the region and city segments of a GHCID.

    'AR-XX-XXX-M-ABBREV' becomes 'AR-{region}-{city}-M-ABBREV'.  IDs with
    fewer than five '-'-separated segments are returned unchanged.
    """
    segments = old_ghcid.split('-')
    if len(segments) < 5:
        return old_ghcid
    segments[1] = new_region
    segments[2] = new_city_code
    return '-'.join(segments)
def update_yaml_file(filepath: Path, new_region: str, new_city_code: str, city_name: str, province_name: str):
    """Rewrite *filepath* with the resolved region/city and rename it to match.

    Updates the 'ghcid' and 'location' sections, writes the YAML back to the
    same path, then renames the file to '<new_ghcid>.yaml' (appending the
    Wikidata QID on a filename collision).

    Returns the Path the file now lives at (unchanged if no rename happened).

    Bug fix: the original created an empty data['ghcid'] when missing and then
    immediately indexed data['ghcid']['location_resolution'], which raised
    KeyError whenever either nested section was absent; setdefault() now
    creates the sections as needed.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
    new_ghcid = generate_new_ghcid(old_ghcid, new_region, new_city_code)

    # Update GHCID, creating nested sections if absent.
    ghcid = data.setdefault('ghcid', {})
    ghcid['ghcid_current'] = new_ghcid
    resolution = ghcid.setdefault('location_resolution', {})
    resolution['region_code'] = new_region
    resolution['city_code'] = new_city_code
    if city_name:
        resolution['city_label'] = city_name
    resolution['province_name'] = province_name
    resolution['resolution_method'] = 'WIKIDATA_P131_CHAIN'
    resolution['resolution_date'] = datetime.now(timezone.utc).isoformat()

    # Mirror the resolution into the top-level location section, if present.
    if 'location' in data:
        data['location']['region_code'] = new_region
        if city_name:
            data['location']['city'] = city_name

    # Write updated data back to the ORIGINAL path, then rename.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    new_filepath = filepath.parent / f"{new_ghcid}.yaml"
    if new_filepath == filepath:
        return filepath
    if new_filepath.exists():
        # Collision with another institution's file - disambiguate with the QID.
        qid = data.get('wikidata_enrichment', {}).get('wikidata_entity_id', '')
        new_filepath = filepath.parent / f"{new_ghcid}-{qid}.yaml"
    shutil.move(filepath, new_filepath)
    return new_filepath
def process_ar_xx_files(dry_run: bool = False, base_dir: Path = Path("/Users/kempersc/apps/glam")):
    """Resolve the region of every AR-XX-*.yaml custodian file.

    For each file: read its Wikidata QID, fetch the P131 chain, derive the
    ISO 3166-2:AR province code and a city code, and (unless *dry_run*)
    rewrite and rename the file via update_yaml_file().

    Parameters:
        dry_run: when True, print what would happen but modify nothing.
        base_dir: repository root (previously hard-coded; now a defaulted
            parameter so the script can run against other checkouts).

    Returns (resolved, unresolved): resolved is a list of dicts describing
    each successful rename; unresolved is a list of (filepath, name, reason).
    """
    custodian_dir = base_dir / "data" / "custodian"
    ar_xx_files = list(custodian_dir.glob("AR-XX-*.yaml"))
    print(f"Found {len(ar_xx_files)} AR-XX-* files to process\n")

    resolved = []
    unresolved = []
    for filepath in sorted(ar_xx_files):
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        qid = data.get('wikidata_enrichment', {}).get('wikidata_entity_id')
        name = data.get('custodian_name', {}).get('claim_value', filepath.stem)
        old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
        print(f"Processing: {name}")
        print(f" QID: {qid}")
        print(f" Old GHCID: {old_ghcid}")

        if not qid:
            print(" ❌ No Wikidata ID - cannot resolve")
            unresolved.append((filepath, name, "No Wikidata ID"))
            continue

        chain = query_wikidata_p131_chain(qid)
        time.sleep(0.5)  # Rate limit: be polite to the WDQS endpoint
        if not chain:
            print(" ❌ No P131 chain found")
            unresolved.append((filepath, name, "No P131 chain"))
            continue

        province_code, province_name, city_qid, city_name = find_province_from_chain(chain)
        if province_code:
            # Keep the existing city code when the chain yields no city name.
            existing_city_code = data.get('ghcid', {}).get('location_resolution', {}).get('city_code', 'XXX')
            new_city_code = extract_city_code(city_name) if city_name else existing_city_code
            new_ghcid = generate_new_ghcid(old_ghcid, province_code, new_city_code)
            print(f" ✅ Province: {province_name} → AR-{province_code}")
            # Bug fix: the original f-string ran the two values together
            # ("{city_name}{new_city_code}"); add the arrow separator.
            print(f" 📍 City: {city_name} → {new_city_code}")
            print(f" 🆕 New GHCID: {new_ghcid}")
            if not dry_run:
                new_path = update_yaml_file(filepath, province_code, new_city_code, city_name, province_name)
                print(f" 📁 Renamed to: {new_path.name}")
            resolved.append({
                'old_filepath': filepath,
                'name': name,
                'qid': qid,
                'old_ghcid': old_ghcid,
                'new_ghcid': new_ghcid,
                'province_code': province_code,
                'province_name': province_name,
                'city_name': city_name,
                'city_code': new_city_code,
            })
        else:
            chain_labels = [l for _, l in chain[:5]]
            print(f" ❌ Could not determine province from chain: {chain_labels}")
            unresolved.append((filepath, name, f"Chain: {chain_labels}"))
        print()

    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"✅ Resolved: {len(resolved)}")
    print(f"❌ Unresolved: {len(unresolved)}")
    if unresolved:
        print("\n❌ Unresolved institutions (need manual research):")
        for filepath, name, reason in unresolved:
            print(f"{name}")
            print(f" File: {filepath.name}")
            print(f" Reason: {reason}")
    return resolved, unresolved
if __name__ == "__main__":
    import sys

    # "--dry-run" anywhere on the command line disables all file writes.
    wants_dry_run = "--dry-run" in sys.argv
    if wants_dry_run:
        print("🔍 DRY RUN - no files will be modified\n")
    resolved, unresolved = process_ar_xx_files(dry_run=wants_dry_run)