349 lines
12 KiB
Python
349 lines
12 KiB
Python
#!/usr/bin/env python3
"""
Resolve AR-XX-* files by querying the Wikidata P131 chain to find the province.

Uses ISO 3166-2:AR codes for standardization.
"""
|
|
|
|
import json
|
|
import yaml
|
|
import os
|
|
import re
|
|
import time
|
|
import shutil
|
|
import unicodedata
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Wikidata province QIDs to ISO 3166-2:AR codes
# Source: https://en.wikipedia.org/wiki/ISO_3166-2:AR
# NOTE: the single-letter ISO codes below run A-Z with I and O absent,
# matching the ISO 3166-2:AR assignment (24 jurisdictions total).
WIKIDATA_PROVINCE_QIDS = {
    "Q44754": "A",  # Salta
    "Q44705": "B",  # Buenos Aires Province
    "Q1486": "C",   # Buenos Aires (city / CABA)
    "Q44926": "D",  # San Luis
    "Q44762": "E",  # Entre Ríos
    "Q44821": "F",  # La Rioja
    "Q44690": "G",  # Santiago del Estero
    "Q44770": "H",  # Chaco
    "Q44915": "J",  # San Juan
    "Q44838": "K",  # Catamarca
    "Q44789": "L",  # La Pampa
    "Q44937": "M",  # Mendoza
    "Q44745": "N",  # Misiones
    "Q44797": "P",  # Formosa
    "Q44859": "Q",  # Neuquén
    "Q44869": "R",  # Río Negro
    "Q44713": "S",  # Santa Fe
    "Q44881": "T",  # Tucumán
    "Q44895": "U",  # Chubut
    "Q44908": "V",  # Tierra del Fuego
    "Q44729": "W",  # Corrientes
    "Q44778": "X",  # Córdoba
    "Q45034": "Y",  # Jujuy
    "Q44922": "Z",  # Santa Cruz
}
|
|
|
|
# Province names to ISO codes (for label matching)
# Keys are stored lowercase because lookups are performed on
# label.lower().strip() (see find_province_from_chain); both the short
# province name and the formal "Provincia de/del ..." form are listed.
PROVINCE_NAMES_TO_ISO = {
    "salta": "A",
    "provincia de salta": "A",
    "buenos aires": "B",  # Province (not city)
    "provincia de buenos aires": "B",
    "ciudad autónoma de buenos aires": "C",
    "ciudad de buenos aires": "C",
    "caba": "C",
    "san luis": "D",
    "provincia de san luis": "D",
    "entre ríos": "E",
    "provincia de entre ríos": "E",
    "la rioja": "F",
    "provincia de la rioja": "F",
    "santiago del estero": "G",
    "provincia de santiago del estero": "G",
    "chaco": "H",
    "provincia del chaco": "H",
    "san juan": "J",
    "provincia de san juan": "J",
    "catamarca": "K",
    "provincia de catamarca": "K",
    "la pampa": "L",
    "provincia de la pampa": "L",
    "mendoza": "M",
    "provincia de mendoza": "M",
    "misiones": "N",
    "provincia de misiones": "N",
    "formosa": "P",
    "provincia de formosa": "P",
    "neuquén": "Q",
    "provincia del neuquén": "Q",
    "río negro": "R",
    "provincia de río negro": "R",
    "santa fe": "S",
    "provincia de santa fe": "S",
    "tucumán": "T",
    "provincia de tucumán": "T",
    "chubut": "U",
    "provincia del chubut": "U",
    "tierra del fuego": "V",
    "provincia de tierra del fuego, antártida e islas del atlántico sur": "V",
    "corrientes": "W",
    "provincia de corrientes": "W",
    "córdoba": "X",
    "provincia de córdoba": "X",
    "jujuy": "Y",
    "provincia de jujuy": "Y",
    "santa cruz": "Z",
    "provincia de santa cruz": "Z",
}
|
|
|
|
def query_wikidata_p131_chain(qid: str) -> list:
    """Query Wikidata for P131 chain (located in administrative entity).

    Runs a SPARQL query following wdt:P131* transitively from *qid* and
    returns a list of (entity_qid, label) tuples. Best-effort: on any
    failure a warning is printed and an empty list is returned.
    """
    import urllib.request
    import urllib.parse

    sparql = f"""
    SELECT ?item ?itemLabel WHERE {{
      wd:{qid} wdt:P131* ?item .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en". }}
    }}
    """

    endpoint = "https://query.wikidata.org/sparql"
    query_string = urllib.parse.urlencode({'query': sparql, 'format': 'json'})

    try:
        request = urllib.request.Request(
            f"{endpoint}?{query_string}",
            headers={'User-Agent': 'GLAM-Ontology-Bot/1.0'}
        )
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode())

        pairs = []
        for binding in payload.get('results', {}).get('bindings', []):
            entity_uri = binding.get('item', {}).get('value', '')
            label = binding.get('itemLabel', {}).get('value', '')
            # Entity QID is the last path segment of the URI.
            entity_qid = entity_uri.rsplit('/', 1)[-1] if entity_uri else None
            if entity_qid:
                pairs.append((entity_qid, label))
        return pairs
    except Exception as e:
        print(f" ⚠️ Error querying {qid}: {e}")
        return []
|
|
|
|
def find_province_from_chain(chain: list) -> tuple:
    """Find the province from P131 chain. Returns (iso_code, province_name, city_qid, city_name)."""
    province_code = None
    province_name = None
    city_qid = None
    city_name = None

    # Province: first chain entry that matches either the QID table or
    # (after lowercasing/stripping) the label table.
    for entity_qid, entity_label in chain:
        matched = WIKIDATA_PROVINCE_QIDS.get(entity_qid)
        if matched is None:
            matched = PROVINCE_NAMES_TO_ISO.get(entity_label.lower().strip())
        if matched is not None:
            province_code = matched
            province_name = entity_label
            break

    # City: first entry past index 0 (the item itself) that is neither a
    # known province QID nor labelled with "argentina".
    for position, (entity_qid, entity_label) in enumerate(chain):
        if position == 0:
            continue
        if entity_qid in WIKIDATA_PROVINCE_QIDS:
            continue
        if 'argentina' in entity_label.lower():
            continue
        # Expected to be a city/municipality/partido.
        city_qid = entity_qid
        city_name = entity_label
        break

    return province_code, province_name, city_qid, city_name
|
|
|
|
def normalize_to_ascii(text: str) -> str:
    """Drop diacritics: NFD-decompose, then discard combining marks (Mn)."""
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
|
|
|
|
def extract_city_code(city_name: str) -> str:
    """Generate 3-letter city code from name.

    Empty/None input yields the placeholder "XXX". Single-word names use
    their first three letters; multi-word names use up to three initials.
    """
    if not city_name:
        return "XXX"

    # ASCII-fold the trimmed name (NFD decompose, drop combining marks).
    decomposed = unicodedata.normalize('NFD', city_name.strip())
    name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Strip common administrative prefixes (applied in order; each
    # matching prefix is removed once).
    for prefix in ('Partido de ', 'Departamento ', 'Ciudad de ', 'Provincia de ', 'Partido '):
        if name.startswith(prefix):
            name = name[len(prefix):]

    words = [w for w in name.split() if w]

    # Single word: first 3 letters.
    if len(words) == 1:
        return name[:3].upper()

    # Multi-word: initials (up to 3); fall back to a name prefix when no
    # word starts with a letter.
    initials = ''.join(w[0] for w in words if w and w[0].isalpha())
    return (initials or name)[:3].upper()
|
|
|
|
def generate_new_ghcid(old_ghcid: str, new_region: str, new_city_code: str) -> str:
    """Generate new GHCID with resolved region and city.

    Rewrites AR-XX-XXX-M-ABBREV to AR-{region}-{city}-M-ABBREV; any id
    with fewer than five dash-separated segments is returned unchanged.
    """
    segments = old_ghcid.split('-')
    if len(segments) < 5:
        return old_ghcid
    segments[1] = new_region
    segments[2] = new_city_code
    return '-'.join(segments)
|
|
|
|
def update_yaml_file(filepath: Path, new_region: str, new_city_code: str, city_name: str, province_name: str):
    """Update the YAML file with resolved region and rename.

    Rewrites the ghcid/location blocks in place, then renames the file to
    match the new GHCID (suffixing the Wikidata QID on a name collision).

    Returns the (possibly renamed) Path of the file.

    Fixes vs. the previous version:
    - `ghcid` / `location_resolution` sub-dicts are created with setdefault,
      so a file missing either key no longer raises KeyError (previously
      only `ghcid` was guarded, and `location_resolution` was indexed
      unconditionally).
    - Files are read/written with explicit UTF-8, matching the
      `allow_unicode=True` dump and the accented province names.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
    new_ghcid = generate_new_ghcid(old_ghcid, new_region, new_city_code)

    # Update GHCID
    ghcid = data.setdefault('ghcid', {})
    ghcid['ghcid_current'] = new_ghcid
    resolution = ghcid.setdefault('location_resolution', {})
    resolution['region_code'] = new_region
    resolution['city_code'] = new_city_code
    if city_name:
        resolution['city_label'] = city_name
    resolution['province_name'] = province_name
    resolution['resolution_method'] = 'WIKIDATA_P131_CHAIN'
    resolution['resolution_date'] = datetime.now(timezone.utc).isoformat()

    # Update location
    if 'location' in data:
        data['location']['region_code'] = new_region
        if city_name:
            data['location']['city'] = city_name

    # Calculate new filename
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"

    # Write updated data (before any rename, so a failed rename still
    # leaves the updated content on disk at the old path).
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Rename file if GHCID changed
    if new_filepath != filepath:
        if new_filepath.exists():
            # Collision - disambiguate with the Wikidata QID suffix
            qid = data.get('wikidata_enrichment', {}).get('wikidata_entity_id', '')
            new_filepath = filepath.parent / f"{new_ghcid}-{qid}.yaml"
        shutil.move(filepath, new_filepath)
        return new_filepath

    return filepath
|
|
|
|
def process_ar_xx_files(dry_run: bool = False, base_dir: Path = Path("/Users/kempersc/apps/glam")):
    """Process all AR-XX-* files and resolve regions.

    For each data/custodian/AR-XX-*.yaml file, query the Wikidata P131
    chain of its linked entity, derive the ISO province code and a city
    code, and (unless dry_run) rewrite and rename the file.

    Args:
        dry_run: When True, report what would change without modifying files.
        base_dir: Repository root containing data/custodian. Parameterized
            (previously hard-coded) so the script is not tied to one
            machine's path; the default preserves the old behavior.

    Returns:
        (resolved, unresolved) where resolved is a list of dicts describing
        each successful resolution and unresolved is a list of
        (filepath, name, reason) tuples.
    """
    custodian_dir = base_dir / "data" / "custodian"

    ar_xx_files = list(custodian_dir.glob("AR-XX-*.yaml"))
    print(f"Found {len(ar_xx_files)} AR-XX-* files to process\n")

    resolved = []
    unresolved = []

    for filepath in sorted(ar_xx_files):
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        qid = data.get('wikidata_enrichment', {}).get('wikidata_entity_id')
        name = data.get('custodian_name', {}).get('claim_value', filepath.stem)
        old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')

        print(f"Processing: {name}")
        print(f" QID: {qid}")
        print(f" Old GHCID: {old_ghcid}")

        if not qid:
            print(" ❌ No Wikidata ID - cannot resolve")
            unresolved.append((filepath, name, "No Wikidata ID"))
            continue

        chain = query_wikidata_p131_chain(qid)
        time.sleep(0.5)  # Rate limit for the public SPARQL endpoint

        if not chain:
            print(" ❌ No P131 chain found")
            unresolved.append((filepath, name, "No P131 chain"))
            continue

        province_code, province_name, city_qid, city_name = find_province_from_chain(chain)

        if not province_code:
            chain_labels = [l for _, l in chain[:5]]
            print(f" ❌ Could not determine province from chain: {chain_labels}")
            unresolved.append((filepath, name, f"Chain: {chain_labels}"))
            print()
            continue

        # Use existing city code if we don't have a better one
        existing_city_code = data.get('ghcid', {}).get('location_resolution', {}).get('city_code', 'XXX')
        new_city_code = extract_city_code(city_name) if city_name else existing_city_code

        new_ghcid = generate_new_ghcid(old_ghcid, province_code, new_city_code)

        print(f" ✅ Province: {province_name} → AR-{province_code}")
        print(f" 📍 City: {city_name} → {new_city_code}")
        print(f" 🆕 New GHCID: {new_ghcid}")

        if not dry_run:
            new_path = update_yaml_file(filepath, province_code, new_city_code, city_name, province_name)
            print(f" 📁 Renamed to: {new_path.name}")

        resolved.append({
            'old_filepath': filepath,
            'name': name,
            'qid': qid,
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'province_code': province_code,
            'province_name': province_name,
            'city_name': city_name,
            'city_code': new_city_code,
        })
        print()

    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"✅ Resolved: {len(resolved)}")
    print(f"❌ Unresolved: {len(unresolved)}")

    if unresolved:
        print("\n❌ Unresolved institutions (need manual research):")
        for filepath, name, reason in unresolved:
            print(f" • {name}")
            print(f" File: {filepath.name}")
            print(f" Reason: {reason}")

    return resolved, unresolved
|
|
|
|
if __name__ == "__main__":
    import sys

    # "--dry-run" anywhere on the command line enables preview mode.
    wants_dry_run = "--dry-run" in sys.argv
    if wants_dry_run:
        print("🔍 DRY RUN - no files will be modified\n")
    resolved, unresolved = process_ar_xx_files(dry_run=wants_dry_run)
|