#!/usr/bin/env python3
"""
Resolve AR-XX-* files by querying Wikidata P131 chain to find province.
Uses ISO 3166-2:AR codes for standardization.
"""
import json
import os
import re
import shutil
import time
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

# NOTE: the third-party PyYAML dependency is imported lazily inside the two
# functions that actually need it (update_yaml_file, process_ar_xx_files),
# matching this file's existing local-import style (urllib, sys). The pure
# helpers below remain importable without PyYAML installed.

# Wikidata province QIDs to ISO 3166-2:AR codes
# Source: https://en.wikipedia.org/wiki/ISO_3166-2:AR
WIKIDATA_PROVINCE_QIDS = {
    "Q44754": "A",  # Salta
    "Q44705": "B",  # Buenos Aires Province
    "Q1486": "C",   # Buenos Aires (city / CABA)
    "Q44926": "D",  # San Luis
    "Q44762": "E",  # Entre Ríos
    "Q44821": "F",  # La Rioja
    "Q44690": "G",  # Santiago del Estero
    "Q44770": "H",  # Chaco
    "Q44915": "J",  # San Juan
    "Q44838": "K",  # Catamarca
    "Q44789": "L",  # La Pampa
    "Q44937": "M",  # Mendoza
    "Q44745": "N",  # Misiones
    "Q44797": "P",  # Formosa
    "Q44859": "Q",  # Neuquén
    "Q44869": "R",  # Río Negro
    "Q44713": "S",  # Santa Fe
    "Q44881": "T",  # Tucumán
    "Q44895": "U",  # Chubut
    "Q44908": "V",  # Tierra del Fuego
    "Q44729": "W",  # Corrientes
    "Q44778": "X",  # Córdoba
    "Q45034": "Y",  # Jujuy
    "Q44922": "Z",  # Santa Cruz
}

# Province names to ISO codes (for label matching)
PROVINCE_NAMES_TO_ISO = {
    "salta": "A",
    "provincia de salta": "A",
    "buenos aires": "B",  # Province (not city)
    "provincia de buenos aires": "B",
    "ciudad autónoma de buenos aires": "C",
    "ciudad de buenos aires": "C",
    "caba": "C",
    "san luis": "D",
    "provincia de san luis": "D",
    "entre ríos": "E",
    "provincia de entre ríos": "E",
    "la rioja": "F",
    "provincia de la rioja": "F",
    "santiago del estero": "G",
    "provincia de santiago del estero": "G",
    "chaco": "H",
    "provincia del chaco": "H",
    "san juan": "J",
    "provincia de san juan": "J",
    "catamarca": "K",
    "provincia de catamarca": "K",
    "la pampa": "L",
    "provincia de la pampa": "L",
    "mendoza": "M",
    "provincia de mendoza": "M",
    "misiones": "N",
    "provincia de misiones": "N",
    "formosa": "P",
    "provincia de formosa": "P",
    "neuquén": "Q",
    "provincia del neuquén": "Q",
    "río negro": "R",
    "provincia de río negro": "R",
    "santa fe": "S",
    "provincia de santa fe": "S",
    "tucumán": "T",
    "provincia de tucumán": "T",
    "chubut": "U",
    "provincia del chubut": "U",
    "tierra del fuego": "V",
    "provincia de tierra del fuego, antártida e islas del atlántico sur": "V",
    "corrientes": "W",
    "provincia de corrientes": "W",
    "córdoba": "X",
    "provincia de córdoba": "X",
    "jujuy": "Y",
    "provincia de jujuy": "Y",
    "santa cruz": "Z",
    "provincia de santa cruz": "Z",
}


def query_wikidata_p131_chain(qid: str) -> list:
    """Query Wikidata for the P131 chain (located in administrative entity).

    Args:
        qid: Wikidata entity id of the institution (e.g. "Q12345").

    Returns:
        List of (qid, label) tuples for every entity in the transitive P131
        chain, including the item itself; empty list on any network/parse
        error (errors are printed, not raised).
    """
    import urllib.request
    import urllib.parse

    # wdt:P131* is transitive: item -> city -> department -> province -> country
    sparql = f"""
    SELECT ?item ?itemLabel WHERE {{
      wd:{qid} wdt:P131* ?item .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en". }}
    }}
    """
    url = "https://query.wikidata.org/sparql"
    params = urllib.parse.urlencode({'query': sparql, 'format': 'json'})
    try:
        # WDQS requires a descriptive User-Agent; anonymous requests get 403s.
        req = urllib.request.Request(
            f"{url}?{params}",
            headers={'User-Agent': 'GLAM-Ontology-Bot/1.0'}
        )
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode())
        results = []
        for binding in data.get('results', {}).get('bindings', []):
            item_uri = binding.get('item', {}).get('value', '')
            # URI looks like http://www.wikidata.org/entity/Q44705 -> "Q44705"
            item_qid = item_uri.split('/')[-1] if item_uri else None
            item_label = binding.get('itemLabel', {}).get('value', '')
            if item_qid:
                results.append((item_qid, item_label))
        return results
    except Exception as e:
        # Best-effort: a failed lookup becomes "unresolved", not a crash.
        print(f" ⚠️ Error querying {qid}: {e}")
        return []


def find_province_from_chain(chain: list) -> tuple:
    """Find the province from a P131 chain.

    Args:
        chain: list of (qid, label) tuples as returned by
            query_wikidata_p131_chain().

    Returns:
        (iso_code, province_name, city_qid, city_name); any element may be
        None if it could not be determined.
    """
    province_code = None
    province_name = None
    city_qid = None
    city_name = None

    for qid, label in chain:
        # Check if this QID is a known province
        if qid in WIKIDATA_PROVINCE_QIDS:
            province_code = WIKIDATA_PROVINCE_QIDS[qid]
            province_name = label
            break
        # Check if label matches a known province name
        label_lower = label.lower().strip()
        if label_lower in PROVINCE_NAMES_TO_ISO:
            province_code = PROVINCE_NAMES_TO_ISO[label_lower]
            province_name = label
            break

    # Find city: first entity in the chain that's not the item itself,
    # not a province, and not Argentina itself.
    # NOTE(review): this assumes the SPARQL results come back in chain order
    # with the queried item first; SPARQL does not guarantee result ordering
    # without ORDER BY — confirm against real query output.
    for i, (qid, label) in enumerate(chain):
        if i == 0:
            # Skip the item itself
            continue
        if qid in WIKIDATA_PROVINCE_QIDS:
            continue
        if 'argentina' in label.lower():
            continue
        # This should be a city/municipality/partido
        city_qid = qid
        city_name = label
        break

    return province_code, province_name, city_qid, city_name


def normalize_to_ascii(text: str) -> str:
    """Normalize unicode to ASCII by stripping combining marks (é -> e)."""
    normalized = unicodedata.normalize('NFD', text)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')


def extract_city_code(city_name: str) -> str:
    """Generate a short city code from a name.

    Single-word names yield their first 3 letters; multi-word names yield
    up to 3 initials (which may be fewer than 3 characters for 2-word
    names). Empty/None input yields the placeholder "XXX".
    """
    if not city_name:
        return "XXX"

    # Clean the name
    name = normalize_to_ascii(city_name.strip())

    # Remove common administrative prefixes before deriving the code
    prefixes = ['Partido de ', 'Departamento ', 'Ciudad de ', 'Provincia de ', 'Partido ']
    for prefix in prefixes:
        if name.startswith(prefix):
            name = name[len(prefix):]

    # Single word: first 3 letters
    words = [w for w in name.split() if w]
    if len(words) == 1:
        return name[:3].upper()

    # Multi-word: initials (up to 3)
    initials = ''.join(w[0] for w in words if w and w[0].isalpha())
    return initials[:3].upper() if initials else name[:3].upper()


def generate_new_ghcid(old_ghcid: str, new_region: str, new_city_code: str) -> str:
    """Generate a new GHCID with resolved region and city.

    Expected shape: AR-XX-XXX-M-ABBREV -> AR-{region}-{city}-M-ABBREV.
    Identifiers with fewer than 5 segments are returned unchanged.
    """
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        parts[1] = new_region
        parts[2] = new_city_code
        return '-'.join(parts)
    return old_ghcid


def update_yaml_file(filepath: Path, new_region: str, new_city_code: str,
                     city_name: str, province_name: str):
    """Update the YAML file with the resolved region and rename it.

    Rewrites the ghcid/location sections in place, then renames the file to
    match the new GHCID (appending the Wikidata QID on filename collision).

    Returns:
        The Path the file now lives at (may equal the input path).
    """
    import yaml

    with open(filepath, 'r', encoding='utf-8') as f:
        # BUG FIX: safe_load returns None for an empty file; default to {}.
        data = yaml.safe_load(f) or {}

    old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
    new_ghcid = generate_new_ghcid(old_ghcid, new_region, new_city_code)

    # Update GHCID
    if 'ghcid' not in data:
        data['ghcid'] = {}
    data['ghcid']['ghcid_current'] = new_ghcid
    # BUG FIX: the original indexed data['ghcid']['location_resolution'][...]
    # directly and raised KeyError when the sub-dict was missing.
    loc_res = data['ghcid'].setdefault('location_resolution', {})
    loc_res['region_code'] = new_region
    loc_res['city_code'] = new_city_code
    if city_name:
        loc_res['city_label'] = city_name
    loc_res['province_name'] = province_name
    loc_res['resolution_method'] = 'WIKIDATA_P131_CHAIN'
    loc_res['resolution_date'] = datetime.now(timezone.utc).isoformat()

    # Update location
    if 'location' in data:
        data['location']['region_code'] = new_region
        if city_name:
            data['location']['city'] = city_name

    # Calculate new filename
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Write updated data
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Rename file if GHCID changed
    if new_filepath != filepath and not new_filepath.exists():
        shutil.move(filepath, new_filepath)
        return new_filepath
    elif new_filepath.exists() and new_filepath != filepath:
        # Collision - add QID suffix to disambiguate
        qid = data.get('wikidata_enrichment', {}).get('wikidata_entity_id', '')
        new_filename = f"{new_ghcid}-{qid}.yaml"
        new_filepath = filepath.parent / new_filename
        shutil.move(filepath, new_filepath)
        return new_filepath
    return filepath


def process_ar_xx_files(dry_run: bool = False):
    """Process all AR-XX-* files and resolve regions via Wikidata.

    Args:
        dry_run: when True, report what would change without modifying files.

    Returns:
        (resolved, unresolved) — resolved is a list of dicts describing each
        successful resolution; unresolved is a list of
        (filepath, name, reason) tuples.
    """
    import yaml

    base_dir = Path("/Users/kempersc/apps/glam")
    custodian_dir = base_dir / "data" / "custodian"
    ar_xx_files = list(custodian_dir.glob("AR-XX-*.yaml"))
    print(f"Found {len(ar_xx_files)} AR-XX-* files to process\n")

    resolved = []
    unresolved = []

    for filepath in sorted(ar_xx_files):
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f) or {}

        qid = data.get('wikidata_enrichment', {}).get('wikidata_entity_id')
        name = data.get('custodian_name', {}).get('claim_value', filepath.stem)
        old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')

        print(f"Processing: {name}")
        print(f" QID: {qid}")
        print(f" Old GHCID: {old_ghcid}")

        if not qid:
            print(f" ❌ No Wikidata ID - cannot resolve")
            unresolved.append((filepath, name, "No Wikidata ID"))
            continue

        chain = query_wikidata_p131_chain(qid)
        time.sleep(0.5)  # Rate limit

        if not chain:
            print(f" ❌ No P131 chain found")
            unresolved.append((filepath, name, "No P131 chain"))
            continue

        province_code, province_name, city_qid, city_name = find_province_from_chain(chain)

        if province_code:
            # Use existing city code if we don't have a better one
            existing_city_code = data.get('ghcid', {}).get('location_resolution', {}).get('city_code', 'XXX')
            new_city_code = extract_city_code(city_name) if city_name else existing_city_code
            new_ghcid = generate_new_ghcid(old_ghcid, province_code, new_city_code)

            print(f" ✅ Province: {province_name} → AR-{province_code}")
            print(f" 📍 City: {city_name} → {new_city_code}")
            print(f" 🆕 New GHCID: {new_ghcid}")

            if not dry_run:
                new_path = update_yaml_file(filepath, province_code, new_city_code, city_name, province_name)
                print(f" 📁 Renamed to: {new_path.name}")

            resolved.append({
                'old_filepath': filepath,
                'name': name,
                'qid': qid,
                'old_ghcid': old_ghcid,
                'new_ghcid': new_ghcid,
                'province_code': province_code,
                'province_name': province_name,
                'city_name': city_name,
                'city_code': new_city_code,
            })
        else:
            chain_labels = [l for _, l in chain[:5]]
            print(f" ❌ Could not determine province from chain: {chain_labels}")
            unresolved.append((filepath, name, f"Chain: {chain_labels}"))
        print()

    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"✅ Resolved: {len(resolved)}")
    print(f"❌ Unresolved: {len(unresolved)}")

    if unresolved:
        print("\n❌ Unresolved institutions (need manual research):")
        for filepath, name, reason in unresolved:
            print(f" • {name}")
            print(f" File: {filepath.name}")
            print(f" Reason: {reason}")

    return resolved, unresolved


if __name__ == "__main__":
    import sys
    dry_run = "--dry-run" in sys.argv
    if dry_run:
        print("🔍 DRY RUN - no files will be modified\n")
    resolved, unresolved = process_ar_xx_files(dry_run=dry_run)