#!/usr/bin/env python3
"""
Import Argentina heritage institutions from Wikidata into custodian YAML files.

Queries Wikidata for museums, archives, and galleries in Argentina, filters
out institutions that already exist in custodian files, and creates new YAML
files with complete GHCID metadata.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
Country: Argentina (AR)
Source: Wikidata SPARQL queries

Usage:
    python scripts/import_argentina_wikidata_institutions.py [--dry-run]
"""

import argparse
import json
import re
import sqlite3
import sys
import time
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import requests
import yaml

# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import GHCIDComponents

# Constants
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Argentina-Wikidata-Import/1.0 (https://github.com/glam-project)"
BASE_DIR = Path(__file__).parent.parent
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
GEONAMES_DB = BASE_DIR / "data" / "reference" / "geonames.db"
ISO_3166_2_AR = BASE_DIR / "data" / "reference" / "iso_3166_2_ar.json"

# Argentina ISO 3166-2 region codes (common name variants -> one-letter code)
AR_REGION_CODES = {
    "Salta": "A",
    "Buenos Aires": "B",
    "Buenos Aires Province": "B",
    "Provincia de Buenos Aires": "B",
    "Ciudad Autónoma de Buenos Aires": "C",
    "Ciudad de Buenos Aires": "C",
    "Autonomous City of Buenos Aires": "C",
    "Capital Federal": "C",
    "CABA": "C",
    "San Luis": "D",
    "Entre Ríos": "E",
    "Entre Rios": "E",
    "La Rioja": "F",
    "Santiago del Estero": "G",
    "Chaco": "H",
    "San Juan": "J",
    "Catamarca": "K",
    "La Pampa": "L",
    "Mendoza": "M",
    "Misiones": "N",
    "Formosa": "P",
    "Neuquén": "Q",
    "Neuquen": "Q",
    "Río Negro": "R",
    "Rio Negro": "R",
    "Santa Fe": "S",
    "Tucumán": "T",
    "Tucuman": "T",
    "Chubut": "U",
    "Tierra del Fuego": "V",
    "Corrientes": "W",
    "Córdoba": "X",
    "Cordoba": "X",
    "Jujuy": "Y",
    "Santa Cruz": "Z",
}

# GeoNames admin1 code to ISO 3166-2 mapping for Argentina
GEONAMES_ADMIN1_TO_ISO = {
    "01": "B",  # Buenos Aires Province
    "02": "K",  # Catamarca
    "03": "H",  # Chaco
    "04": "U",  # Chubut
    "05": "X",  # Córdoba
    "06": "W",  # Corrientes
    "07": "C",  # Ciudad de Buenos Aires (CABA)
    "08": "E",  # Entre Ríos
    "09": "P",  # Formosa
    "10": "Y",  # Jujuy
    "11": "L",  # La Pampa
    "12": "F",  # La Rioja
    "13": "M",  # Mendoza
    "14": "N",  # Misiones
    "15": "Q",  # Neuquén
    "16": "R",  # Río Negro
    "17": "A",  # Salta
    "18": "J",  # San Juan
    "19": "D",  # San Luis
    "20": "Z",  # Santa Cruz
    "21": "S",  # Santa Fe
    "22": "G",  # Santiago del Estero
    "23": "V",  # Tierra del Fuego
    "24": "T",  # Tucumán
}


def normalize_to_ascii(text: str) -> str:
    """Normalize text to ASCII, removing diacritics."""
    # NFD decomposition separates base characters from combining marks
    normalized = unicodedata.normalize("NFD", text)
    # Remove combining marks (category 'Mn' = Mark, Nonspacing)
    ascii_text = "".join(c for c in normalized if unicodedata.category(c) != "Mn")
    return ascii_text


def generate_city_code(city_name: str) -> str:
    """
    Generate 3-letter city code from city name.

    Rules:
    - Single word: first 3 letters
    - Multi-word: first letter of each word (up to 3)
    - Spanish articles/prepositions (la, el, los, las, de, del) are skipped
      and the code is built from the remaining word(s)
    """
    if not city_name:
        return "XXX"

    # Normalize to ASCII so diacritics don't leak into the code
    city_ascii = normalize_to_ascii(city_name)
    words = city_ascii.split()

    if len(words) == 1:
        # Single word: first 3 letters
        return words[0][:3].upper()

    # Check for Spanish articles (la, el, los, las) and prepositions
    spanish_articles = {"la", "el", "los", "las", "de", "del"}
    if words[0].lower() in spanish_articles:
        # Skip article(s), use main word(s)
        remaining = [w for w in words if w.lower() not in spanish_articles]
        if remaining:
            if len(remaining) == 1:
                return remaining[0][:3].upper()
            # Initials of remaining words
            return "".join(w[0] for w in remaining[:3]).upper()

    # Multi-word: initials
    return "".join(w[0] for w in words[:3]).upper()


def extract_abbreviation_from_name(name: str) -> str:
    """
    Generate institution abbreviation from emic name.

    Takes the first letter of each significant word (skipping Spanish
    articles, prepositions, conjunctions). Maximum 10 characters.
    """
    if not name:
        return "UNK"

    # Spanish skip words (articles, prepositions, conjunctions)
    skip_words = {
        "el", "la", "los", "las", "un", "una", "unos", "unas",
        "de", "del", "a", "al", "en", "con", "por", "para",
        "sobre", "bajo", "y", "o", "e", "u",
    }

    # Normalize to ASCII, then drop everything except letters/digits/spaces
    name_ascii = normalize_to_ascii(name)
    name_clean = re.sub(r"[^a-zA-Z0-9\s]", "", name_ascii)
    words = name_clean.split()

    # Filter skip words
    significant_words = [w for w in words if w.lower() not in skip_words and len(w) > 0]
    if not significant_words:
        # Fallback: use all words (name consisted only of skip words)
        significant_words = words
    if not significant_words:
        return "UNK"

    # Take first letter of each significant word, capped at 10 characters
    abbrev = "".join(w[0].upper() for w in significant_words)
    return abbrev[:10]


def query_wikidata_museums() -> list[dict]:
    """Query Wikidata for museums in Argentina."""
    query = """
    SELECT DISTINCT ?item ?itemLabel ?coords ?cityLabel ?websiteUrl WHERE {
      ?item wdt:P31/wdt:P279* wd:Q33506 .  # instance of museum (or subclass)
      ?item wdt:P17 wd:Q414 .              # country: Argentina
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item wdt:P856 ?websiteUrl . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    ORDER BY ?itemLabel
    """
    return _execute_sparql(query)


def query_wikidata_archives() -> list[dict]:
    """Query Wikidata for archives in Argentina."""
    query = """
    SELECT DISTINCT ?item ?itemLabel ?coords ?cityLabel ?websiteUrl WHERE {
      ?item wdt:P31/wdt:P279* wd:Q166118 .  # instance of archive (or subclass)
      ?item wdt:P17 wd:Q414 .               # country: Argentina
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item wdt:P856 ?websiteUrl . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    ORDER BY ?itemLabel
    """
    return _execute_sparql(query)


def query_wikidata_galleries() -> list[dict]:
    """Query Wikidata for art galleries in Argentina."""
    query = """
    SELECT DISTINCT ?item ?itemLabel ?coords ?cityLabel ?websiteUrl WHERE {
      ?item wdt:P31/wdt:P279* wd:Q1007870 .  # instance of art gallery (or subclass)
      ?item wdt:P17 wd:Q414 .                # country: Argentina
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item wdt:P856 ?websiteUrl . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    ORDER BY ?itemLabel
    """
    return _execute_sparql(query)


def _execute_sparql(query: str) -> list[dict]:
    """Execute a SPARQL query and return the parsed result bindings.

    Best-effort: network or JSON-decode failures are reported and an empty
    list is returned so one failed query doesn't abort the whole import.
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json",
    }
    params = {"query": query, "format": "json"}

    time.sleep(1.0)  # Rate limiting: be polite to the public WDQS endpoint

    try:
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()
        return data.get("results", {}).get("bindings", [])
    # RequestException covers connection/timeout/HTTP errors; ValueError
    # covers a non-JSON response body.
    except (requests.RequestException, ValueError) as e:
        print(f" ❌ SPARQL query failed: {e}")
        return []


def parse_wikidata_result(binding: dict, institution_type: str) -> Optional[dict]:
    """Parse a Wikidata SPARQL result binding into a normalized dict.

    Returns None for unusable bindings (missing/invalid QID, or no label
    beyond the QID itself).
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    label = binding.get("itemLabel", {}).get("value", "")
    if not label or label == qid:
        # Skip if label is just the QID (no label found)
        return None

    result = {
        "qid": qid,
        "name": label,
        "institution_type": institution_type,
    }

    # Parse coordinates from WKT "Point(lon lat)" literal
    coords_str = binding.get("coords", {}).get("value", "")
    if coords_str and coords_str.startswith("Point("):
        try:
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)
        except (ValueError, IndexError):
            pass

    # Parse city
    city = binding.get("cityLabel", {}).get("value", "")
    if city and not city.startswith("Q"):  # Skip if city label is QID
        result["city"] = city

    # Parse website
    website = binding.get("websiteUrl", {}).get("value", "")
    if website:
        result["website"] = website

    return result


def get_existing_qids() -> set[str]:
    """Get set of Wikidata QIDs already in Argentina custodian files."""
    qids = set()
    for filepath in CUSTODIAN_DIR.glob("AR-*.yaml"):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
            # safe_load returns None for an empty file; skip non-dict content
            if not isinstance(data, dict):
                continue
            # Check wikidata_enrichment section
            wd_id = data.get("wikidata_enrichment", {}).get("wikidata_entity_id")
            if wd_id:
                qids.add(wd_id)
            # Check original_entry section
            wd_id = data.get("original_entry", {}).get("wikidata_id")
            if wd_id:
                qids.add(wd_id)
        except Exception:
            # Best-effort scan: an unreadable/malformed file is skipped
            continue
    return qids


def reverse_geocode_to_region(lat: float, lon: float) -> Optional[tuple[str, str, str]]:
    """
    Reverse geocode coordinates to find region code and city.

    Uses a local GeoNames SQLite database; nearest-city search by squared
    lat/lon difference (approximate, no spherical correction).

    Returns:
        (region_code, city_name, city_code) or None
    """
    if not GEONAMES_DB.exists():
        return None

    try:
        conn = sqlite3.connect(GEONAMES_DB)
        try:
            cursor = conn.cursor()
            # Find nearest city with proper feature codes (not neighborhoods)
            cursor.execute("""
                SELECT name, ascii_name, admin1_code, admin1_name,
                       latitude, longitude,
                       ((latitude - ?) * (latitude - ?) +
                        (longitude - ?) * (longitude - ?)) as distance_sq
                FROM cities
                WHERE country_code = 'AR'
                  AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3',
                                       'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                ORDER BY distance_sq
                LIMIT 1
            """, (lat, lat, lon, lon))
            row = cursor.fetchone()
        finally:
            # Close even if the query raises (previously leaked on error)
            conn.close()

        if row:
            city_name = row[0]
            ascii_name = row[1]
            admin1_code = row[2]
            # Map GeoNames admin1 to ISO 3166-2
            region_code = GEONAMES_ADMIN1_TO_ISO.get(admin1_code, "XX")
            city_code = generate_city_code(ascii_name or city_name)
            return (region_code, city_name, city_code)
    except Exception as e:
        print(f" ⚠️ GeoNames lookup failed: {e}")

    return None


def city_label_to_region(city_label: str) -> Optional[str]:
    """Try to map a city label to a one-letter Argentina region code."""
    if not city_label:
        return None

    # Direct match in region codes (exact first, then substring)
    for name, code in AR_REGION_CODES.items():
        if name.lower() == city_label.lower():
            return code
        if name.lower() in city_label.lower():
            return code

    # Known city to region mappings
    city_to_region = {
        "la plata": "B",
        "mar del plata": "B",
        "bahía blanca": "B",
        "bahia blanca": "B",
        "rosario": "S",
        "ushuaia": "V",
        "resistencia": "H",
        "posadas": "N",
        "paraná": "E",
        "parana": "E",
        "san salvador de jujuy": "Y",
        "san miguel de tucumán": "T",
        "san miguel de tucuman": "T",
    }
    city_lower = city_label.lower()
    for city, region in city_to_region.items():
        if city in city_lower:
            return region

    return None


def create_custodian_yaml(inst: dict, dry_run: bool = False) -> Optional[Path]:
    """Create a custodian YAML file for an institution.

    Args:
        inst: normalized institution dict from parse_wikidata_result().
        dry_run: if True, report the target filename without writing.

    Returns:
        Path of the (would-be) created file, or None on GHCID failure.
    """
    qid = inst["qid"]
    name = inst["name"]
    inst_type = inst["institution_type"]

    # Determine location
    region_code = "XX"
    city_code = "XXX"
    city_name = inst.get("city", "")

    # Try reverse geocoding first (most accurate)
    if "latitude" in inst and "longitude" in inst:
        geo_result = reverse_geocode_to_region(inst["latitude"], inst["longitude"])
        if geo_result:
            region_code, city_name, city_code = geo_result

    # Fallback: try city label
    if region_code == "XX" and city_name:
        region = city_label_to_region(city_name)
        if region:
            region_code = region
            city_code = generate_city_code(city_name)

    # Generate abbreviation
    abbreviation = extract_abbreviation_from_name(name)

    # Create GHCID components
    try:
        components = GHCIDComponents(
            country_code="AR",
            region_code=region_code,
            city_locode=city_code,
            institution_type=inst_type,
            abbreviation=abbreviation,
        )
        ghcid_current = components.to_string()
        ghcid_uuid = str(components.to_uuid())
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())
        ghcid_numeric = components.to_numeric()
    except Exception as e:
        print(f" ❌ GHCID generation failed for {name}: {e}")
        return None

    # Check for collision
    filename = f"{ghcid_current}.yaml"
    filepath = CUSTODIAN_DIR / filename
    if filepath.exists():
        # Collision - disambiguate by appending the Wikidata QID digits
        components.wikidata_qid = qid.replace("Q", "")
        ghcid_current = components.to_string()
        ghcid_uuid = str(components.to_uuid())
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())
        ghcid_numeric = components.to_numeric()
        filename = f"{ghcid_current}.yaml"
        filepath = CUSTODIAN_DIR / filename

    timestamp = datetime.now(timezone.utc).isoformat()

    # Build YAML structure
    data = {
        "original_entry": {
            "name": name,
            "source": "Wikidata SPARQL import",
            "wikidata_id": qid,
        },
        "processing_timestamp": timestamp,
        "ghcid": {
            "ghcid_current": ghcid_current,
            "ghcid_uuid": ghcid_uuid,
            "ghcid_uuid_sha256": ghcid_uuid_sha256,
            "ghcid_numeric": ghcid_numeric,
            "record_id": str(uuid.uuid4()),
            "generation_timestamp": timestamp,
            "location_resolution": {
                "method": "WIKIDATA_IMPORT",
                "country_code": "AR",
                "region_code": region_code,
                "city_code": city_code,
                "city_label": city_name or None,
            },
        },
        "custodian_name": {
            "claim_type": "custodian_name",
            "claim_value": name,
            "source_type": "wikidata",
            "emic_name": name,
            "name_language": "es",
        },
        "institution_type": {
            "M": "MUSEUM",
            "A": "ARCHIVE",
            "G": "GALLERY",
            "L": "LIBRARY",
        }.get(inst_type, "UNKNOWN"),
        "location": {
            "country": "AR",
            "region_code": region_code,
        },
        "wikidata_enrichment": {
            "wikidata_entity_id": qid,
            "enrichment_date": timestamp,
            "source": "Wikidata SPARQL import",
        },
    }

    # Add optional fields
    if city_name:
        data["location"]["city"] = city_name
    if "latitude" in inst and "longitude" in inst:
        data["location"]["latitude"] = inst["latitude"]
        data["location"]["longitude"] = inst["longitude"]
    if "website" in inst:
        data["website"] = inst["website"]

    if dry_run:
        # Fixed: previously printed a literal placeholder instead of the name
        print(f" [DRY RUN] Would create: {filename}")
        return filepath

    # Write YAML file
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    return filepath


def main():
    parser = argparse.ArgumentParser(description="Import Argentina institutions from Wikidata")
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't create files, just report what would be done")
    args = parser.parse_args()

    print("=" * 80)
    print("ARGENTINA WIKIDATA INSTITUTION IMPORT")
    print("=" * 80)
    print()

    # Get existing QIDs
    print("📂 Scanning existing custodian files...")
    existing_qids = get_existing_qids()
    print(f" Found {len(existing_qids)} existing Wikidata QIDs")
    print()

    # Query Wikidata
    print("🔍 Querying Wikidata for Argentina institutions...")
    print(" Museums...", end=" ", flush=True)
    museum_results = query_wikidata_museums()
    print(f"found {len(museum_results)} raw results")
    print(" Archives...", end=" ", flush=True)
    archive_results = query_wikidata_archives()
    print(f"found {len(archive_results)} raw results")
    print(" Galleries...", end=" ", flush=True)
    gallery_results = query_wikidata_galleries()
    print(f"found {len(gallery_results)} raw results")
    print()

    # Parse and deduplicate results (first type seen for a QID wins)
    institutions = {}
    for binding in museum_results:
        inst = parse_wikidata_result(binding, "M")
        if inst and inst["qid"] not in institutions:
            institutions[inst["qid"]] = inst
    for binding in archive_results:
        inst = parse_wikidata_result(binding, "A")
        if inst and inst["qid"] not in institutions:
            institutions[inst["qid"]] = inst
    for binding in gallery_results:
        inst = parse_wikidata_result(binding, "G")
        if inst and inst["qid"] not in institutions:
            institutions[inst["qid"]] = inst

    print(f"📊 Total unique institutions: {len(institutions)}")

    # Filter out existing
    new_institutions = {qid: inst for qid, inst in institutions.items()
                        if qid not in existing_qids}
    print(f" After filtering existing: {len(new_institutions)} new institutions")
    print()

    # Create custodian files
    stats = {
        "created": 0,
        "collisions": 0,
        "errors": 0,
        "by_type": {"M": 0, "A": 0, "G": 0},
    }

    if args.dry_run:
        print("🔄 [DRY RUN] Would create the following files:")
    else:
        print("🔄 Creating custodian YAML files...")
    print()

    for qid, inst in sorted(new_institutions.items(), key=lambda x: x[1]["name"]):
        name = inst["name"]
        inst_type = inst["institution_type"]
        city = inst.get("city", "Unknown")
        print(f" [{inst_type}] {name}")
        print(f" 📍 {city}, QID: {qid}")
        filepath = create_custodian_yaml(inst, dry_run=args.dry_run)
        if filepath:
            if not args.dry_run:
                print(f" ✅ Created: {filepath.name}")
            stats["created"] += 1
            stats["by_type"][inst_type] += 1
        else:
            stats["errors"] += 1
        print()

    # Summary
    print("=" * 80)
    print("IMPORT COMPLETE")
    print("=" * 80)
    print(f"✅ Created: {stats['created']} custodian files")
    print(f" - Museums: {stats['by_type']['M']}")
    print(f" - Archives: {stats['by_type']['A']}")
    print(f" - Galleries: {stats['by_type']['G']}")
    print(f"❌ Errors: {stats['errors']}")
    print()

    # Final count
    if not args.dry_run:
        final_count = len(list(CUSTODIAN_DIR.glob("AR-*.yaml")))
        print(f"📁 Total Argentina custodian files: {final_count}")


if __name__ == "__main__":
    main()