glam/scripts/import_argentina_wikidata_institutions.py
2025-12-21 00:01:54 +01:00

658 lines
20 KiB
Python

#!/usr/bin/env python3
"""
Import Argentina heritage institutions from Wikidata into custodian YAML files.
Queries Wikidata for museums and archives in Argentina, filters out institutions
that already exist in custodian files, and creates new YAML files with complete
GHCID metadata.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
Country: Argentina (AR)
Source: Wikidata SPARQL queries
Usage:
python scripts/import_argentina_wikidata_institutions.py [--dry-run]
"""
import argparse
import json
import re
import sqlite3
import sys
import time
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import requests
import yaml
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import GHCIDComponents
# Constants
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Identifying User-Agent, as requested by the Wikidata Query Service policy
USER_AGENT = "GLAM-Argentina-Wikidata-Import/1.0 (https://github.com/glam-project)"
BASE_DIR = Path(__file__).parent.parent
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
GEONAMES_DB = BASE_DIR / "data" / "reference" / "geonames.db"
ISO_3166_2_AR = BASE_DIR / "data" / "reference" / "iso_3166_2_ar.json"
# Argentina ISO 3166-2 region codes, keyed by province/city label as it may
# appear in Wikidata (Spanish, English, and unaccented spelling variants).
AR_REGION_CODES = {
    "Salta": "A",
    "Buenos Aires": "B",
    "Buenos Aires Province": "B",
    "Provincia de Buenos Aires": "B",
    "Ciudad Autónoma de Buenos Aires": "C",
    "Ciudad de Buenos Aires": "C",
    "Autonomous City of Buenos Aires": "C",
    "Capital Federal": "C",
    "CABA": "C",
    "San Luis": "D",
    "Entre Ríos": "E",
    "Entre Rios": "E",
    "La Rioja": "F",
    "Santiago del Estero": "G",
    "Chaco": "H",
    "San Juan": "J",
    "Catamarca": "K",
    "La Pampa": "L",
    "Mendoza": "M",
    "Misiones": "N",
    "Formosa": "P",
    "Neuquén": "Q",
    "Neuquen": "Q",
    "Río Negro": "R",
    "Rio Negro": "R",
    "Santa Fe": "S",
    "Tucumán": "T",
    "Tucuman": "T",
    "Chubut": "U",
    "Tierra del Fuego": "V",
    "Corrientes": "W",
    "Córdoba": "X",
    "Cordoba": "X",
    "Jujuy": "Y",
    "Santa Cruz": "Z",
}
# GeoNames admin1 code to ISO 3166-2 mapping for Argentina
# (GeoNames uses its own two-digit first-level admin codes for AR).
GEONAMES_ADMIN1_TO_ISO = {
    "01": "B",  # Buenos Aires Province
    "02": "K",  # Catamarca
    "03": "H",  # Chaco
    "04": "U",  # Chubut
    "05": "X",  # Córdoba
    "06": "W",  # Corrientes
    "07": "C",  # Ciudad de Buenos Aires (CABA)
    "08": "E",  # Entre Ríos
    "09": "P",  # Formosa
    "10": "Y",  # Jujuy
    "11": "L",  # La Pampa
    "12": "F",  # La Rioja
    "13": "M",  # Mendoza
    "14": "N",  # Misiones
    "15": "Q",  # Neuquén
    "16": "R",  # Río Negro
    "17": "A",  # Salta
    "18": "J",  # San Juan
    "19": "D",  # San Luis
    "20": "Z",  # Santa Cruz
    "21": "S",  # Santa Fe
    "22": "G",  # Santiago del Estero
    "23": "V",  # Tierra del Fuego
    "24": "T",  # Tucumán
}
def normalize_to_ascii(text: str) -> str:
    """Strip diacritics from *text*, returning a plain-ASCII approximation."""
    # NFD splits each accented character into its base character plus
    # combining marks; dropping the nonspacing marks ('Mn') leaves the base.
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
def generate_city_code(city_name: str) -> str:
    """
    Generate a 3-letter city code from a city name.

    Rules:
    - Empty input: placeholder "XXX"
    - Single word: first 3 letters, uppercased
    - Name starting with a Spanish article/connector (la, el, los, las,
      de, del): drop all such words and code the remaining word(s)
    - Multi-word otherwise: initials of the first 3 words

    (The previous docstring described Dutch articles; the implementation
    handles Spanish article words, as this importer targets Argentina.)
    """
    if not city_name:
        return "XXX"

    # ASCII-fold so accented names (e.g. "Córdoba") code cleanly
    words = normalize_to_ascii(city_name).split()

    if len(words) == 1:
        # Single word: first 3 letters
        return words[0][:3].upper()

    # Spanish articles/connectors carry no information for the code
    spanish_articles = {"la", "el", "los", "las", "de", "del"}
    if words[0].lower() in spanish_articles:
        remaining = [w for w in words if w.lower() not in spanish_articles]
        if remaining:
            if len(remaining) == 1:
                return remaining[0][:3].upper()
            # Initials of the remaining significant words
            return "".join(w[0] for w in remaining[:3]).upper()

    # General multi-word case: initials of the first 3 words
    return "".join(w[0] for w in words[:3]).upper()
def extract_abbreviation_from_name(name: str) -> str:
    """
    Derive an institution abbreviation from its emic (local) name.

    Takes the first letter of every significant word — Spanish articles,
    prepositions, and conjunctions are skipped — capped at 10 characters.
    Returns "UNK" when no usable word remains.
    """
    if not name:
        return "UNK"

    # Spanish function words (articles, prepositions, conjunctions)
    stopwords = {
        "el", "la", "los", "las", "un", "una", "unos", "unas",
        "de", "del", "a", "al", "en", "con", "por", "para",
        "sobre", "bajo", "y", "o", "e", "u",
    }

    # ASCII-fold, then strip everything except letters, digits, whitespace
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", normalize_to_ascii(name))
    tokens = cleaned.split()

    significant = [t for t in tokens if t.lower() not in stopwords]
    if not significant:
        # Every word was a stopword: fall back to all words
        significant = tokens
    if not significant:
        return "UNK"

    # Acronym from initials, limited to 10 characters
    return "".join(t[0].upper() for t in significant)[:10]
# Wikidata QIDs for the GLAM institution classes queried by this importer
_WD_CLASS_MUSEUM = "Q33506"
_WD_CLASS_ARCHIVE = "Q166118"
_WD_CLASS_GALLERY = "Q1007870"


def _build_institution_query(class_qid: str) -> str:
    """Build the shared SPARQL query for Argentine institutions of one class.

    The three public query functions previously duplicated this template
    verbatim; only the class QID differs.
    """
    return f"""
    SELECT DISTINCT ?item ?itemLabel ?coords ?cityLabel ?websiteUrl WHERE {{
      ?item wdt:P31/wdt:P279* wd:{class_qid} .  # instance of class (or subclass)
      ?item wdt:P17 wd:Q414 .                   # country: Argentina
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P131 ?city . }}
      OPTIONAL {{ ?item wdt:P856 ?websiteUrl . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" . }}
    }}
    ORDER BY ?itemLabel
    """


def query_wikidata_museums() -> list[dict]:
    """Query Wikidata for museums (Q33506) in Argentina."""
    return _execute_sparql(_build_institution_query(_WD_CLASS_MUSEUM))


def query_wikidata_archives() -> list[dict]:
    """Query Wikidata for archives (Q166118) in Argentina."""
    return _execute_sparql(_build_institution_query(_WD_CLASS_ARCHIVE))


def query_wikidata_galleries() -> list[dict]:
    """Query Wikidata for art galleries (Q1007870) in Argentina."""
    return _execute_sparql(_build_institution_query(_WD_CLASS_GALLERY))
def _execute_sparql(query: str) -> list[dict]:
    """
    Execute a SPARQL query against the Wikidata endpoint.

    Returns the list of result bindings, or [] on any network/HTTP/parse
    error (the error is printed, never raised, so one failed query does not
    abort the whole import).
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json",
    }
    params = {"query": query, "format": "json"}
    time.sleep(1.0)  # Courtesy rate limiting for the public WDQS endpoint
    try:
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP-status errors;
        # ValueError covers malformed JSON in the response body.
        # (Previously a blanket `except Exception` that could hide bugs.)
        print(f" ❌ SPARQL query failed: {e}")
        return []
    return data.get("results", {}).get("bindings", [])
def parse_wikidata_result(binding: dict, institution_type: str) -> Optional[dict]:
    """
    Convert one SPARQL result binding into a normalized institution dict.

    Returns None when the binding lacks a usable QID or a human-readable
    label (Wikidata echoes the QID as the label when no label exists).
    Coordinates, city, and website are included only when present and valid.
    """
    def _value(key: str) -> str:
        # SPARQL JSON bindings nest the actual value under "value"
        return binding.get(key, {}).get("value", "")

    item_uri = _value("item")
    qid = item_uri.rsplit("/", 1)[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    label = _value("itemLabel")
    if not label or label == qid:
        # Label equal to the QID means no real label was found
        return None

    parsed: dict = {
        "qid": qid,
        "name": label,
        "institution_type": institution_type,
    }

    # Coordinates arrive as WKT: "Point(lon lat)"
    coords = _value("coords")
    if coords.startswith("Point("):
        try:
            lon_str, lat_str = coords[6:-1].split()
            parsed["latitude"] = float(lat_str)
            parsed["longitude"] = float(lon_str)
        except (ValueError, IndexError):
            pass

    city = _value("cityLabel")
    if city and not city.startswith("Q"):  # Skip unlabeled cities (bare QID)
        parsed["city"] = city

    website = _value("websiteUrl")
    if website:
        parsed["website"] = website

    return parsed
def get_existing_qids() -> set[str]:
    """
    Collect the Wikidata QIDs already recorded in Argentina custodian files.

    Scans every AR-*.yaml under CUSTODIAN_DIR and reads the QID from both
    the wikidata_enrichment and original_entry sections. Unreadable or
    malformed files are skipped (best-effort scan).

    Fixes: yaml.safe_load returns None for empty files and a section value
    may itself be None; previously either case raised AttributeError and
    the blanket `except Exception` silently dropped the whole file —
    including a QID already read from its first section.
    """
    qids: set[str] = set()
    for filepath in CUSTODIAN_DIR.glob("AR-*.yaml"):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
        except (OSError, yaml.YAMLError):
            continue
        # Empty or non-mapping YAML documents carry no QIDs
        if not isinstance(data, dict):
            continue
        for section, key in (
            ("wikidata_enrichment", "wikidata_entity_id"),
            ("original_entry", "wikidata_id"),
        ):
            # `or {}` guards against an explicit null section value
            wd_id = (data.get(section) or {}).get(key)
            if wd_id:
                qids.add(wd_id)
    return qids
def reverse_geocode_to_region(lat: float, lon: float) -> Optional[tuple[str, str, str]]:
    """
    Reverse geocode coordinates to an Argentine region and city.

    Finds the nearest populated place in the GeoNames SQLite database using
    a squared-degree distance (adequate for nearest-match at this scale) and
    maps its GeoNames admin1 code to an ISO 3166-2 region letter.

    Returns:
        (region_code, city_name, city_code), or None when the database is
        missing, the lookup fails, or no city matches.
    """
    if not GEONAMES_DB.exists():
        return None
    conn = None
    try:
        conn = sqlite3.connect(GEONAMES_DB)
        cursor = conn.cursor()
        # Restrict to populated-place feature codes so neighborhoods and
        # administrative areas are not returned as the "city"
        cursor.execute("""
            SELECT name, ascii_name, admin1_code, admin1_name,
                   latitude, longitude,
                   ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
            FROM cities
            WHERE country_code = 'AR'
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY distance_sq
            LIMIT 1
        """, (lat, lat, lon, lon))
        row = cursor.fetchone()
        if row:
            city_name, ascii_name, admin1_code = row[0], row[1], row[2]
            # Map GeoNames admin1 to ISO 3166-2; "XX" = unknown region
            region_code = GEONAMES_ADMIN1_TO_ISO.get(admin1_code, "XX")
            city_code = generate_city_code(ascii_name or city_name)
            return (region_code, city_name, city_code)
    except Exception as e:
        # Best-effort: report and fall through to None
        print(f" ⚠️ GeoNames lookup failed: {e}")
    finally:
        # Previously the connection leaked when execute() raised; always close
        if conn is not None:
            conn.close()
    return None
def city_label_to_region(city_label: str) -> Optional[str]:
    """
    Best-effort mapping from a Wikidata city/admin label to a region code.

    Tries the province-name table first (exact or substring match), then a
    small table of well-known cities. Returns None when nothing matches.
    """
    if not city_label:
        return None

    label_lower = city_label.lower()

    # Province names: exact match, or province name contained in the label
    for province, code in AR_REGION_CODES.items():
        province_lower = province.lower()
        if province_lower == label_lower:
            return code
        if province_lower in label_lower:
            return code

    # Well-known cities whose label does not contain the province name
    known_cities = {
        "la plata": "B",
        "mar del plata": "B",
        "bahía blanca": "B",
        "bahia blanca": "B",
        "rosario": "S",
        "ushuaia": "V",
        "resistencia": "H",
        "posadas": "N",
        "paraná": "E",
        "parana": "E",
        "san salvador de jujuy": "Y",
        "san miguel de tucumán": "T",
        "san miguel de tucuman": "T",
    }
    for city_key, code in known_cities.items():
        if city_key in label_lower:
            return code

    return None
def create_custodian_yaml(inst: dict, dry_run: bool = False) -> Optional[Path]:
    """
    Create a custodian YAML file for one Wikidata institution.

    Resolves region/city (reverse geocoding first, Wikidata city label as
    fallback), generates GHCID identifiers, and writes the YAML record.
    On a filename collision the numeric Wikidata QID is folded into the
    GHCID to disambiguate.

    Args:
        inst: Normalized institution dict from parse_wikidata_result().
        dry_run: When True, report the target file but do not write it.

    Returns:
        Path of the (to-be-)created file, or None when GHCID generation
        fails.
    """
    qid = inst["qid"]
    name = inst["name"]
    inst_type = inst["institution_type"]

    # Determine location; "XX"/"XXX" are unknown-region/city placeholders
    region_code = "XX"
    city_code = "XXX"
    city_name = inst.get("city", "")

    # Try reverse geocoding first (most accurate)
    if "latitude" in inst and "longitude" in inst:
        geo_result = reverse_geocode_to_region(inst["latitude"], inst["longitude"])
        if geo_result:
            region_code, city_name, city_code = geo_result

    # Fallback: derive region and city code from the Wikidata city label
    if region_code == "XX" and city_name:
        region = city_label_to_region(city_name)
        if region:
            region_code = region
            city_code = generate_city_code(city_name)

    # Generate abbreviation from the institution name
    abbreviation = extract_abbreviation_from_name(name)

    # Create GHCID components and derived identifiers
    try:
        components = GHCIDComponents(
            country_code="AR",
            region_code=region_code,
            city_locode=city_code,
            institution_type=inst_type,
            abbreviation=abbreviation,
        )
        ghcid_current = components.to_string()
        ghcid_uuid = str(components.to_uuid())
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())
        ghcid_numeric = components.to_numeric()
    except Exception as e:
        print(f" ❌ GHCID generation failed for {name}: {e}")
        return None

    # On filename collision, append the numeric Wikidata QID and regenerate
    filename = f"{ghcid_current}.yaml"
    filepath = CUSTODIAN_DIR / filename
    if filepath.exists():
        components.wikidata_qid = qid.replace("Q", "")
        ghcid_current = components.to_string()
        ghcid_uuid = str(components.to_uuid())
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())
        ghcid_numeric = components.to_numeric()
        filename = f"{ghcid_current}.yaml"
        filepath = CUSTODIAN_DIR / filename

    timestamp = datetime.now(timezone.utc).isoformat()

    # Build YAML structure
    data = {
        "original_entry": {
            "name": name,
            "source": "Wikidata SPARQL import",
            "wikidata_id": qid,
        },
        "processing_timestamp": timestamp,
        "ghcid": {
            "ghcid_current": ghcid_current,
            "ghcid_uuid": ghcid_uuid,
            "ghcid_uuid_sha256": ghcid_uuid_sha256,
            "ghcid_numeric": ghcid_numeric,
            # uuid imported at module level (was an inline __import__ hack)
            "record_id": str(uuid.uuid4()),
            "generation_timestamp": timestamp,
            "location_resolution": {
                "method": "WIKIDATA_IMPORT",
                "country_code": "AR",
                "region_code": region_code,
                "city_code": city_code,
                "city_label": city_name or None,
            },
        },
        "custodian_name": {
            "claim_type": "custodian_name",
            "claim_value": name,
            "source_type": "wikidata",
            "emic_name": name,
            "name_language": "es",
        },
        "institution_type": {
            "M": "MUSEUM",
            "A": "ARCHIVE",
            "G": "GALLERY",
            "L": "LIBRARY",
        }.get(inst_type, "UNKNOWN"),
        "location": {
            "country": "AR",
            "region_code": region_code,
        },
        "wikidata_enrichment": {
            "wikidata_entity_id": qid,
            "enrichment_date": timestamp,
            "source": "Wikidata SPARQL import",
        },
    }

    # Add optional fields
    if city_name:
        data["location"]["city"] = city_name
    if "latitude" in inst and "longitude" in inst:
        data["location"]["latitude"] = inst["latitude"]
        data["location"]["longitude"] = inst["longitude"]
    if "website" in inst:
        data["website"] = inst["website"]

    if dry_run:
        # Fixed: previously printed a literal "(unknown)" placeholder
        # instead of the target filename
        print(f" [DRY RUN] Would create: {filename}")
        return filepath

    # Write YAML file
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return filepath
def main():
    """Entry point: query Wikidata, skip known QIDs, write custodian YAMLs."""
    parser = argparse.ArgumentParser(description="Import Argentina institutions from Wikidata")
    parser.add_argument("--dry-run", action="store_true", help="Don't create files, just report what would be done")
    args = parser.parse_args()

    print("=" * 80)
    print("ARGENTINA WIKIDATA INSTITUTION IMPORT")
    print("=" * 80)
    print()

    # Get existing QIDs so already-imported institutions are skipped
    print("📂 Scanning existing custodian files...")
    existing_qids = get_existing_qids()
    print(f" Found {len(existing_qids)} existing Wikidata QIDs")
    print()

    # Query Wikidata, one request per institution class
    print("🔍 Querying Wikidata for Argentina institutions...")
    print(" Museums...", end=" ", flush=True)
    museum_results = query_wikidata_museums()
    print(f"found {len(museum_results)} raw results")
    print(" Archives...", end=" ", flush=True)
    archive_results = query_wikidata_archives()
    print(f"found {len(archive_results)} raw results")
    print(" Galleries...", end=" ", flush=True)
    gallery_results = query_wikidata_galleries()
    print(f"found {len(gallery_results)} raw results")
    print()

    # Parse and deduplicate by QID. Museums are processed first, so when a
    # QID appears under several classes the museum typing wins.
    # (Previously three copy-pasted loops.)
    institutions: dict[str, dict] = {}
    for results, type_code in (
        (museum_results, "M"),
        (archive_results, "A"),
        (gallery_results, "G"),
    ):
        for binding in results:
            inst = parse_wikidata_result(binding, type_code)
            if inst and inst["qid"] not in institutions:
                institutions[inst["qid"]] = inst
    print(f"📊 Total unique institutions: {len(institutions)}")

    # Filter out institutions already present in custodian files
    new_institutions = {qid: inst for qid, inst in institutions.items() if qid not in existing_qids}
    print(f" After filtering existing: {len(new_institutions)} new institutions")
    print()

    # Create custodian files (the unused "collisions" counter was removed)
    stats = {
        "created": 0,
        "errors": 0,
        "by_type": {"M": 0, "A": 0, "G": 0},
    }
    if args.dry_run:
        print("🔄 [DRY RUN] Would create the following files:")
    else:
        print("🔄 Creating custodian YAML files...")
    print()
    for qid, inst in sorted(new_institutions.items(), key=lambda x: x[1]["name"]):
        name = inst["name"]
        inst_type = inst["institution_type"]
        city = inst.get("city", "Unknown")
        print(f" [{inst_type}] {name}")
        print(f" 📍 {city}, QID: {qid}")
        filepath = create_custodian_yaml(inst, dry_run=args.dry_run)
        if filepath:
            if not args.dry_run:
                print(f" ✅ Created: {filepath.name}")
            stats["created"] += 1
            stats["by_type"][inst_type] += 1
        else:
            stats["errors"] += 1
    print()

    # Summary
    print("=" * 80)
    print("IMPORT COMPLETE")
    print("=" * 80)
    print(f"✅ Created: {stats['created']} custodian files")
    print(f" - Museums: {stats['by_type']['M']}")
    print(f" - Archives: {stats['by_type']['A']}")
    print(f" - Galleries: {stats['by_type']['G']}")
    print(f"❌ Errors: {stats['errors']}")
    print()

    # Final count of Argentina custodian files on disk
    if not args.dry_run:
        final_count = len(list(CUSTODIAN_DIR.glob("AR-*.yaml")))
        print(f"📁 Total Argentina custodian files: {final_count}")


if __name__ == "__main__":
    main()