- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
929 lines
35 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Global Wikidata SPARQL Enrichment for Heritage Institutions
|
|
|
|
This script enriches heritage institutions worldwide by querying Wikidata's SPARQL endpoint
|
|
and performing fuzzy name matching to find real Q-numbers.
|
|
|
|
🚨 CRITICAL POLICY: REAL IDENTIFIERS ONLY 🚨
|
|
This script NEVER generates synthetic Q-numbers. If no Wikidata match is found,
|
|
institutions remain without Q-numbers and are flagged for manual enrichment.
|
|
|
|
Strategy:
|
|
1. Process institutions by country (configurable priority order)
|
|
2. Query Wikidata for museums/archives/libraries in each country using SPARQL
|
|
3. Fuzzy match institution names (threshold > 0.85)
|
|
4. Cross-reference ISIL/VIAF identifiers for high-confidence matches
|
|
5. Update GHCIDs ONLY when collision resolution requires Q-number
|
|
6. Track provenance with match confidence scores
|
|
|
|
Priority Countries (configurable):
|
|
- Netherlands (NL, Q55): Highest data quality, 1,351 institutions
|
|
- Chile (CL, Q298): Good name quality, 28.9% current coverage
|
|
- Belgium (BE, Q31): ~500 institutions
|
|
- Italy (IT, Q38): ~400 institutions
|
|
- Denmark (DK, Q35): ~300 institutions
|
|
|
|
Usage:
|
|
python enrich_institutions_wikidata_sparql.py --countries NL CL BE --threshold 0.85 --dry-run
|
|
python enrich_institutions_wikidata_sparql.py --all-countries --skip-existing
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
import argparse
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
# Country configurations (Wikidata QIDs)
# Prioritized by data quality, institution count, and expected match rate
#
# Each entry maps an ISO 3166-1 alpha-2 country code to:
#   name:      English country name (console output only)
#   qid:       Wikidata QID of the country (used as the object of wdt:P17)
#   flag:      emoji flag (console output only)
#   languages: comma-separated language codes passed to the SPARQL label service
#   priority:  processing group, 1 (best data quality) .. 5 (sparsest coverage)
COUNTRY_CONFIGS = {
    # Priority 1: High data quality, large datasets
    'NL': {'name': 'Netherlands', 'qid': 'Q55', 'flag': '🇳🇱', 'languages': 'nl,en', 'priority': 1},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱', 'languages': 'es,en', 'priority': 1},

    # Priority 2: Medium datasets, good coverage potential
    'BE': {'name': 'Belgium', 'qid': 'Q31', 'flag': '🇧🇪', 'languages': 'nl,fr,en', 'priority': 2},
    'IT': {'name': 'Italy', 'qid': 'Q38', 'flag': '🇮🇹', 'languages': 'it,en', 'priority': 2},
    'DK': {'name': 'Denmark', 'qid': 'Q35', 'flag': '🇩🇰', 'languages': 'da,en', 'priority': 2},
    'AT': {'name': 'Austria', 'qid': 'Q40', 'flag': '🇦🇹', 'languages': 'de,en', 'priority': 2},
    'CH': {'name': 'Switzerland', 'qid': 'Q39', 'flag': '🇨🇭', 'languages': 'de,fr,it,en', 'priority': 2},
    'NO': {'name': 'Norway', 'qid': 'Q20', 'flag': '🇳🇴', 'languages': 'no,en', 'priority': 2},

    # Priority 3: Latin America (already partially enriched)
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷', 'languages': 'pt,en', 'priority': 3},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽', 'languages': 'es,en', 'priority': 3},
    'AR': {'name': 'Argentina', 'qid': 'Q414', 'flag': '🇦🇷', 'languages': 'es,en', 'priority': 3},
    'CO': {'name': 'Colombia', 'qid': 'Q739', 'flag': '🇨🇴', 'languages': 'es,en', 'priority': 3},

    # Priority 4: Asian countries (language barriers)
    'JP': {'name': 'Japan', 'qid': 'Q17', 'flag': '🇯🇵', 'languages': 'ja,en', 'priority': 4},
    'VN': {'name': 'Vietnam', 'qid': 'Q881', 'flag': '🇻🇳', 'languages': 'vi,en', 'priority': 4},
    'TH': {'name': 'Thailand', 'qid': 'Q869', 'flag': '🇹🇭', 'languages': 'th,en', 'priority': 4},
    'TW': {'name': 'Taiwan', 'qid': 'Q865', 'flag': '🇹🇼', 'languages': 'zh,en', 'priority': 4},
    'KR': {'name': 'South Korea', 'qid': 'Q884', 'flag': '🇰🇷', 'languages': 'ko,en', 'priority': 4},
    'MY': {'name': 'Malaysia', 'qid': 'Q833', 'flag': '🇲🇾', 'languages': 'ms,en', 'priority': 4},
    'ID': {'name': 'Indonesia', 'qid': 'Q252', 'flag': '🇮🇩', 'languages': 'id,en', 'priority': 4},
    'PH': {'name': 'Philippines', 'qid': 'Q928', 'flag': '🇵🇭', 'languages': 'en,tl', 'priority': 4},

    # Priority 5: African/Middle Eastern countries (fewer Wikidata entries)
    'EG': {'name': 'Egypt', 'qid': 'Q79', 'flag': '🇪🇬', 'languages': 'ar,en', 'priority': 5},
    'ZA': {'name': 'South Africa', 'qid': 'Q258', 'flag': '🇿🇦', 'languages': 'en,af', 'priority': 5},
    'KE': {'name': 'Kenya', 'qid': 'Q114', 'flag': '🇰🇪', 'languages': 'en,sw', 'priority': 5},
    'NG': {'name': 'Nigeria', 'qid': 'Q1033', 'flag': '🇳🇬', 'languages': 'en', 'priority': 5},
    'GH': {'name': 'Ghana', 'qid': 'Q117', 'flag': '🇬🇭', 'languages': 'en', 'priority': 5},

    # Add more countries as needed
}
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """
    Normalize an institution name for fuzzy matching.

    Lowercases the name, strips common GLAM-sector prefixes and suffixes in
    Dutch, English, Spanish/Portuguese, French, German and Italian (each
    pattern applied once, in order), replaces punctuation with spaces, and
    collapses whitespace.
    """
    cleaned = name.lower()

    # Leading articles / sector words (multilingual), stripped once each.
    prefix_patterns = (
        # Dutch
        r'^(het |de |museum |archief |bibliotheek |stichting |nationaal |provinciaal |gemeentelijk |regionaal )',
        # English
        r'^(the |museum |archive |library |foundation |national |provincial |municipal |regional )',
        # Spanish/Portuguese
        r'^(el |la |los |las |museo |archivo |biblioteca |fundación |fundação |nacional |provincial |municipal |regional )',
        # French
        r'^(le |la |les |musée |archives |bibliothèque |fondation |national |provincial |municipal |régional )',
        # German
        r'^(das |die |der |museum |archiv |bibliothek |stiftung |national |provinziell |kommunal |regional )',
        # Italian
        r'^(il |lo |la |museo |archivio |biblioteca |fondazione |nazionale |provinciale |comunale |regionale )',
    )

    # Trailing sector words (multilingual), stripped once each.
    suffix_patterns = (
        r'\s+(museum|museu|museo|musée)$',
        r'\s+(archief|archive|archivo|arquivo|archives)$',
        r'\s+(bibliotheek|library|biblioteca|bibliothèque)$',
        r'\s+(stichting|foundation|fundación|fundação|fondation|fondazione)$',
        r'\s+(national|nacional|nationale|nationaal)$',
        r'\s+(regional|regional|régional)$',
        r'\s+(municipal|comunal|municipale)$',
    )

    for pattern in prefix_patterns + suffix_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)

    # Punctuation becomes whitespace, then runs of whitespace are collapsed.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return ' '.join(cleaned.split()).strip()
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return the 0-1 similarity ratio between two normalized institution names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
|
|
|
|
|
|
def query_wikidata_institutions(
    sparql: SPARQLWrapper,
    country_qid: str,
    institution_types: list[str],
    languages: str = "en"
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for heritage institutions in a specific country.

    🔧 OPTIMIZED VERSION: Queries each institution type separately to avoid
    expensive transitive subclass queries (wdt:P279*) that cause 504 timeouts.
    (Each per-type query still allows a single optional subclass hop via
    wdt:P31/wdt:P279?, and is capped at 1000 rows.)

    Args:
        sparql: Configured SPARQLWrapper instance
        country_qid: Wikidata QID for country (e.g., Q55 for Netherlands)
        institution_types: List of Wikidata QIDs for institution types:
            Q33506 - museum
            Q7075 - library
            Q166118 - archive
            Q2668072 - art gallery
            Q5282129 - cultural center
        languages: Comma-separated language codes for labels (e.g., "nl,en")

    Returns:
        Dictionary mapping Wikidata QIDs to institution metadata, merged
        across all queried types (a QID found under two types keeps the
        last-parsed record).
    """

    # Query each type separately to avoid timeout
    all_results = {}

    for inst_type_qid in institution_types:
        print(f" - Querying {inst_type_qid}...", end="", flush=True)

        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?itemAltLabel ?isil ?viaf ?coords ?website ?inception ?instType ?instTypeLabel
        WHERE {{
          # Direct instance-of match (no expensive transitive subclass)
          ?item wdt:P31/wdt:P279? wd:{inst_type_qid} .  # instance of (or subclass of) type
          ?item wdt:P17 wd:{country_qid} .  # country

          # Capture the specific type
          ?item wdt:P31 ?instType .

          # Optional identifiers and metadata
          OPTIONAL {{ ?item wdt:P791 ?isil . }}  # ISIL code
          OPTIONAL {{ ?item wdt:P214 ?viaf . }}  # VIAF ID
          OPTIONAL {{ ?item wdt:P625 ?coords . }}  # Coordinates
          OPTIONAL {{ ?item wdt:P856 ?website . }}  # Official website
          OPTIONAL {{ ?item wdt:P571 ?inception . }}  # Founding date

          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "{languages}" .
            ?item rdfs:label ?itemLabel .
            ?item schema:description ?itemDescription .
            ?item skos:altLabel ?itemAltLabel .
            ?instType rdfs:label ?instTypeLabel .
          }}
        }}
        LIMIT 1000
        """

        sparql.setQuery(query)

        try:
            raw_results = sparql.query().convert()
            # convert() may return a non-dict payload on odd responses;
            # treat anything that is not a dict as an empty result set.
            bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

            # Merge results (a failure or empty result for one type does not
            # discard results already gathered for earlier types)
            type_results = _parse_sparql_bindings(bindings)
            all_results.update(type_results)
            print(f" {len(type_results)} found")

        except Exception as e:
            # Keep going: a timeout on one type shouldn't abort the others.
            print(f" ❌ Error: {e}")
            continue

    return all_results
|
|
|
|
|
|
def _parse_sparql_bindings(bindings: list[dict]) -> dict[str, dict[str, Any]]:
|
|
"""
|
|
Helper function to parse SPARQL query bindings into institution metadata.
|
|
|
|
Returns:
|
|
Dictionary mapping Wikidata QIDs to institution metadata
|
|
"""
|
|
results = {}
|
|
|
|
for binding in bindings:
|
|
item_uri = binding.get("item", {}).get("value", "")
|
|
qid = item_uri.split("/")[-1] if item_uri else None
|
|
|
|
if not qid or not qid.startswith("Q"):
|
|
continue
|
|
|
|
# Check if it's a REAL Wikidata Q-number (not synthetic)
|
|
try:
|
|
qid_num = int(qid[1:])
|
|
if qid_num >= 90000000:
|
|
# Synthetic Q-number range - SKIP
|
|
continue
|
|
except ValueError:
|
|
continue
|
|
|
|
# Initialize or update result
|
|
if qid not in results:
|
|
results[qid] = {
|
|
"qid": qid,
|
|
"name": binding.get("itemLabel", {}).get("value", ""),
|
|
"description": binding.get("itemDescription", {}).get("value", ""),
|
|
"type": binding.get("instTypeLabel", {}).get("value", ""),
|
|
"alternative_names": [],
|
|
"identifiers": {}
|
|
}
|
|
|
|
# Collect alternative names (multilingual labels)
|
|
alt_label = binding.get("itemAltLabel", {}).get("value", "")
|
|
if alt_label and alt_label not in results[qid]["alternative_names"]:
|
|
results[qid]["alternative_names"].append(alt_label)
|
|
|
|
# Add identifiers
|
|
if "isil" in binding:
|
|
results[qid]["identifiers"]["ISIL"] = binding["isil"]["value"]
|
|
|
|
if "viaf" in binding:
|
|
results[qid]["identifiers"]["VIAF"] = binding["viaf"]["value"]
|
|
|
|
if "website" in binding:
|
|
results[qid]["identifiers"]["Website"] = binding["website"]["value"]
|
|
|
|
if "inception" in binding:
|
|
results[qid]["founding_date"] = binding["inception"]["value"].split("T")[0]
|
|
|
|
if "coords" in binding:
|
|
coords_str = binding["coords"]["value"]
|
|
if coords_str.startswith("Point("):
|
|
lon, lat = coords_str[6:-1].split()
|
|
results[qid]["latitude"] = float(lat)
|
|
results[qid]["longitude"] = float(lon)
|
|
|
|
return results
|
|
|
|
|
|
def has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """
    Return True if the institution carries a REAL (non-synthetic) Wikidata ID.

    A real ID is a "Wikidata"-scheme identifier whose value is "Q<number>"
    with a number below 90,000,000 (the synthetic range starts there).
    """
    for identifier in inst.get("identifiers", []):
        if not isinstance(identifier, dict):
            continue
        if identifier.get("identifier_scheme") != "Wikidata":
            continue

        value = identifier.get("identifier_value", "")
        if not value.startswith("Q"):
            continue
        try:
            numeric = int(value[1:])
        except ValueError:
            continue
        if numeric < 90000000:
            return True  # Real Wikidata ID

    return False
|
|
|
|
|
|
def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """
    Check if institution types are compatible (avoid museum/archive/library mismatches).

    Uses both the institution's formal type and name keywords to validate
    compatibility against the Wikidata type label.

    Args:
        inst_name: Local institution name (keyword-scanned, multilingual).
        inst_type: Local formal type, e.g. 'MUSEUM', 'ARCHIVE', 'LIBRARY'.
        wd_type: Wikidata type label of the candidate (may be empty).

    Returns:
        True if the candidate's type does not contradict the local type.
        An empty Wikidata type is always considered compatible.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()
    formal_type = inst_type.upper()

    # Define type keywords (multilingual)
    museum_keywords = ['museum', 'museo', 'museu', 'musée', 'muzeum', 'muzeu']
    archive_keywords = ['archief', 'archive', 'archivo', 'arquivo', 'archiv', 'arkiv']
    library_keywords = ['bibliotheek', 'library', 'biblioteca', 'bibliothèque', 'bibliothek', 'bibliotek']
    gallery_keywords = ['gallery', 'galerie', 'galería', 'galleria', 'kunsthal', 'kunsthalle']

    # Check if institution name contains type keyword
    inst_is_museum = any(kw in inst_lower for kw in museum_keywords)
    inst_is_archive = any(kw in inst_lower for kw in archive_keywords)
    inst_is_library = any(kw in inst_lower for kw in library_keywords)
    inst_is_gallery = any(kw in inst_lower for kw in gallery_keywords)

    # Check if Wikidata type contains type keyword
    wd_is_museum = any(kw in wd_lower for kw in museum_keywords)
    wd_is_archive = any(kw in wd_lower for kw in archive_keywords)
    wd_is_library = any(kw in wd_lower for kw in library_keywords)
    wd_is_gallery = any(kw in wd_lower for kw in gallery_keywords)

    # Check formal institution type
    formal_is_museum = formal_type in ('MUSEUM', 'GALLERY', 'BOTANICAL_ZOO')
    formal_is_archive = formal_type == 'ARCHIVE'
    formal_is_library = formal_type == 'LIBRARY'

    # If Wikidata type is empty, allow match (type will be determined by name/formal type)
    if not wd_type or not wd_lower.strip():
        return True

    # If both have explicit types, they must match.
    # Bug fix: inst_is_gallery was previously computed but never used, so a
    # gallery-named institution could match an archive/library candidate;
    # galleries and museums are treated as one compatible family here.
    if (inst_is_museum or inst_is_gallery or formal_is_museum) and not (wd_is_museum or wd_is_gallery):
        return False
    if (inst_is_archive or formal_is_archive) and not wd_is_archive:
        return False
    if (inst_is_library or formal_is_library) and not wd_is_library:
        return False

    return True
|
|
|
|
|
|
def isil_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Check for ISIL code cross-reference (highest confidence match).

    Returns:
        1.0 if ISIL codes match exactly
        None if no ISIL match
    """
    local_isil = next(
        (obj.get("identifier_value", "").strip()
         for obj in inst.get("identifiers", [])
         if isinstance(obj, dict) and obj.get("identifier_scheme") == "ISIL"),
        None,
    )
    remote_isil = wd_data.get("identifiers", {}).get("ISIL", "").strip()

    # Both sides must carry a non-empty ISIL and agree exactly.
    if local_isil and remote_isil and local_isil == remote_isil:
        return 1.0  # Perfect match via ISIL
    return None
|
|
|
|
|
|
def viaf_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Check for VIAF ID cross-reference (high confidence match).

    Returns:
        0.98 if VIAF IDs match exactly
        None if no VIAF match
    """
    local_viaf = next(
        (obj.get("identifier_value", "").strip()
         for obj in inst.get("identifiers", [])
         if isinstance(obj, dict) and obj.get("identifier_scheme") == "VIAF"),
        None,
    )
    remote_viaf = wd_data.get("identifiers", {}).get("VIAF", "").strip()

    # Both sides must carry a non-empty VIAF ID and agree exactly.
    if local_viaf and remote_viaf and local_viaf == remote_viaf:
        return 0.98  # Very high confidence via VIAF
    return None
|
|
|
|
|
|
def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any], str]]:
    """
    Fuzzy match institutions with Wikidata results using multiple strategies.

    Matching strategies (in priority order):
    1. ISIL code cross-reference (confidence: 1.0)
    2. VIAF ID cross-reference (confidence: 0.98)
    3. Fuzzy name matching (confidence: similarity score)

    Institutions without a name, or that already carry a real Wikidata ID,
    are skipped. Candidates failing institution_type_compatible() are never
    considered.

    Args:
        institutions: Local institution records to match.
        wikidata_results: Candidate records keyed by QID, as produced by
            query_wikidata_institutions().
        threshold: Minimum confidence for a match to be reported.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data, match_method),
        where institution_idx indexes into the `institutions` argument.
    """
    matches = []

    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")
        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        if has_real_wikidata_id(inst):
            continue

        # Find best match using multiple strategies
        best_score = 0.0
        best_qid = None
        best_data = None
        best_method = "fuzzy_name_match"

        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue

            # Check type compatibility
            if not institution_type_compatible(inst_name, inst_type, wd_type):
                continue

            # Strategy 1: ISIL cross-reference (highest confidence)
            isil_score = isil_cross_reference_match(inst, wd_data)
            if isil_score:
                best_score = isil_score
                best_qid = qid
                best_data = wd_data
                best_method = "isil_cross_reference"
                break  # Perfect match, no need to continue

            # Strategy 2: VIAF cross-reference (very high confidence)
            viaf_score = viaf_cross_reference_match(inst, wd_data)
            if viaf_score and viaf_score > best_score:
                best_score = viaf_score
                best_qid = qid
                best_data = wd_data
                best_method = "viaf_cross_reference"

            # Strategy 3: Fuzzy name matching (check primary name and alternatives)
            # NOTE(review): a perfect name match (score 1.0) can override an
            # earlier VIAF match (0.98), so the reported method may read
            # "fuzzy_name_match" even when VIAF also agreed — confirm intended.
            names_to_check = [wd_name] + wd_data.get("alternative_names", [])
            for wd_name_variant in names_to_check:
                score = similarity_score(inst_name, wd_name_variant)
                if score > best_score:
                    best_score = score
                    best_qid = qid
                    best_data = wd_data
                    best_method = "fuzzy_name_match"

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data, best_method))

    return matches
|
|
|
|
|
|
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any], match_method: str, confidence: float) -> bool:
    """
    Enrich an institution record in place with Wikidata data.

    🚨 CRITICAL: This function ONLY adds REAL Wikidata Q-numbers.
    It NEVER generates synthetic Q-numbers (Q90000000 and above are rejected).

    Args:
        inst: Local institution record (mutated in place).
        wd_data: Wikidata metadata as produced by _parse_sparql_bindings().
        match_method: 'isil_cross_reference', 'viaf_cross_reference' or
            'fuzzy_name_match' (used for provenance text).
        confidence: Match confidence score (recorded for fuzzy matches).

    Returns:
        True if institution was enriched
    """
    enriched = False

    # Normalize the identifiers container (may be missing, None or empty).
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    # Snapshot schemes present BEFORE enrichment so Wikidata-sourced
    # identifiers never overwrite locally curated ones.
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Check if Q-number is REAL (not synthetic)
    qid = wd_data["qid"]
    try:
        qid_num = int(qid[1:])
        if qid_num >= 90000000:
            print(f"⚠️ WARNING: Attempted to add synthetic Q-number {qid} - REJECTED")
            return False
    except ValueError:
        print(f"⚠️ WARNING: Invalid Q-number format {qid} - REJECTED")
        return False

    # Add or replace Wikidata ID
    wikidata_idx = None
    for i, id_obj in enumerate(identifiers_list):
        if isinstance(id_obj, dict) and id_obj.get("identifier_scheme") == "Wikidata":
            wikidata_idx = i
            break

    wikidata_entry = {
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    }
    if wikidata_idx is not None:
        # Replace existing (possibly synthetic) Wikidata ID
        if identifiers_list[wikidata_idx].get("identifier_value", "") != qid:
            identifiers_list[wikidata_idx] = wikidata_entry
            enriched = True
    else:
        # Add new Wikidata ID
        identifiers_list.append(wikidata_entry)
        enriched = True

    # Add other identifiers from Wikidata (only schemes not already present)
    for scheme, value in wd_data.get("identifiers", {}).items():
        if scheme in existing_schemes:
            continue
        id_obj = {
            "identifier_scheme": scheme,
            "identifier_value": value
        }
        if scheme == "VIAF":
            id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            id_obj["identifier_url"] = value
        # ISIL gets no URL; it is only added when missing locally.
        identifiers_list.append(id_obj)
        enriched = True

    # Add founding date if missing
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates if missing (only fills the first location in place)
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict):
                if first_loc.get("latitude") is None or first_loc.get("longitude") is None:
                    first_loc["latitude"] = wd_data["latitude"]
                    first_loc["longitude"] = wd_data["longitude"]
                    enriched = True

    # Update provenance
    if enriched:
        # Bug fix: the original used inst.get("provenance", {}), which silently
        # discarded the provenance update whenever the key was absent;
        # setdefault stores the new dict back on the record.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            match_type_desc = {
                "isil_cross_reference": "Wikidata enrichment (ISIL cross-reference)",
                "viaf_cross_reference": "Wikidata enrichment (VIAF cross-reference)",
                "fuzzy_name_match": f"Wikidata enrichment (fuzzy name match, confidence: {confidence:.3f})"
            }.get(match_method, "Wikidata enrichment")

            if existing_method:
                prov["extraction_method"] = f"{existing_method} + {match_type_desc}"
            else:
                prov["extraction_method"] = match_type_desc

            # Update extraction date
            prov["enrichment_date"] = datetime.now(timezone.utc).isoformat()

    return enriched
|
|
|
|
|
|
def process_country(
    institutions: list[dict[str, Any]],
    country_code: str,
    sparql: SPARQLWrapper,
    threshold: float = 0.85,
    dry_run: bool = False
) -> tuple[int, int, dict[str, int]]:
    """
    Process a single country's institutions: query Wikidata, fuzzy match,
    and (unless dry_run) apply matches to `institutions` in place.

    Args:
        institutions: Full list of institution records (mutated in place).
        country_code: Country code; must be a key of COUNTRY_CONFIGS.
        sparql: Configured SPARQLWrapper instance.
        threshold: Minimum fuzzy-match confidence (0.0-1.0).
        dry_run: If True, only preview matches; no records are modified.

    Returns:
        (institutions_without_wikidata, enriched_count, match_methods_stats)
    """
    country_info = COUNTRY_CONFIGS.get(country_code)
    if not country_info:
        print(f"\n⚠️ Unknown country code: {country_code}")
        return 0, 0, {}

    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")

    # Filter institutions for this country.
    # Bug fix: tolerate records whose 'locations' is missing, None, or an
    # empty list — the original indexed [0] on inst.get('locations', [{}])
    # and raised IndexError when the list existed but was empty.
    country_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if (inst.get('locations') or [{}])[0].get('country') == country_code
    ]

    print(f"📊 Found {len(country_institutions_idx):,} {country_info['name']} institutions")

    # Count those without real Wikidata
    without_wikidata = [
        idx for idx in country_institutions_idx
        if not has_real_wikidata_id(institutions[idx])
    ]

    current_coverage = (len(country_institutions_idx) - len(without_wikidata)) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
    print(f"✅ With Wikidata: {len(country_institutions_idx) - len(without_wikidata):,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {len(without_wikidata):,}\n")

    if not without_wikidata:
        print("✨ All institutions already have Wikidata IDs!")
        return 0, 0, {}

    # Query Wikidata
    print(f"🔍 Querying Wikidata for {country_info['name']} heritage institutions...")
    print(" (This may take 30-90 seconds)\n")

    # Query for museums, libraries, archives, galleries
    institution_types = ["Q33506", "Q7075", "Q166118", "Q2668072"]
    languages = country_info.get('languages', 'en')
    wikidata_results = query_wikidata_institutions(sparql, country_info['qid'], institution_types, languages)

    print(f"✅ Found {len(wikidata_results):,} {country_info['name']} institutions in Wikidata\n")

    if not wikidata_results:
        print("⚠️ No Wikidata results, skipping fuzzy matching")
        return len(without_wikidata), 0, {}

    # Fuzzy match
    print(f"🔗 Matching institutions (threshold: {threshold:.2f})...\n")

    country_insts = [institutions[idx] for idx in without_wikidata]
    matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=threshold)

    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    # Track match methods
    match_methods_stats = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }

    if matches:
        # Show sample matches
        print(f"{'='*80}")
        print(f"📋 SAMPLE MATCHES (Top 10)")
        print(f"{'='*80}")
        for i, (local_idx, qid, score, wd_data, method) in enumerate(matches[:10]):
            inst = country_insts[local_idx]
            print(f"\n{i+1}. Method: {method.upper()}, Confidence: {score:.3f}")
            print(f" Local: {inst.get('name')} ({inst.get('locations', [{}])[0].get('city', 'Unknown')})")
            print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
            print(f" Type: {wd_data.get('type', 'Unknown')}")
            if "ISIL" in wd_data.get("identifiers", {}):
                print(f" ISIL: {wd_data['identifiers']['ISIL']}")
            if "VIAF" in wd_data.get("identifiers", {}):
                print(f" VIAF: {wd_data['identifiers']['VIAF']}")

        print(f"\n{'='*80}\n")

        if dry_run:
            print("🔍 DRY RUN: Would enrich the following institutions:\n")
            for local_idx, qid, score, wd_data, method in matches:
                inst = country_insts[local_idx]
                print(f" - {inst.get('name')} → {qid} (method: {method}, confidence: {score:.3f})")
            # Bug fix: there is no --no-dry-run flag; advise re-running
            # without --dry-run instead.
            print(f"\n✅ Dry run complete. Re-run without --dry-run to apply changes.\n")
            return len(without_wikidata), 0, {}

        # Apply all matches
        print("✅ Applying all matches...\n")
        enriched_count = 0

        for local_idx, qid, score, wd_data, method in matches:
            # local_idx indexes country_insts; map back to the global list.
            global_idx = without_wikidata[local_idx]
            if enrich_institution(institutions[global_idx], wd_data, method, score):
                enriched_count += 1
                match_methods_stats[method] += 1

        new_coverage = (len(country_institutions_idx) - len(without_wikidata) + enriched_count) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
        print(f"✨ Enriched {enriched_count:,} institutions")
        print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
        print(f"\n📊 Match methods:")
        for method, count in match_methods_stats.items():
            if count > 0:
                print(f" {method}: {count:,}")
        print()

        return len(without_wikidata), enriched_count, match_methods_stats

    else:
        print("❌ No matches found. Try lowering threshold.\n")
        return len(without_wikidata), 0, {}
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, enrich per country, write output.

    Returns:
        0 on success, 1 on invalid arguments or a missing input file.
    """
    parser = argparse.ArgumentParser(
        description="Enrich heritage institutions with real Wikidata Q-numbers using SPARQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dutch and Chilean institutions (priority 1)
  %(prog)s --countries NL CL --threshold 0.85

  # All priority 1 and 2 countries
  %(prog)s --priority 1 2 --threshold 0.85

  # Dry run (preview matches without applying)
  %(prog)s --countries NL --dry-run

  # All countries (not recommended - use priority groups)
  %(prog)s --all-countries --threshold 0.85
"""
    )

    parser.add_argument(
        '--countries',
        nargs='+',
        metavar='CODE',
        help='Country codes to process (e.g., NL CL BE IT)'
    )
    parser.add_argument(
        '--priority',
        nargs='+',
        type=int,
        metavar='N',
        help='Process countries by priority level (1-5)'
    )
    parser.add_argument(
        '--all-countries',
        action='store_true',
        help='Process all configured countries (use with caution)'
    )
    parser.add_argument(
        '--threshold',
        type=float,
        default=0.85,
        help='Fuzzy match threshold (0.0-1.0, default: 0.85)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview matches without applying changes'
    )
    parser.add_argument(
        '--input',
        type=Path,
        help='Input YAML file (default: data/instances/global/global_heritage_institutions_wikidata_enriched.yaml)'
    )
    parser.add_argument(
        '--output',
        type=Path,
        help='Output YAML file (default: overwrites input or creates new file with _sparql_enriched suffix)'
    )

    args = parser.parse_args()

    # Determine countries to process: explicit list > priority groups >
    # all countries > default (priority 1 only).
    if args.countries:
        countries_to_process = args.countries
    elif args.priority:
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') in args.priority
        ]
    elif args.all_countries:
        countries_to_process = list(COUNTRY_CONFIGS.keys())
    else:
        # Default: Priority 1 countries
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') == 1
        ]

    # Validate country codes
    invalid_countries = [c for c in countries_to_process if c not in COUNTRY_CONFIGS]
    if invalid_countries:
        print(f"❌ Invalid country codes: {', '.join(invalid_countries)}")
        print(f" Valid codes: {', '.join(sorted(COUNTRY_CONFIGS.keys()))}")
        return 1

    # File paths
    base_dir = Path(__file__).parent.parent

    if args.input:
        input_file = args.input
    else:
        input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    # Bug fix: fail fast with a clear message instead of an unhandled
    # FileNotFoundError traceback when the input file is missing.
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        return 1

    if args.output:
        output_file = args.output
    elif args.dry_run:
        output_file = None  # No output for dry run
    else:
        output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_sparql_enriched.yaml"

    # Header
    print("="*80)
    print("🌍 GLOBAL WIKIDATA SPARQL ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading dataset: {input_file.name}\n")

    start_time = time.time()

    # Load dataset (an empty YAML file parses to None — normalize to []).
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f) or []

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s")

    # Setup SPARQL (POST avoids URL-length limits on the large queries)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.0 (Wikidata Enrichment)")

    # Process countries
    print(f"\n🌍 Processing {len(countries_to_process)} countries:")
    country_names = [COUNTRY_CONFIGS[c]['name'] for c in countries_to_process]
    print(f" {', '.join(country_names)}\n")

    if args.dry_run:
        print("🔍 DRY RUN MODE: No changes will be saved\n")

    total_without_wikidata = 0
    total_enriched = 0
    total_match_methods = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }

    for i, country_code in enumerate(countries_to_process):
        without, enriched, methods = process_country(
            institutions,
            country_code,
            sparql,
            threshold=args.threshold,
            dry_run=args.dry_run
        )
        total_without_wikidata += without
        total_enriched += enriched

        for method, count in methods.items():
            total_match_methods[method] += count

        # Rate limiting - be nice to Wikidata
        if i < len(countries_to_process) - 1:
            print("⏸️ Waiting 5 seconds (Wikidata rate limiting)...\n")
            time.sleep(5)

    # Write output (unless dry run)
    if not args.dry_run and total_enriched > 0 and output_file:
        print("="*80)
        print("💾 WRITING ENRICHED DATASET")
        print("="*80 + "\n")

        # YAML comment header; lines must stay flush-left to remain valid YAML.
        header = f"""---
# Global Heritage Institutions - SPARQL Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Countries processed: {', '.join(countries_to_process)}
# New Wikidata matches: {total_enriched:,}
# Match threshold: {args.threshold:.2f}

"""

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

        print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 FINAL ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Total institutions enriched: {total_enriched:,}")
    print(f" Institutions still without Wikidata: {total_without_wikidata - total_enriched:,}")

    if total_enriched > 0:
        print(f"\n📊 Enrichment methods:")
        for method, count in total_match_methods.items():
            if count > 0:
                percentage = (count / total_enriched * 100) if total_enriched > 0 else 0
                print(f" {method}: {count:,} ({percentage:.1f}%)")

    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")

    if args.dry_run:
        # Bug fix: there is no --no-dry-run flag; advise re-running without --dry-run.
        print("\n🔍 This was a dry run. Re-run without --dry-run to apply changes.")

    print("="*80 + "\n")

    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s exit code (0 on success, 1 on
# invalid arguments or a missing input file) to the shell.
if __name__ == "__main__":
    sys.exit(main())
|