glam/scripts/enrich_latam_institutions_fuzzy.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

500 lines
18 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Latin American institutions using fuzzy name matching in Wikidata.
This script addresses low coverage in Brazil (1%), Mexico (21%), and Chile (29%)
by querying Wikidata for heritage institutions using name-based searches.
Strategy:
1. Find institutions without Wikidata IDs in target countries
2. Query Wikidata for museums/archives/libraries in each country
3. Fuzzy match names (normalized)
4. Apply high-confidence matches (>0.85)
Countries:
- Brazil (BR, Q155): 1% → 15-25% expected
- Mexico (MX, Q96): 21% → 35-45% expected
- Chile (CL, Q298): 29% → 40-50% expected
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
# Country configurations: ISO 3166-1 alpha-2 code -> display name,
# the country's Wikidata QID (used in the SPARQL country filter),
# and a flag emoji for console output.
COUNTRIES = {
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷'},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽'},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱'}
}
def normalize_name(name: str) -> str:
    """Return a normalized form of *name* for fuzzy comparison.

    Lowercases, strips common GLAM-sector type words from the start/end of
    the name (Spanish, then Portuguese, then English — order matters so the
    result matches across languages), replaces punctuation with spaces and
    collapses runs of whitespace.
    """
    text = name.lower()
    # (prefix pattern, suffix pattern) per language, applied in this order.
    affix_patterns = (
        (r'^(fundación|museo|biblioteca|archivo|centro)\s+',
         r'\s+(museo|biblioteca|archivo|nacional|regional|municipal)$'),
        (r'^(fundação|museu|biblioteca|arquivo|centro)\s+',
         r'\s+(museu|biblioteca|arquivo|nacional|estadual|municipal)$'),
        (r'^(foundation|museum|library|archive|center|centre)\s+',
         r'\s+(museum|library|archive|national|regional|municipal)$'),
    )
    for prefix_re, suffix_re in affix_patterns:
        text = re.sub(prefix_re, '', text)
        text = re.sub(suffix_re, '', text)
    # Punctuation becomes whitespace, then whitespace is collapsed.
    text = re.sub(r'[^\w\s]', ' ', text)
    return ' '.join(text.split())
def similarity_score(name1: str, name2: str) -> float:
    """Similarity ratio in [0.0, 1.0] between two normalized institution names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
def query_wikidata_institutions(
    sparql: SPARQLWrapper,
    country_qid: str,
    institution_types: list[str]
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for heritage institutions in a specific country.

    country_qid: Wikidata QID for country (Q155=Brazil, Q96=Mexico, Q298=Chile)
    institution_types: List of Wikidata QIDs for institution types
        (Q33506 museum, Q7075 library, Q166118 archive)

    Returns a mapping of item QID -> parsed record with name, description,
    type label, identifiers (ISIL/VIAF/Website), optional founding_date and
    coordinates. Returns an empty dict on query failure.
    """
    types_values = " ".join(f"wd:{qid}" for qid in institution_types)
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {{
      VALUES ?type {{ {types_values} }}
      ?item wdt:P31 ?type .            # instance of museum/library/archive
      ?item wdt:P17 wd:{country_qid} . # country
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,pt,en" . }}
    }}
    LIMIT 2000
    """
    sparql.setQuery(query)
    try:
        raw = sparql.query().convert()
        bindings = raw.get("results", {}).get("bindings", []) if isinstance(raw, dict) else []
        parsed: dict[str, dict[str, Any]] = {}
        for row in bindings:
            item_uri = row.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue
            record: dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "type": row.get("typeLabel", {}).get("value", ""),
                "identifiers": {},
            }
            # Optional external identifiers, copied only when bound.
            for var, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
                if var in row:
                    record["identifiers"][scheme] = row[var]["value"]
            if "inception" in row:
                # Keep only the date part of the xsd:dateTime literal.
                record["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                wkt = row["coords"]["value"]
                # WKT literals look like "Point(lon lat)" — note lon comes first.
                if wkt.startswith("Point("):
                    lon, lat = wkt[6:-1].split()
                    record["latitude"] = float(lat)
                    record["longitude"] = float(lon)
            parsed[qid] = record
        return parsed
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}
def institution_type_compatible(inst_name: str, wd_type: str) -> bool:
    """Check that a local name and a Wikidata type label agree on GLAM kind.

    If the institution's name explicitly says "museum", "archive" or
    "library" (in any supported language), the Wikidata type label must
    mention the same kind — this avoids e.g. museum/archive cross-matches.
    Names with no explicit kind are compatible with everything.
    """
    # One keyword tuple per kind (multilingual).
    keyword_sets = (
        ('museum', 'museo', 'museu', 'musée'),
        ('archief', 'archive', 'archivo', 'arquivo'),
        ('bibliotheek', 'library', 'biblioteca', 'bibliothèque'),
    )
    name_lower = inst_name.lower()
    type_lower = wd_type.lower()
    for keywords in keyword_sets:
        name_has_kind = any(kw in name_lower for kw in keywords)
        type_has_kind = any(kw in type_lower for kw in keywords)
        # A kind stated in the name must be confirmed by the Wikidata type.
        if name_has_kind and not type_has_kind:
            return False
    return True
def _is_real_wikidata_id(id_obj: Any) -> bool:
    """True if *id_obj* is a well-formed, non-synthetic Wikidata identifier.

    Synthetic placeholders use huge Q-numbers (>= 100_000_000). Previously a
    malformed value such as "Q" or "Qabc" made int() raise ValueError, and a
    non-dict identifier entry raised AttributeError, crashing the whole run;
    both now simply count as "not a real Wikidata ID".
    """
    if not isinstance(id_obj, dict):
        return False
    if id_obj.get("identifier_scheme") != "Wikidata":
        return False
    value = id_obj.get("identifier_value", "")
    if not value.startswith("Q"):
        return False
    try:
        return int(value[1:]) < 100000000
    except ValueError:
        return False


def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
    """
    Fuzzy match institutions with Wikidata results.

    institutions: local institution records (need at least a "name" key)
    wikidata_results: QID -> record, as returned by query_wikidata_institutions
    threshold: minimum similarity ratio (0-1) to accept a match

    Returns: List of (institution_idx, qid, confidence_score, wd_data)
    """
    matches = []
    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        if not inst_name:
            continue
        # Skip if already has a real (non-synthetic) Wikidata ID
        if any(_is_real_wikidata_id(id_obj) for id_obj in inst.get("identifiers", [])):
            continue
        # Find the best-scoring, type-compatible candidate
        best_score = 0.0
        best_qid = None
        best_data = None
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            if not wd_name:
                continue
            # Avoid museum/library/archive cross-kind matches
            if not institution_type_compatible(inst_name, wd_data.get("type", "")):
                continue
            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score, best_qid, best_data = score, qid, wd_data
        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))
    return matches
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich a local institution record with Wikidata data, in place.

    Adds or replaces the Wikidata QID (synthetic placeholders get replaced),
    copies identifiers we don't have yet (ISIL/VIAF/Website), fills in the
    founding date and missing first-location coordinates, and notes the
    enrichment in the provenance block. Returns True if anything changed.
    """
    changed = False
    if not inst.get("identifiers"):
        inst["identifiers"] = []
    ids = inst["identifiers"]
    known_schemes = {entry.get("identifier_scheme", "") for entry in ids if isinstance(entry, dict)}

    qid = wd_data["qid"]
    wikidata_entry = {
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    }
    # Replace an existing (possibly synthetic) Wikidata ID, or append a new one.
    existing_pos = next(
        (pos for pos, entry in enumerate(ids)
         if isinstance(entry, dict) and entry.get("identifier_scheme") == "Wikidata"),
        None,
    )
    if existing_pos is None:
        ids.append(wikidata_entry)
        changed = True
    elif ids[existing_pos].get("identifier_value", "") != qid:
        ids[existing_pos] = wikidata_entry
        changed = True

    # Copy over identifier schemes we don't already have.
    for scheme, value in wd_data.get("identifiers", {}).items():
        if scheme in known_schemes:
            continue
        entry = {
            "identifier_scheme": scheme,
            "identifier_value": value
        }
        if scheme == "VIAF":
            entry["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            entry["identifier_url"] = value
        ids.append(entry)
        changed = True

    # Founding date: fill only when absent locally.
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        changed = True

    # Coordinates: fill the first location only when it has none.
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first = locations[0]
            if isinstance(first, dict) and first.get("latitude") is None:
                first["latitude"] = wd_data["latitude"]
                first["longitude"] = wd_data["longitude"]
                changed = True

    # Record how the enrichment happened.
    if changed:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            method = prov.get("extraction_method", "")
            prov["extraction_method"] = (
                f"{method} + Wikidata enrichment (fuzzy name match)"
                if method else "Wikidata enrichment (fuzzy name match)"
            )
    return changed
def _primary_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the institution's first location dict, or {} if there is none.

    Guards against both a missing 'locations' key AND an empty list: the
    previous inline form inst.get('locations', [{}])[0] raised IndexError
    when 'locations' was present but empty (the default only applies when
    the key is missing entirely).
    """
    locations = inst.get('locations') or [{}]
    first = locations[0]
    return first if isinstance(first, dict) else {}


def process_country(
    institutions: list[dict[str, Any]],
    country_code: str,
    sparql: SPARQLWrapper
) -> tuple[int, int]:
    """
    Process a single country's institutions.

    Filters the dataset down to *country_code*, queries Wikidata for that
    country's museums/libraries/archives, fuzzy matches names and applies
    every match above the 0.85 confidence threshold, mutating *institutions*
    in place.

    Returns: (institutions_without_wikidata, enriched_count)
    """
    country_info = COUNTRIES[country_code]
    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")

    # Filter institutions for this country
    country_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if _primary_location(inst).get('country') == country_code
    ]
    print(f"📊 Found {len(country_institutions_idx):,} {country_info['name']} institutions")

    def _has_real_wikidata(inst: dict[str, Any]) -> bool:
        """True if the record holds a well-formed, non-synthetic Wikidata QID."""
        for id_obj in inst.get("identifiers", []):
            if not isinstance(id_obj, dict):
                continue
            if id_obj.get("identifier_scheme") != "Wikidata":
                continue
            value = id_obj.get("identifier_value", "")
            if not value.startswith("Q"):
                continue
            try:
                # Synthetic placeholder IDs use huge Q-numbers.
                if int(value[1:]) < 100000000:
                    return True
            except ValueError:
                # Malformed QID (e.g. "Q" or "Qabc") — previously crashed int().
                continue
        return False

    # Count those without a real Wikidata ID
    without_wikidata = [
        idx for idx in country_institutions_idx
        if not _has_real_wikidata(institutions[idx])
    ]
    with_wikidata = len(country_institutions_idx) - len(without_wikidata)
    current_coverage = with_wikidata / len(country_institutions_idx) * 100 if country_institutions_idx else 0
    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {len(without_wikidata):,}\n")
    if not without_wikidata:
        print("✨ All institutions already have Wikidata IDs!")
        return 0, 0

    # Query Wikidata
    print(f"🔍 Querying Wikidata for {country_info['name']} museums, libraries, and archives...")
    print(" (This may take 30-60 seconds)\n")
    institution_types = ["Q33506", "Q7075", "Q166118"]  # museum, library, archive
    wikidata_results = query_wikidata_institutions(sparql, country_info['qid'], institution_types)
    print(f"✅ Found {len(wikidata_results):,} {country_info['name']} institutions in Wikidata\n")
    if not wikidata_results:
        print("⚠️ No Wikidata results, skipping fuzzy matching")
        return len(without_wikidata), 0

    # Fuzzy match
    print("🔗 Fuzzy matching names (threshold: 0.85)...\n")
    country_insts = [institutions[idx] for idx in without_wikidata]
    matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=0.85)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if matches:
        # Show sample matches before applying
        print(f"{'='*80}")
        print(f"📋 SAMPLE MATCHES (Top 5)")
        print(f"{'='*80}")
        for i, (local_idx, qid, score, wd_data) in enumerate(matches[:5]):
            inst = country_insts[local_idx]
            print(f"\n{i+1}. Confidence: {score:.3f}")
            print(f" Local: {inst.get('name')} ({_primary_location(inst).get('city', 'Unknown')})")
            print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
            print(f" Type: {wd_data.get('type', 'Unknown')}")
            if "ISIL" in wd_data.get("identifiers", {}):
                print(f" ISIL: {wd_data['identifiers']['ISIL']}")
        print(f"\n{'='*80}\n")

        # Apply all matches (local match index maps back to the global index)
        print("✅ Applying all matches...\n")
        enriched_count = 0
        for local_idx, qid, score, wd_data in matches:
            global_idx = without_wikidata[local_idx]
            if enrich_institution(institutions[global_idx], wd_data):
                enriched_count += 1
        new_coverage = (with_wikidata + enriched_count) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
        print(f"✨ Enriched {enriched_count:,} institutions")
        print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%\n")
        return len(without_wikidata), enriched_count
    else:
        print("❌ No matches found. Try lowering threshold.\n")
        return len(without_wikidata), 0
def main():
    """Run the Latin-America fuzzy-matching enrichment pipeline end to end.

    Loads the Wikidata-enriched dataset, processes BR/MX/CL in turn (with a
    pause between countries to respect Wikidata rate limits), then writes the
    enriched dataset with a commented YAML header and prints a final report.
    """
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_latam_enriched.yaml"

    print("="*80)
    print("🌎 LATIN AMERICA INSTITUTIONS FUZZY MATCHING")
    print("="*80)
    print(f"\n📖 Loading dataset...\n")
    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s")

    # Setup SPARQL client (POST avoids URL-length limits on large queries)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    # Process each country
    total_without_wikidata = 0
    total_enriched = 0
    country_codes = ['BR', 'MX', 'CL']
    for i, country_code in enumerate(country_codes):
        without, enriched = process_country(institutions, country_code, sparql)
        total_without_wikidata += without
        total_enriched += enriched
        # Rate limiting - be nice to Wikidata. Skip the pause after the last
        # country (positional check instead of hard-coding 'CL', so the list
        # can be extended without silently reintroducing a trailing sleep).
        if i < len(country_codes) - 1:
            print("⏸️ Waiting 5 seconds (Wikidata rate limiting)...\n")
            time.sleep(5)

    # Write output
    print("="*80)
    print("💾 WRITING ENRICHED DATASET")
    print("="*80 + "\n")
    # Count BR/MX/CL institutions safely: an empty 'locations' list must not
    # raise IndexError (inst.get('locations', [{}])[0] would, since the
    # default only applies when the key is missing entirely).
    latam_count = sum(
        1 for inst in institutions
        if (inst.get('locations') or [{}])[0].get('country') in ('BR', 'MX', 'CL')
    )
    header = f"""---
# Global Heritage Institutions - Latin America Fuzzy Match Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Latin America institutions processed: {latam_count:,}
# New Latin America matches: {total_enriched:,}
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 FINAL ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Total institutions enriched: {total_enriched:,}")
    print(f" Latin America institutions without Wikidata: {total_without_wikidata - total_enriched:,}")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()