glam/scripts/enrich_low_coverage_countries_fuzzy.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

472 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Enrich low-coverage countries using fuzzy name matching with Wikidata.
Target countries with <30% Wikidata coverage:
- Brazil (BR): 1.0%
- Belgium (BE): 0.0%
- Italy (IT): 0.0%
- Thailand (TH): <30%
- Norway (NO): <30%
- Vietnam (VN): <30%
Strategy:
1. Query Wikidata for museums/libraries/archives in target country
2. Fuzzy match institution names (normalized, threshold 0.85)
3. Verify type compatibility (don't match museum → archive)
4. Enrich with Wikidata IDs, VIAF, founding dates, websites
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Reduce an institution name to a canonical form for fuzzy comparison.

    Lowercases, strips leading/trailing GLAM-type words (multilingual),
    drops legal/organizational markers and punctuation, then collapses
    whitespace runs into single spaces.
    """
    cleaned = name.lower()
    # Strip a leading institution-type word ("Museum ...", "Biblioteca ...").
    cleaned = re.sub(r'^(stichting|gemeentearchief|regionaal archief|museum|museu|museo|biblioteca|bibliotheek|library|archive|archief|archivo)[\s\-]+', '', cleaned)
    # Strip the same kind of word when it trails the name ("... Museum").
    cleaned = re.sub(r'[\s\-]+(archief|museum|museo|museu|bibliotheek|biblioteca|library|archive|archivo)$', '', cleaned)
    # Drop legal-entity markers (SA, NV, vzw, Ltd, GmbH, ...).
    cleaned = re.sub(r'\b(s\.a\.|sa|nv|bv|vzw|asbl|inc|ltd|gmbh)\b', '', cleaned)
    # Replace remaining punctuation with spaces, then normalize whitespace.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return ' '.join(cleaned.split())
def similarity_score(name1: str, name2: str) -> float:
    """Return how alike two institution names are, in [0, 1], after normalization."""
    left, right = normalize_name(name1), normalize_name(name2)
    return SequenceMatcher(None, left, right).ratio()
def institution_type_compatible(inst_type: str, inst_name: str, wd_name: str, wd_desc: str) -> bool:
    """
    Check whether a local institution and a Wikidata candidate have compatible types.

    Prevents cross-type matches such as museum -> archive or library -> museum:
    whenever the local record's name/type text signals a museum, archive, or
    library, the Wikidata name/description text must signal the same kind.
    """
    # Multilingual keyword lists per GLAM kind.
    kind_keywords = {
        'museum': ['museum', 'museo', 'museu', 'muzeum'],
        'archive': ['archief', 'archive', 'archivo', 'archivio'],
        'library': ['bibliotheek', 'biblioteca', 'library', 'bibliothèque', 'bibliothek'],
    }
    local_text = (inst_name + ' ' + inst_type).lower()
    wd_text = (wd_name + ' ' + wd_desc).lower()

    def signals(text: str, kind: str) -> bool:
        # True when any keyword for this kind appears in the text.
        return any(kw in text for kw in kind_keywords[kind])

    # A kind asserted locally must also appear on the Wikidata side.
    for kind in ('museum', 'archive', 'library'):
        if signals(local_text, kind) and not signals(wd_text, kind):
            return False
    return True
def query_country_institutions(sparql: SPARQLWrapper, country_code: str) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for GLAM institutions in a specific country.

    Args:
        sparql: Configured SPARQLWrapper pointed at the Wikidata endpoint
            (JSON return format, POST method — set up by the caller).
        country_code: ISO 3166-1 alpha-2 code; must be in the mapping below.

    Returns:
        Dict keyed by Wikidata QID. Each value carries name, description,
        type label, an "identifiers" dict (ISIL/VIAF/Website when present),
        plus optional founding_date and latitude/longitude. Returns an empty
        dict for an unmapped country code or on any query failure.
    """
    # Map ISO 3166-1 alpha-2 to Wikidata QIDs
    country_qids = {
        "BR": "Q155",  # Brazil
        "BE": "Q31",   # Belgium
        "IT": "Q38",   # Italy
        "NO": "Q20",   # Norway
        "TH": "Q869",  # Thailand
        "VN": "Q881",  # Vietnam
        "MX": "Q96",   # Mexico
        "CL": "Q298",  # Chile
    }
    qid = country_qids.get(country_code)
    if not qid:
        print(f" ⚠️ No Wikidata QID mapping for country code: {country_code}")
        return {}
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      # Institution is in target country
      ?item wdt:P17 wd:{qid} .
      # Institution is a GLAM type
      VALUES ?type {{
        wd:Q7075      # library
        wd:Q166118    # archive
        wd:Q33506     # museum
        wd:Q1007870   # art gallery
        wd:Q28564     # public library
        wd:Q11396180  # academic library
        wd:Q207694    # art museum
        wd:Q2772772   # history museum
      }}
      ?item wdt:P31 ?type .
      # Optional enrichment data
      OPTIONAL {{ ?item wdt:P791 ?isil . }}      # ISIL code
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}      # VIAF ID
      OPTIONAL {{ ?item wdt:P625 ?coords . }}    # Coordinates
      OPTIONAL {{ ?item wdt:P856 ?website . }}   # Official website
      OPTIONAL {{ ?item wdt:P571 ?inception . }} # Founding date
      # Get labels (adjust languages by region)
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en,pt,es,nl,fr,it,no,th,vi" .
      }}
    }}
    LIMIT 2000
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        # Parse result rows into a dict keyed by QID; later rows for the same
        # item (cartesian product of OPTIONALs) simply overwrite earlier ones.
        results: dict[str, dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            # FIX: use a distinct name (was `qid`) so the country QID bound
            # above is no longer shadowed inside this loop.
            item_qid = item_uri.split("/")[-1] if item_uri else None
            if not item_qid or not item_qid.startswith("Q"):
                continue
            result = {
                "qid": item_qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "identifiers": {}
            }
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT literal, e.g. "Point(lon lat)" — note lon comes first.
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[item_qid] = result
        return results
    except Exception as e:
        # Best-effort script: report the failure and let the caller skip
        # this country rather than aborting the whole run.
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}
def fuzzy_match_institutions(
institutions: list[dict[str, Any]],
wikidata_results: dict[str, dict[str, Any]],
threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
"""
Fuzzy match institutions with Wikidata results.
Returns: List of (institution_idx, qid, confidence_score, wd_data)
"""
matches = []
for idx, inst in enumerate(institutions):
inst_name = inst.get("name", "")
inst_type = inst.get("institution_type", "")
if not inst_name:
continue
# Skip if already has real Wikidata ID
has_wikidata = any(
id_obj.get("identifier_scheme") == "Wikidata" and
id_obj.get("identifier_value", "").startswith("Q") and
int(id_obj.get("identifier_value", "Q999999999")[1:]) < 100000000
for id_obj in inst.get("identifiers", [])
)
if has_wikidata:
continue
# Find best match
best_score = 0.0
best_qid = None
best_data = None
for qid, wd_data in wikidata_results.items():
wd_name = wd_data.get("name", "")
wd_desc = wd_data.get("description", "")
if not wd_name:
continue
# Check type compatibility
if not institution_type_compatible(inst_type, inst_name, wd_name, wd_desc):
continue
score = similarity_score(inst_name, wd_name)
if score > best_score:
best_score = score
best_qid = qid
best_data = wd_data
# Only include matches above threshold
if best_score >= threshold and best_qid and best_data:
matches.append((idx, best_qid, best_score, best_data))
return matches
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution record in place with Wikidata data.

    Adds a Wikidata identifier (plus ISIL/VIAF/Website when present and not
    already recorded), fills in a missing founding date, and completes the
    first location's coordinates when it has none. When anything changed,
    appends "Wikidata fuzzy enrichment" to the provenance extraction_method.

    Args:
        inst: Local institution record (mutated in place).
        wd_data: Wikidata match as produced by query_country_institutions.

    Returns:
        True if any field was added or changed.
    """
    enriched = False
    # Ensure an identifiers list exists to append into.
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    # Add other identifiers (ISIL / VIAF / Website) not already present.
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True
    # Add founding date only when the record has none.
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True
    # Add coordinates to the first location if it is missing a latitude.
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True
    # Update provenance
    if enriched:
        # BUG FIX: previously `inst.get("provenance", {})` produced a detached
        # dict when the key was missing — the update was silently discarded.
        # setdefault attaches the new dict to the record.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata fuzzy enrichment"
            else:
                prov["extraction_method"] = "Wikidata fuzzy enrichment"
    return enriched
def main() -> None:
    """Entry point: fuzzy-enrich low-coverage countries and rewrite the dataset.

    Loads the global institutions YAML, queries Wikidata per target country,
    fuzzy-matches unenriched records, applies enrichments in place, and
    rewrites the same YAML file when at least one record changed.
    """
    base_dir = Path(__file__).parent.parent
    # NOTE(review): input_file and output_file are the same path — the
    # dataset is enriched and rewritten in place.
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    # Target countries with low coverage
    target_countries = ["BR", "BE", "IT", "NO", "TH", "VN"]
    print("="*80)
    print("🌍 LOW-COVERAGE COUNTRIES FUZZY MATCHING")
    print("="*80)
    print(f"\n🎯 Target countries: {', '.join(target_countries)}\n")
    print(f"📖 Loading dataset...\n")
    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        # Expected to parse to a list of institution dicts.
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")
    # Setup SPARQL client for the Wikidata endpoint.
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")
    total_enriched = 0
    # Process each country independently; failures/empties just skip ahead.
    for country_code in target_countries:
        print("="*80)
        print(f"🌍 Processing {country_code}")
        print("="*80)
        # Indices (into `institutions`) of records located in this country.
        country_institutions_idx = [
            idx for idx, inst in enumerate(institutions)
            if any(loc.get('country') == country_code for loc in inst.get('locations', []))
        ]
        if not country_institutions_idx:
            print(f" ⚠️ No institutions found for {country_code}\n")
            continue
        print(f" Found {len(country_institutions_idx):,} institutions")
        # Indices of records lacking a real Wikidata QID (numeric part below
        # 100M; larger values are treated as synthetic placeholders).
        country_without_wikidata = [
            idx for idx in country_institutions_idx
            if not any(
                id_obj.get("identifier_scheme") == "Wikidata" and
                id_obj.get("identifier_value", "").startswith("Q") and
                int(id_obj.get("identifier_value", "Q999999999")[1:]) < 100000000
                for id_obj in institutions[idx].get("identifiers", [])
            )
        ]
        current_coverage = (len(country_institutions_idx) - len(country_without_wikidata)) / len(country_institutions_idx) * 100
        print(f" Current Wikidata coverage: {current_coverage:.1f}%")
        print(f" Institutions needing enrichment: {len(country_without_wikidata):,}\n")
        # Fetch GLAM candidates for this country from Wikidata (network call).
        print(f"🔍 Querying Wikidata for {country_code} institutions...")
        print(" (This may take 30-60 seconds)\n")
        wikidata_results = query_country_institutions(sparql, country_code)
        print(f"✅ Found {len(wikidata_results):,} {country_code} institutions in Wikidata\n")
        if not wikidata_results:
            print(f" ⚠️ No Wikidata results for {country_code}, skipping\n")
            continue
        # Fuzzy-match unenriched local records against the candidates.
        print("🔗 Fuzzy matching names (threshold: 0.85)...\n")
        country_insts = [institutions[idx] for idx in country_without_wikidata]
        matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=0.85)
        print(f"✨ Found {len(matches):,} high-confidence matches\n")
        # Show a few matches for manual eyeballing before applying.
        if matches:
            print(f"📋 Sample matches (showing first 5):")
            for i, (local_idx, qid, score, wd_data) in enumerate(matches[:5]):
                inst = country_insts[local_idx]
                print(f"\n{i+1}. Confidence: {score:.3f}")
                print(f" Local: {inst.get('name')}")
                print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
                print(f" Type: {wd_data.get('type', 'Unknown')}")
            print(f"\n✅ Applying {len(matches)} matches for {country_code}...\n")
            country_enriched = 0
            for local_idx, qid, score, wd_data in matches:
                # local_idx indexes country_insts; map back to the global list.
                global_idx = country_without_wikidata[local_idx]
                if enrich_institution(institutions[global_idx], wd_data):
                    country_enriched += 1
            print(f"✨ Enriched {country_enriched:,} institutions")
            # Projected coverage assuming each enrichment added a Wikidata ID.
            new_coverage = (len(country_institutions_idx) - len(country_without_wikidata) + country_enriched) / len(country_institutions_idx) * 100
            print(f" Coverage: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}%)\n")
            total_enriched += country_enriched
        else:
            print(f" ❌ No matches found for {country_code}\n")
        # Rate limiting between countries (be polite to the public endpoint).
        time.sleep(2.0)
    # Write output only when something actually changed.
    if total_enriched > 0:
        print("="*80)
        print("💾 Writing enriched dataset...")
        print("="*80 + "\n")
        with open(output_file, 'w', encoding='utf-8') as f:
            # YAML document marker plus comment header describing this run.
            header = f"""---
# Global Heritage Institutions - Low-Coverage Countries Fuzzy Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Countries processed: {', '.join(target_countries)}
# Total new matches: {total_enriched:,}
"""
            f.write(header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
        print(f"✅ Complete! Output: {output_file}\n")
        # Final report
        print("="*80)
        print("📊 ENRICHMENT REPORT")
        print("="*80)
        print(f"\n✨ Total institutions enriched: {total_enriched:,}")
        print(f"⏱️ Processing time: {(time.time()-start_time)/60:.1f} minutes")
        print("="*80 + "\n")
    else:
        print("❌ No institutions enriched\n")
# Run only when executed as a script (keeps the module importable for tests).
if __name__ == "__main__":
    main()