# glam/scripts/enrich_dutch_institutions_fuzzy.py

#!/usr/bin/env python3
"""
Enrich Dutch institutions using fuzzy name matching in Wikidata.

This script addresses the low Dutch coverage (4.8%) by querying Wikidata for
Dutch heritage institutions using name-based searches rather than ISIL codes.

Strategy:
1. Find Dutch institutions without Wikidata IDs
2. Query Wikidata for museums/archives/libraries in Netherlands
3. Fuzzy match names (normalized)
4. Automatically apply matches scoring at least 0.85; the first ten are printed for a manual spot-check
"""

import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Steps, in order:
      1. lowercase;
      2. replace punctuation with spaces and collapse runs of whitespace;
      3. strip one generic leading term (stichting, gemeentearchief, ...);
      4. strip one generic trailing term (archief, museum, ...).

    Punctuation and whitespace are cleaned *before* the generic terms are
    stripped, so inputs such as "  Stichting Foo" or "Foo Archief." are
    recognised as well. A name that consists of a single generic term
    (e.g. "Museum") is kept, because both patterns require a neighbouring word.

    Args:
        name: Raw institution name.

    Returns:
        The normalized name; an empty string if nothing but punctuation and
        whitespace was given.
    """
    # Lowercase, then turn every non-word, non-space character into a space
    name = re.sub(r'[^\w\s]', ' ', name.lower())

    # Collapse whitespace so the ^ / $ anchors below see the real first/last word
    name = ' '.join(name.split())

    # Remove common prefixes/suffixes
    name = re.sub(r'^(stichting|gemeentearchief|regionaal archief|museum)\s+', '', name)
    name = re.sub(r'\s+(archief|museum|bibliotheek|library|archive)$', '', name)

    return name


def similarity_score(name1: str, name2: str) -> float:
    """Calculate the similarity between two institution names.

    Both names are passed through normalize_name() and compared with
    difflib.SequenceMatcher.

    Args:
        name1: First raw name.
        name2: Second raw name.

    Returns:
        A ratio between 0.0 (nothing in common) and 1.0 (identical after
        normalization). If either name normalizes to an empty string the
        score is 0.0.
    """
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)

    # SequenceMatcher.ratio() reports 1.0 for two empty sequences, which would
    # turn two names made only of punctuation into a "perfect" match.
    if not norm1 or not norm2:
        return 0.0

    return SequenceMatcher(None, norm1, norm2).ratio()


# Date part of an xsd:dateTime value, e.g. "1885-01-01" or "-0500-01-01".
_ISO_DATE_RE = re.compile(r'^-?\d{4,}-\d{2}-\d{2}$')


def _binding_value(binding: dict[str, Any], key: str) -> str:
    """Return the string bound to *key* in one SPARQL JSON row, or "" if unbound."""
    cell = binding.get(key)
    if not isinstance(cell, dict):
        return ""
    value = cell.get("value", "")
    return value if isinstance(value, str) else ""


def _parse_wkt_point(wkt: str) -> Optional[tuple[float, float]]:
    """Return (latitude, longitude) from a "Point(lon lat)" literal, or None if malformed."""
    if not (wkt.startswith("Point(") and wkt.endswith(")")):
        return None
    parts = wkt[6:-1].split()
    if len(parts) != 2:
        return None
    try:
        lon, lat = float(parts[0]), float(parts[1])
    except ValueError:
        return None
    return lat, lon


def _parse_binding(binding: dict[str, Any]) -> Optional[dict[str, Any]]:
    """Convert one SPARQL JSON row into a result record, or None if it carries no QID."""
    item_uri = _binding_value(binding, "item")
    qid = item_uri.split("/")[-1] if item_uri else ""
    if not qid.startswith("Q"):
        return None

    result: dict[str, Any] = {
        "qid": qid,
        "name": _binding_value(binding, "itemLabel"),
        "description": _binding_value(binding, "itemDescription"),
        "type": _binding_value(binding, "typeLabel"),
        "identifiers": {},
    }

    for variable, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
        value = _binding_value(binding, variable)
        if value:
            result["identifiers"][scheme] = value

    # Keep only values whose date part really looks like a date; anything else
    # (for example a placeholder IRI bound for an "unknown value" statement)
    # would otherwise end up verbatim in founding_date.
    inception = _binding_value(binding, "inception").split("T")[0]
    if _ISO_DATE_RE.match(inception):
        result["founding_date"] = inception

    coords = _parse_wkt_point(_binding_value(binding, "coords"))
    if coords is not None:
        result["latitude"], result["longitude"] = coords

    return result


def query_dutch_institutions(sparql: "SPARQLWrapper", institution_types: list[str]) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for Dutch heritage institutions.

    institution_types: List of Wikidata QIDs for institution types
        Q33506 - museum
        Q7075 - library
        Q166118 - archive

    Args:
        sparql: A configured SPARQLWrapper-like object; only setQuery() and
            query().convert() are used.
        institution_types: QIDs used for the "instance of" (P31) filter.

    Returns:
        Dict keyed by QID. Each record has "qid", "name", "description", "type"
        and "identifiers" (ISIL / VIAF / Website when bound), plus
        "founding_date", "latitude" and "longitude" when the row provides
        well-formed values. If the same QID appears in more than one row, the
        last row wins. An empty dict is returned when no types are given or
        when the query itself fails (the error and traceback are printed).
    """
    # An empty VALUES clause cannot match anything; skip the network round trip.
    if not institution_types:
        return {}

    types_values = " ".join(f"wd:{qid}" for qid in institution_types)

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {{
      VALUES ?type {{ {types_values} }}

      ?item wdt:P31 ?type .           # instance of museum/library/archive
      ?item wdt:P17 wd:Q55 .          # country: Netherlands

      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    LIMIT 2000
    """

    sparql.setQuery(query)

    # Only the remote call is best-effort; row parsing below is defensive on
    # its own, so one odd row can no longer discard the whole result set.
    try:
        raw_results = sparql.query().convert()
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}

    bindings = []
    if isinstance(raw_results, dict):
        bindings = (raw_results.get("results") or {}).get("bindings") or []

    # Parse results into dict keyed by QID
    results: dict[str, dict[str, Any]] = {}
    for binding in bindings:
        if not isinstance(binding, dict):
            continue
        parsed = _parse_binding(binding)
        if parsed is not None:
            results[parsed["qid"]] = parsed

    return results


def institution_type_compatible(inst_name: str, wd_type: str) -> bool:
    """Check if institution types are compatible (avoid museum/archive mismatches).

    The kind of each side (museum / archive / library) is detected by keyword
    substring search, case-insensitively, in the local institution name and in
    the Wikidata type label.

    Args:
        inst_name: Name of the local institution.
        wd_type: Type label of the Wikidata candidate; may be empty.

    Returns:
        True when either side reveals no kind at all (nothing to contradict),
        or when every kind found in the institution name is also found in the
        Wikidata type. False otherwise.
    """
    # Define type keywords
    type_keywords = {
        "museum": ('museum', 'museo', 'museu'),
        "archive": ('archief', 'archive', 'archivo'),
        "library": ('bibliotheek', 'library', 'biblioteca'),
    }

    def detect_kinds(text: str) -> set[str]:
        lowered = text.lower()
        return {
            kind
            for kind, keywords in type_keywords.items()
            if any(kw in lowered for kw in keywords)
        }

    inst_kinds = detect_kinds(inst_name)
    wd_kinds = detect_kinds(wd_type)

    # Only when *both* sides have an explicit type can they disagree. An empty
    # or unrecognised Wikidata type must not veto every typed institution name.
    if not inst_kinds or not wd_kinds:
        return True

    # If both have explicit types, they must match
    return inst_kinds <= wd_kinds


# Wikidata identifier values numbered at or above this cutoff are treated as
# "no Wikidata ID". NOTE(review): presumably these are placeholder IDs —
# confirm against the code that produces the dataset.
_WIKIDATA_QID_CUTOFF = 100000000


def _has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """Return True if *inst* has a "Wikidata" identifier "Q<n>" with n below the cutoff.

    Tolerates a missing or null identifier list, non-dict entries, non-string
    values and non-numeric suffixes; none of these count as a Wikidata ID.
    """
    for id_obj in inst.get("identifiers") or []:
        if not isinstance(id_obj, dict) or id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value")
        if not isinstance(value, str) or not value.startswith("Q"):
            continue
        try:
            number = int(value[1:])
        except ValueError:
            continue
        if number < _WIKIDATA_QID_CUTOFF:
            return True
    return False


def fuzzy_match_dutch_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
    """
    Fuzzy match Dutch institutions with Wikidata results.

    Institutions without a name, or that already carry a Wikidata ID, are
    skipped. For every other institution the best-scoring type-compatible
    Wikidata candidate is selected (the first one seen wins a tie) and kept
    if its score reaches *threshold*.

    Args:
        institutions: Local institution records.
        wikidata_results: Candidates keyed by QID, each with at least "name"
            and optionally "type".
        threshold: Minimum similarity (inclusive) for a match to be reported.

    Returns: List of (institution_idx, qid, confidence_score, wd_data), where
        institution_idx is the position in *institutions*.
    """
    matches = []

    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        if not inst_name:
            continue

        # Skip if already has Wikidata ID
        if _has_real_wikidata_id(inst):
            continue

        # Find best match
        best_score = 0.0
        best_qid = None
        best_data = None

        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue

            # Check type compatibility
            if not institution_type_compatible(inst_name, wd_type):
                continue

            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score = score
                best_qid = qid
                best_data = wd_data

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))

    return matches


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution with Wikidata data. Returns True if enriched.

    The record is modified in place. Existing values are never overwritten:
      * identifiers are added only for schemes the record does not have yet;
      * founding_date is filled in only when empty;
      * coordinates are written to the first location only when it has no
        latitude.
    When anything was added, the provenance "extraction_method" is extended
    with a note about the fuzzy match; a provenance dict is created if the
    record has none.

    Args:
        inst: Institution record to update.
        wd_data: Wikidata record; must contain "qid", may contain
            "identifiers" (scheme -> value), "founding_date", "latitude" and
            "longitude".

    Returns:
        True if at least one field was added, False otherwise.
    """
    enriched = False

    # A missing, null or empty identifier list is replaced by a fresh list
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    # NOTE(review): fuzzy_match_dutch_institutions treats "Wikidata" values
    # numbered 100000000 or higher as "no Wikidata ID", yet any existing
    # "Wikidata" entry prevents the matched QID from being added here —
    # confirm whether such entries should be replaced.
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }

            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value

            identifiers_list.append(id_obj)
            enriched = True

    # Add founding date
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates if missing
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Update provenance
    if enriched:
        prov = inst.get("provenance")
        if prov is None:
            # Attach the new dict to the record; writing to a detached default
            # would silently drop the provenance note.
            prov = inst["provenance"] = {}
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata enrichment (fuzzy name match)"
            else:
                prov["extraction_method"] = "Wikidata enrichment (fuzzy name match)"

    return enriched


def _first_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the institution's first location record, or {} if it has none."""
    locations = inst.get("locations")
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        return locations[0]
    return {}


def _has_wikidata_qid(inst: dict[str, Any]) -> bool:
    """Return True if *inst* has a "Wikidata" identifier "Q<n>" with n < 100000000.

    Mirrors the check used during matching, but tolerates a null identifier
    list, non-dict entries and non-numeric suffixes instead of raising.
    """
    for id_obj in inst.get("identifiers") or []:
        if not isinstance(id_obj, dict) or id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value")
        if not isinstance(value, str) or not value.startswith("Q"):
            continue
        try:
            number = int(value[1:])
        except ValueError:
            continue
        if number < 100000000:
            return True
    return False


def main():
    """Load the dataset, fuzzy-match Dutch institutions against Wikidata and write the result.

    Workflow:
      1. load the enriched global YAML dataset;
      2. select institutions whose first location has country "NL" and that
         have no Wikidata ID yet;
      3. query Wikidata for Dutch museums, libraries and archives;
      4. fuzzy-match names and print up to ten matches for inspection;
      5. apply every match automatically, write the enriched dataset to a new
         YAML file and print a coverage report.

    When no match reaches the threshold nothing is written.
    """
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_dutch_enriched.yaml"
    threshold = 0.85

    print("="*80)
    print("🇳🇱 DUTCH INSTITUTIONS FUZZY MATCHING")
    print("="*80)
    print("\n📖 Loading dataset...\n")

    start_time = time.time()

    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no institutions"
        institutions = yaml.safe_load(f) or []

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")

    # Filter Dutch institutions (records without any location are not Dutch)
    dutch_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if isinstance(inst, dict) and _first_location(inst).get('country') == 'NL'
    ]
    total_dutch = len(dutch_institutions_idx)

    print(f"🇳🇱 Found {total_dutch:,} Dutch institutions\n")

    # Count those without Wikidata
    dutch_without_wikidata = [
        idx for idx in dutch_institutions_idx
        if not _has_wikidata_qid(institutions[idx])
    ]
    with_wikidata_before = total_dutch - len(dutch_without_wikidata)

    print(f"❓ Dutch institutions without Wikidata: {len(dutch_without_wikidata):,}\n")

    # Setup SPARQL
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    # Query Wikidata for Dutch institutions
    print("🔍 Querying Wikidata for Dutch museums, libraries, and archives...")
    print("   (This may take 30-60 seconds)\n")

    institution_types = ["Q33506", "Q7075", "Q166118"]  # museum, library, archive
    wikidata_results = query_dutch_institutions(sparql, institution_types)

    print(f"✅ Found {len(wikidata_results):,} Dutch institutions in Wikidata\n")

    # Fuzzy match
    print(f"🔗 Fuzzy matching names (threshold: {threshold})...\n")

    dutch_insts = [institutions[idx] for idx in dutch_without_wikidata]
    matches = fuzzy_match_dutch_institutions(dutch_insts, wikidata_results, threshold=threshold)

    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if not matches:
        print("❌ No matches found. Try lowering threshold.\n")
        return

    # Show sample matches for verification (the first ten, in dataset order)
    print("="*80)
    print("📋 SAMPLE MATCHES (Top 10)")
    print("="*80)
    for i, (local_idx, qid, score, wd_data) in enumerate(matches[:10]):
        inst = dutch_insts[local_idx]
        print(f"\n{i+1}. Confidence: {score:.3f}")
        print(f"   Local:    {inst.get('name')} ({_first_location(inst).get('city', 'Unknown')})")
        print(f"   Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f"   Type:     {wd_data.get('type', 'Unknown')}")
        if "ISIL" in wd_data.get("identifiers", {}):
            print(f"   ISIL:     {wd_data['identifiers']['ISIL']}")

    print("\n" + "="*80)
    print("\n⚠️  AUTOMATIC APPLICATION")
    print("="*80)
    print(f"\nApplying all high-confidence matches (>= {threshold} similarity)...")

    # Apply all matches
    print("\n✅ Applying all matches...\n")
    enriched_count = 0

    for local_idx, qid, score, wd_data in matches:
        # Matches index into dutch_insts; map back to the full dataset
        global_idx = dutch_without_wikidata[local_idx]
        if enrich_institution(institutions[global_idx], wd_data):
            enriched_count += 1

    print(f"✨ Enriched {enriched_count:,} institutions\n")

    # Write output
    print("💾 Writing enriched dataset...")

    header = f"""---
# Global Heritage Institutions - Dutch Fuzzy Match Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Dutch institutions: {len(dutch_institutions_idx):,}
# New Dutch matches: {enriched_count:,}

"""

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

    print(f"✅ Complete! Output: {output_file}\n")

    # Coverage is recounted from the data rather than derived from a
    # hard-coded baseline, so the report stays correct for any input file.
    with_wikidata_after = sum(
        1 for idx in dutch_institutions_idx if _has_wikidata_qid(institutions[idx])
    )
    # total_dutch > 0 here: a match implies at least one Dutch institution
    before_pct = with_wikidata_before / total_dutch * 100
    after_pct = with_wikidata_after / total_dutch * 100

    # Final report
    print("="*80)
    print("📊 ENRICHMENT REPORT")
    print("="*80)
    print("\n✨ Results:")
    print(f"   Dutch institutions enriched: {enriched_count:,}")
    print(f"   New Dutch Wikidata coverage: {after_pct:.1f}%")
    print(f"   (was {before_pct:.1f}%, now {after_pct:.1f}%)")
    print(f"\n⏱️  Processing time: {(time.time()-start_time)/60:.1f} minutes")
    print("="*80 + "\n")


# Run the enrichment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()