glam/scripts/enrich_phase2_mexico.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

441 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Phase 2 Enrichment: Mexico (MX)
Target: 192 institutions, 17.7% Wikidata coverage → 35%+ (67+ institutions)
Strategy: SPARQL batch query + fuzzy name matching (Spanish normalization)
Based on: Brazil Phase 2 methodology (achieved 32.5% coverage from 13.7%)
GLAM Data Extraction Project - Phase 2: High-Volume Country Enrichment
"""
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching (Spanish + English).

    Lowercases the name, strips leading/trailing institution-type words in
    Spanish and English, drops parenthesised abbreviations and punctuation,
    and collapses runs of whitespace to single spaces.
    """
    cleaned = name.lower()
    # (pattern, replacement) pairs applied in order: type-word prefixes and
    # suffixes are deleted outright; parentheticals and punctuation become a
    # space so the surrounding words stay separated.
    substitutions = (
        (r'^(fundación|museo|biblioteca|archivo|centro|memorial|parque|galería)\s+', ''),
        (r'\s+(museo|biblioteca|archivo|nacional|estatal|municipal|federal|regional|memorial)$', ''),
        (r'^(foundation|museum|library|archive|center|centre|memorial|park|gallery)\s+', ''),
        (r'\s+(museum|library|archive|national|state|federal|regional|municipal|memorial)$', ''),
        (r'\s*\([^)]*\)\s*', ' '),
        (r'[^\w\s]', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Normalize whitespace
    return ' '.join(cleaned.split())
def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1).

    Both names are passed through normalize_name() first, so type words,
    punctuation, and case differences do not depress the ratio.
    """
    return SequenceMatcher(
        None,
        normalize_name(name1),
        normalize_name(name2),
    ).ratio()
def query_wikidata_mexican_institutions(sparql: SPARQLWrapper) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for ALL heritage institutions in Mexico.
    Institution types: museums, libraries, archives, galleries, universities with collections

    Returns a dict keyed by QID; each value holds name/description/type,
    an ``identifiers`` sub-dict (ISIL/VIAF/Website), and optional
    ``founding_date``/``latitude``/``longitude``. Returns {} on any error.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q207694 wd:Q473972 wd:Q641635 }
      ?item wdt:P31/wdt:P279* ?type . # instance of (or subclass of) institution type
      ?item wdt:P17 wd:Q96 . # country = Mexico (Q96)
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,pt" . }
    }
    LIMIT 5000
    """
    sparql.setQuery(query)
    try:
        raw = sparql.query().convert()
        if isinstance(raw, dict):
            bindings = raw.get("results", {}).get("bindings", [])
        else:
            bindings = []
        # Parse results into dict keyed by QID
        parsed: Dict[str, Dict[str, Any]] = {}
        for row in bindings:
            uri = row.get("item", {}).get("value", "")
            qid = uri.split("/")[-1] if uri else None
            if not qid or not qid.startswith("Q"):
                continue
            entry: Dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "type": row.get("typeLabel", {}).get("value", ""),
                "identifiers": {},
            }
            # Optional external identifiers, mapped to their scheme labels.
            for field, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
                if field in row:
                    entry["identifiers"][scheme] = row[field]["value"]
            if "inception" in row:
                # Keep only the date part of the ISO timestamp.
                entry["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                # Wikidata P625 comes back as WKT "Point(lon lat)".
                point = row["coords"]["value"]
                if point.startswith("Point("):
                    lon, lat = point[6:-1].split()
                    entry["latitude"] = float(lat)
                    entry["longitude"] = float(lon)
            parsed[qid] = entry
        return parsed
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}
def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """Check if institution types are compatible (prevent museum → library mismatches).

    An institution counts as a given category if either its explicit type code
    matches or a category keyword appears in its name; a locally categorised
    institution is only compatible with a Wikidata entry of the same category.
    """
    # MIXED, OFFICIAL_INSTITUTION, EDUCATION_PROVIDER can match any type
    if inst_type in ["MIXED", "OFFICIAL_INSTITUTION", "EDUCATION_PROVIDER"]:
        return True
    name_lower = inst_name.lower()
    wd_lower = wd_type.lower()
    # (keywords, explicit type code) per category, Spanish/Portuguese/English/French.
    categories = (
        (['museo', 'museu', 'museum'], "MUSEUM"),
        (['archivo', 'arquivo', 'archive'], "ARCHIVE"),
        (['biblioteca', 'library', 'bibliothèque'], "LIBRARY"),
        (['galería', 'galeria', 'gallery', 'galerie'], "GALLERY"),
    )
    for keywords, type_code in categories:
        local_match = inst_type == type_code or any(kw in name_lower for kw in keywords)
        wd_match = any(kw in wd_lower for kw in keywords)
        # A locally categorised institution must find the same category on the
        # Wikidata side, otherwise reject the candidate.
        if local_match and not wd_match:
            return False
    return True
def fuzzy_match_institutions(
    institutions: List[Dict[str, Any]],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> List[Tuple[int, str, float, Dict[str, Any]]]:
    """
    Fuzzy match Mexican institutions with Wikidata results.
    Returns: List of (institution_idx, qid, confidence_score, wd_data)

    Institutions that already carry a real Wikidata QID (value starting with
    "Q") are skipped; for the rest, the single best-scoring type-compatible
    candidate is kept when it reaches ``threshold``.
    """
    matched: List[Tuple[int, str, float, Dict[str, Any]]] = []
    for position, record in enumerate(institutions):
        local_name = record.get("name", "")
        local_type = record.get("institution_type", "")
        if not local_name:
            continue
        # Skip records that are already linked to a genuine Wikidata entity.
        already_linked = any(
            entry.get("identifier_scheme") == "Wikidata"
            and entry.get("identifier_value", "").startswith("Q")
            for entry in record.get("identifiers", []) or []
        )
        if already_linked:
            continue
        # Scan every candidate, remembering only the single best score.
        top_score = 0.0
        top_qid = None
        top_data = None
        for qid, candidate in wikidata_results.items():
            candidate_name = candidate.get("name", "")
            if not candidate_name:
                continue
            # Reject candidates whose category contradicts the local record.
            if not institution_type_compatible(local_name, local_type, candidate.get("type", "")):
                continue
            score = similarity_score(local_name, candidate_name)
            if score > top_score:
                top_score, top_qid, top_data = score, qid, candidate
        # Only keep matches at or above the confidence threshold.
        if top_score >= threshold and top_qid and top_data:
            matched.append((position, top_qid, top_score, top_data))
    return matched
def enrich_institution(inst: Dict[str, Any], wd_data: Dict[str, Any], match_score: float) -> bool:
    """Enrich an institution with Wikidata data. Returns True if enriched.

    Mutates ``inst`` in place: adds a Wikidata identifier plus any
    ISIL/VIAF/Website identifiers from ``wd_data`` that are not already
    present, then records the enrichment in the provenance metadata.

    Args:
        inst: Institution record (dict) to enrich; modified in place.
        wd_data: Parsed Wikidata result ("qid", "name", "identifiers", ...).
        match_score: Fuzzy-match confidence (0-1), stored in the history entry.

    Returns:
        True if at least one identifier was added.
    """
    enriched = False
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    # Add other identifiers (ISIL / VIAF / Website) not already present
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True
    # Update provenance with enrichment metadata
    if enriched:
        # BUG FIX: the previous `inst.get("provenance", {})` returned a
        # detached dict when the key was missing, so the extraction-method
        # update and enrichment history were silently lost. setdefault()
        # attaches the new dict to the record; an existing non-dict
        # provenance value is still left untouched, as before.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Phase 2 Mexico Wikidata enrichment"
            else:
                prov["extraction_method"] = "Phase 2 Mexico Wikidata enrichment"
            # Add enrichment history
            if "enrichment_history" not in prov:
                prov["enrichment_history"] = []
            prov["enrichment_history"].append({
                "enrichment_date": datetime.now(timezone.utc).isoformat(),
                "enrichment_method": "SPARQL query + fuzzy name matching (Spanish normalization, 70% threshold)",
                "enrichment_source": [f"https://www.wikidata.org/wiki/{wd_data['qid']}"],
                "match_score": match_score,
                "enrichment_notes": f"Phase 2: Fuzzy matched '{inst.get('name')}' to Wikidata '{wd_data.get('name')}'"
            })
    return enriched
def main() -> None:
    """Run the Phase 2 Mexico enrichment pipeline end to end.

    Steps: load the master YAML dataset, select Mexican (country == 'MX')
    institutions, back up the master file, query Wikidata via SPARQL,
    fuzzy-match names, apply matches to the master list in place, and
    write the updated dataset back to disk.
    """
    base_dir = Path(__file__).parent.parent
    master_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml"
    backup_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml.phase2_mexico_backup"
    print("="*80)
    print("🇲🇽 PHASE 2 MEXICO WIKIDATA ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading master dataset: {master_file.name}\n")
    start_time = time.time()
    # Load master dataset (expected: a YAML list of institution dicts)
    with open(master_file, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    # Filter Mexico institutions
    mexico_institutions = [
        inst for inst in all_institutions
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    ]
    # Get Mexico institution indices in master dataset
    # NOTE: must use the exact same filter as above so that
    # mexico_indices[i] is the master-list position of mexico_institutions[i].
    mexico_indices = [
        i for i, inst in enumerate(all_institutions)
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    ]
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(all_institutions):,} total institutions in {load_time:.1f}s")
    print(f"✅ Found {len(mexico_institutions):,} Mexican institutions\n")
    # Count Wikidata coverage (any "Wikidata" scheme counts, regardless of value)
    with_wikidata = sum(
        1 for inst in mexico_institutions
        if inst.get('identifiers') and any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in inst['identifiers']
        )
    )
    without_wikidata = len(mexico_institutions) - with_wikidata
    current_coverage = (with_wikidata / len(mexico_institutions) * 100) if mexico_institutions else 0
    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {without_wikidata:,}\n")
    if without_wikidata == 0:
        print("✨ All Mexican institutions already have Wikidata IDs!")
        return
    # Create backup of the full dataset before any mutation
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Backup created\n")
    # Query Wikidata
    print("🔍 Querying Wikidata for Mexican heritage institutions...")
    print("   (This may take 30-60 seconds)\n")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    # POST avoids URL-length limits for the large query text
    sparql.setMethod('POST')
    sparql.setTimeout(120)  # 2 minute timeout
    # Custom User-Agent per Wikimedia API etiquette
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Phase 2 Mexico Enrichment)")
    query_start = time.time()
    wikidata_results = query_wikidata_mexican_institutions(sparql)
    query_time = time.time() - query_start
    print(f"✅ Found {len(wikidata_results):,} Mexican institutions in Wikidata (query took {query_time:.1f}s)\n")
    if not wikidata_results:
        print("⚠️ No Wikidata results, aborting enrichment")
        return
    # Fuzzy match local records against Wikidata candidates
    print("🔗 Fuzzy matching names (threshold: 0.70, Spanish normalization)...\n")
    matches = fuzzy_match_institutions(mexico_institutions, wikidata_results, threshold=0.70)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")
    if not matches:
        print("❌ No matches found. Try lowering threshold below 0.70.\n")
        return
    # Show sample matches (highest confidence first) for manual spot-checking
    print(f"{'='*80}")
    print(f"📋 SAMPLE MATCHES (Top 10)")
    print(f"{'='*80}")
    sorted_matches = sorted(matches, key=lambda x: x[2], reverse=True)
    for i, (idx, qid, score, wd_data) in enumerate(sorted_matches[:10], 1):
        inst = mexico_institutions[idx]
        # Fall back to region, then "Unknown", when the first location has no city
        city = inst.get("locations", [{}])[0].get("city", inst.get("locations", [{}])[0].get("region", "Unknown"))
        print(f"\n{i}. Confidence: {score:.3f}")
        print(f"   Local:     {inst.get('name')} ({city})")
        print(f"   Wikidata:  {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f"   Type:      {wd_data.get('type', 'Unknown')}")
    print(f"\n{'='*80}\n")
    # Apply matches to master dataset
    print("✅ Applying matches to master dataset...\n")
    enriched_count = 0
    for local_idx, qid, score, wd_data in matches:
        # Translate the Mexico-list index back to the master-list index
        master_idx = mexico_indices[local_idx]
        if enrich_institution(all_institutions[master_idx], wd_data, score):
            enriched_count += 1
    new_coverage = (with_wikidata + enriched_count) / len(mexico_institutions) * 100
    print(f"✨ Enriched {enriched_count:,} institutions")
    print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
    print(f"   (+{new_coverage - current_coverage:.1f} percentage points)\n")
    # Write updated master dataset back over the original file
    print("="*80)
    print("💾 WRITING UPDATED MASTER DATASET")
    print("="*80 + "\n")
    print(f"📝 Writing {len(all_institutions):,} institutions to disk...")
    print("   (This may take 2-3 minutes for large datasets)\n")
    write_start = time.time()
    with open(master_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    write_time = time.time() - write_start
    print(f"✅ Updated: {master_file} (write took {write_time:.1f}s)\n")
    # Final summary
    print("="*80)
    print("📊 ENRICHMENT COMPLETE")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Mexican institutions enriched: {enriched_count:,}")
    print(f"   Coverage increase: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}pp)")
    print(f"   Remaining without Wikidata: {without_wikidata - enriched_count:,}")
    print(f"   Overall dataset: {len(all_institutions):,} institutions")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print(f"\n🎯 Phase 2 Target: 35%+ coverage (67+ institutions)")
    if new_coverage >= 35:
        print(f"   ✅ TARGET ACHIEVED!")
    else:
        print(f"   ⏳ In progress... ({new_coverage:.1f}% / 35%)")
    print("="*80 + "\n")
# Script entry point: run the enrichment only when executed directly.
if __name__ == "__main__":
    main()