glam/scripts/enrich_georgia_batch2_alternative_names.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

343 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Georgian heritage institutions - Batch 2 (Alternative Names)
Strategy: Use alternative names (including Georgian names) for fuzzy matching
Target: 10 remaining institutions without Wikidata matches
Goal: Achieve 50%+ total coverage (7+ institutions)
Improvements over Batch 1:
1. Include alternative names in fuzzy matching
2. Try partial name matching (e.g., "Stalin Museum" → "Joseph Stalin Museum")
3. Lower fuzzy threshold to 0.80 for specific matches
4. Manual review of close matches (0.75-0.85)
"""
import sys
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
import yaml
from difflib import SequenceMatcher
import re
# Make the project's src/ directory importable when running this script directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Lower-case a name and strip common GLAM affixes and punctuation for fuzzy matching."""
    lowered = name.lower()
    # Drop a leading institution-type/qualifier word ("Museum ...", "State ...", etc.).
    lowered = re.sub(r'^(museum|muzeum|library|biblioteka|archive|arkivi|state|national|central)[\s\-]+', '', lowered)
    # Drop a trailing type/country word ("... Museum", "... of Georgia", etc.).
    lowered = re.sub(r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian|of georgia)$', '', lowered)
    # Replace punctuation with spaces, then collapse whitespace runs.
    lowered = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(lowered.split())
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 similarity ratio between the normalized forms of two names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
def query_georgian_institutions(sparql: "SPARQLWrapper") -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for GLAM institutions located in Georgia (wd:Q230).

    The SPARQL OPTIONAL clauses make the endpoint return one binding per
    combination of optional values, so the same item (QID) usually appears
    in several rows. Rows are merged into a single entry per QID; alternative
    labels are de-duplicated (the previous version appended the same altLabel
    once per extra row, inflating `alternative_names`).

    Args:
        sparql: A SPARQLWrapper-like object (setQuery/query().convert())
            already configured for JSON results against the Wikidata endpoint.

    Returns:
        Mapping of QID -> institution dict with keys: qid, name, description,
        type, alternative_names, identifiers, and optionally founding_date,
        latitude, longitude. Empty dict if the query fails.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?altLabel ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {
      ?item wdt:P17 wd:Q230 .
      VALUES ?type {
        wd:Q7075 wd:Q166118 wd:Q33506 wd:Q1007870 wd:Q28564
        wd:Q11396180 wd:Q207694 wd:Q2772772 wd:Q768717 wd:Q7406919
      }
      ?item wdt:P31 ?type .
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item skos:altLabel ?altLabel . FILTER(LANG(?altLabel) = "en") }
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ka,ru" .
      }
    }
    LIMIT 500
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        results: Dict[str, Dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            # Skip malformed rows whose item URI does not end in a QID.
            if not qid or not qid.startswith("Q"):
                continue
            if qid not in results:
                results[qid] = {
                    "qid": qid,
                    "name": binding.get("itemLabel", {}).get("value", ""),
                    "description": binding.get("itemDescription", {}).get("value", ""),
                    "type": binding.get("typeLabel", {}).get("value", ""),
                    "alternative_names": [],
                    "identifiers": {}
                }
            entry = results[qid]
            # Merge alternative labels, skipping duplicates caused by the
            # row-per-optional-combination result shape.
            if "altLabel" in binding:
                alt_name = binding["altLabel"]["value"]
                if alt_name not in entry["alternative_names"]:
                    entry["alternative_names"].append(alt_name)
            if "isil" in binding:
                entry["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                entry["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                entry["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                entry["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # WKT literal: "Point(lon lat)" — note longitude comes first.
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    entry["latitude"] = float(lat)
                    entry["longitude"] = float(lon)
        return results
    except Exception as e:
        # Best-effort script: report the failure and let the caller proceed
        # with no matches rather than crashing the whole batch run.
        print(f"\n❌ Error querying Wikidata: {e}")
        return {}
def find_best_match(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.80
) -> Optional[tuple[Dict[str, Any], float, str]]:
    """
    Find the best Wikidata match for an institution, comparing its primary
    and alternative names against each candidate's primary and alternative
    names with fuzzy similarity.

    Args:
        inst: Institution record with 'name' and optional 'alternative_names'.
        wikidata_results: Mapping of QID -> Wikidata entry as produced by
            query_georgian_institutions().
        threshold: Minimum similarity score (0-1) required to accept a match.

    Returns:
        (wikidata_entry, score, matched_name) for the highest-scoring name
        pair at or above the threshold, otherwise None. matched_name records
        which local name matched which Wikidata name ("local → wikidata").
    """
    inst_names = [inst.get('name', '')]
    if 'alternative_names' in inst:
        inst_names.extend(inst['alternative_names'])
    best_match = None
    best_score = 0.0
    matched_name = ""
    for inst_name in inst_names:
        for qid, wd_data in wikidata_results.items():
            # Candidate name pool: primary label plus alternative labels.
            wd_names = [wd_data.get('name', '')]
            if 'alternative_names' in wd_data:
                wd_names.extend(wd_data['alternative_names'])
            for wd_name in wd_names:
                score = similarity_score(inst_name, wd_name)
                if score > best_score:
                    best_score = score
                    best_match = wd_data
                    # Bug fix: join the two names with " → " — the separator
                    # was missing, producing a garbled "AB" concatenation in
                    # the match report.
                    matched_name = f"{inst_name} → {wd_name}"
    if best_score >= threshold and best_match:
        return (best_match, best_score, matched_name)
    return None
def main():
    """Run Batch 2 enrichment end-to-end.

    Loads the Batch 1 YAML, fuzzy-matches the still-unmatched institutions
    against Wikidata (alternative names included, threshold 0.80), merges
    identifiers/coordinates/founding dates plus provenance into the records,
    saves the Batch 2 YAML, and prints a coverage report.
    """
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 2")
    print("=" * 80)
    print()
    print("Strategy: Alternative name matching with lower threshold (0.80)")
    print("Target: 10 institutions without Wikidata matches")
    print()
    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "instances" / "georgia"
    input_file = data_dir / "georgian_institutions_enriched_batch1.yaml"
    output_file = data_dir / "georgian_institutions_enriched_batch2.yaml"
    # Load previous batch results
    print("📂 Loading Batch 1 results...")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # Filter for institutions without a Wikidata identifier. The filtered
    # list holds references to the same dicts as `institutions`, so in-place
    # enrichment below is reflected in the full list that gets saved.
    needs_enrichment = []
    already_enriched = 0
    for inst in institutions:
        has_wikidata = False
        if 'identifiers' in inst:
            for identifier in inst['identifiers']:
                if identifier.get('identifier_scheme') == 'Wikidata':
                    has_wikidata = True
                    already_enriched += 1
                    break
        if not has_wikidata:
            needs_enrichment.append(inst)
    print(f" ✅ Already enriched: {already_enriched} institutions")
    print(f" ⏳ Need enrichment: {len(needs_enrichment)} institutions")
    print()
    # Query Wikidata once up front; matching below is purely local.
    print("🌐 Querying Wikidata with alternative names support...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" ✅ Found {len(wikidata_results)} institutions in Wikidata")
    print()
    # Fuzzy matching with alternative names
    print("🔍 Matching with alternative names (threshold: 0.80)...")
    print()
    new_matches = 0
    for i, inst in enumerate(needs_enrichment, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')
        print(f"{i:2d}. {inst_name} ({inst_type})")
        # Check for alternative names
        alt_names = inst.get('alternative_names', [])
        if alt_names:
            print(f" Alternative names: {len(alt_names)}")
        # Try matching (uses the default 0.80 threshold)
        match_result = find_best_match(inst, wikidata_results)
        if match_result:
            wd_data, score, matched_name = match_result
            qid = wd_data.get('qid', '')
            print(f" ✅ Matched: {wd_data.get('name')} ({qid})")
            print(f" Match: {matched_name}")
            print(f" Score: {score:.2f}")
            # Add Wikidata identifier
            if 'identifiers' not in inst:
                inst['identifiers'] = []
            inst['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': qid,
                'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
            })
            # Add other identifiers (Website gets a clickable URL as well)
            for scheme, value in wd_data.get('identifiers', {}).items():
                if scheme == 'Website':
                    inst['identifiers'].append({
                        'identifier_scheme': 'Website',
                        'identifier_value': value,
                        'identifier_url': value
                    })
                else:
                    inst['identifiers'].append({
                        'identifier_scheme': scheme,
                        'identifier_value': value
                    })
            # Add coordinates to the first location, creating a minimal
            # GE-country location if none exists.
            if 'latitude' in wd_data and 'longitude' in wd_data:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]
                inst['locations'][0]['latitude'] = wd_data['latitude']
                inst['locations'][0]['longitude'] = wd_data['longitude']
                print(f" 📍 Coordinates: {wd_data['latitude']:.4f}, {wd_data['longitude']:.4f}")
            # Add founding date
            if 'founding_date' in wd_data:
                inst['founding_date'] = wd_data['founding_date']
                print(f" 📅 Founded: {wd_data['founding_date']}")
            # Record provenance of this enrichment; matches are flagged
            # unverified pending manual review.
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
            inst['provenance']['enrichment_history'].append({
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Wikidata SPARQL + alternative name fuzzy matching',
                'match_score': score,
                'verified': False
            })
            new_matches += 1
        else:
            print(f" ⚠️ No match found (tried {1 + len(alt_names)} name variants)")
        print()
    # Save the full institution list (enriched entries were mutated in place).
    print("💾 Saving Batch 2 results...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved to: {output_file}")
    print()
    # Report
    total_enriched = already_enriched + new_matches
    total_institutions = len(institutions)
    print("=" * 80)
    print("📊 BATCH 2 RESULTS")
    print("=" * 80)
    print()
    print(f"Batch 1 enriched: {already_enriched}")
    print(f"Batch 2 new matches: {new_matches}")
    print(f"Total enriched: {total_enriched} ({total_enriched/total_institutions*100:.1f}%)")
    print(f"Still need enrichment: {total_institutions - total_enriched}")
    print()
    # 7 matched institutions corresponds to the 50%+ coverage goal.
    if total_enriched >= 7:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"⚠️ Below target: {7 - total_enriched} more matches needed")
    print()
if __name__ == "__main__":
    main()