glam/scripts/enrich_georgia_batch1.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

440 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Georgian heritage institutions - Phase 1 Proof of Concept
Target: 14 Georgian institutions with 0% Wikidata coverage
Goal: Achieve 50%+ Wikidata coverage (7+ institutions matched)
Strategy:
1. Query Wikidata for museums/libraries/archives in Georgia (Q230)
2. Fuzzy match institution names with 0.85+ threshold
3. Verify type compatibility (museum, library, archive)
4. Enrich with Wikidata Q-numbers, VIAF, coordinates, websites
5. Geocode remaining institutions using Nominatim
CRITICAL: This follows the Chilean enrichment success pattern (78.9% coverage)
"""
import sys
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Canonicalize an institution name for fuzzy comparison.

    Lower-cases the name, strips GLAM-type words (English and Georgian
    transliterations) from the start and end, drops generic organizational
    words, replaces punctuation with spaces, and collapses whitespace.
    """
    text = name.lower()
    # Leading GLAM-type word followed by whitespace/hyphen.
    text = re.sub(r'^(museum|muzeum|museu|library|biblioteka|archive|arkivi)[\s\-]+', '', text)
    # Trailing GLAM-type or country word preceded by whitespace/hyphen.
    text = re.sub(r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian)$', '', text)
    # Generic organizational forms anywhere in the name.
    text = re.sub(r'\b(foundation|institute|state|national|central)\b', '', text)
    # Punctuation becomes whitespace, then runs of whitespace collapse to one space.
    text = re.sub(r'[^\w\s]', ' ', text)
    return ' '.join(text.split())
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 similarity ratio between two institution names.

    Both names are run through normalize_name() first so that type words,
    punctuation, and casing do not affect the comparison.
    """
    return SequenceMatcher(
        None,
        normalize_name(name1),
        normalize_name(name2),
    ).ratio()
def institution_type_compatible(inst_type: str, inst_name: str, wd_name: str, wd_desc: str) -> bool:
    """
    Check whether a local institution and a Wikidata candidate could be the
    same kind of GLAM organization (museum / archive / library).

    A local record that is recognizably one of the three kinds (via its
    declared type code or a keyword in its name/type text) is only
    compatible with a candidate whose label/description mentions the same
    kind. Prevents mismatches like museum -> archive.
    """
    keyword_sets = {
        'MUSEUM': ['museum', 'muzeum', 'museu'],
        'ARCHIVE': ['archive', 'arkivi', 'archiv'],
        'LIBRARY': ['library', 'biblioteka', 'bibliothek'],
    }
    local_text = (inst_name + ' ' + inst_type).lower()
    candidate_text = (wd_name + ' ' + wd_desc).lower()
    for type_code, keywords in keyword_sets.items():
        local_has_type = inst_type == type_code or any(kw in local_text for kw in keywords)
        candidate_has_type = any(kw in candidate_text for kw in keywords)
        # A typed local institution requires the same type on the candidate.
        if local_has_type and not candidate_has_type:
            return False
    return True
def query_georgian_institutions(sparql: "SPARQLWrapper") -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for GLAM institutions located in Georgia (wd:Q230).

    Parameters:
        sparql: a SPARQLWrapper pointed at the Wikidata endpoint with JSON
                return format already configured.

    Returns:
        Dict keyed by Q-number with name, description, type, identifiers
        (ISIL/VIAF/Website), optional founding_date and coordinates.
        Returns an empty dict if the query fails.

    NOTE: the SPARQL result has one row per (item, P31 type, optional
    value) combination, so the same QID appears in several bindings.
    Rows are MERGED per QID (first value wins) instead of overwritten —
    the previous implementation replaced the whole record on every row,
    which could drop ISIL/VIAF/coordinates captured on an earlier row.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Institution is in Georgia
      ?item wdt:P17 wd:Q230 .
      # Institution is a GLAM type
      VALUES ?type {
        wd:Q7075      # library
        wd:Q166118    # archive
        wd:Q33506     # museum
        wd:Q1007870   # art gallery
        wd:Q28564     # public library
        wd:Q11396180  # academic library
        wd:Q207694    # art museum
        wd:Q2772772   # history museum
        wd:Q768717    # ethnographic museum
        wd:Q7406919   # state museum
      }
      ?item wdt:P31 ?type .
      # Optional enrichment data
      OPTIONAL { ?item wdt:P791 ?isil . }      # ISIL code
      OPTIONAL { ?item wdt:P214 ?viaf . }      # VIAF ID
      OPTIONAL { ?item wdt:P625 ?coords . }    # Coordinates
      OPTIONAL { ?item wdt:P856 ?website . }   # Official website
      OPTIONAL { ?item wdt:P571 ?inception . } # Founding date
      # Get labels (English, Georgian, Russian)
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ka,ru" .
      }
    }
    LIMIT 500
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
    except Exception as e:
        # Network/endpoint failure: report and degrade gracefully.
        print(f"\n❌ Error querying Wikidata: {e}")
        return {}
    bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
    results: Dict[str, Dict[str, Any]] = {}
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            continue
        # Merge repeated rows for the same item; keep the first-seen labels.
        result = results.setdefault(qid, {
            "qid": qid,
            "name": binding.get("itemLabel", {}).get("value", ""),
            "description": binding.get("itemDescription", {}).get("value", ""),
            "type": binding.get("typeLabel", {}).get("value", ""),
            "identifiers": {}
        })
        # First-seen value wins for each optional field.
        if "isil" in binding:
            result["identifiers"].setdefault("ISIL", binding["isil"]["value"])
        if "viaf" in binding:
            result["identifiers"].setdefault("VIAF", binding["viaf"]["value"])
        if "website" in binding:
            result["identifiers"].setdefault("Website", binding["website"]["value"])
        if "inception" in binding and "founding_date" not in result:
            # xsd:dateTime like "1852-01-01T00:00:00Z" -> keep the date part.
            result["founding_date"] = binding["inception"]["value"].split("T")[0]
        if "coords" in binding and "latitude" not in result:
            coords_str = binding["coords"]["value"]
            # WKT literal "Point(lon lat)" — note longitude comes first.
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)
    return results
def geocode_institution(name: str, country: str = "Georgia") -> Optional[Dict[str, Any]]:
    """
    Geocode an institution using the Nominatim API.

    Parameters:
        name: institution name to look up.
        country: country name appended to the search query (default
                 "Georgia"); results are also restricted to country
                 code "ge".

    Returns:
        Dict with latitude, longitude, and display_name of the top hit,
        or None when nothing is found or the request fails.

    Always sleeps ~1.1s before returning so repeated calls respect
    Nominatim's 1 request/second usage policy. (Previously the sleep sat
    after the successful return, so the delay was skipped exactly when a
    hit was found.)
    """
    try:
        import requests
        # Try with institution name + country
        search_query = f"{name}, {country}"
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={
                "q": search_query,
                "format": "json",
                "limit": 1,
                "countrycodes": "ge"  # Georgia ISO code
            },
            headers={"User-Agent": "GLAM-Dataset-Enrichment/1.0"},
            timeout=15  # avoid hanging the whole batch on a stalled connection
        )
        if response.status_code == 200:
            results = response.json()
            if results:
                location = results[0]
                return {
                    "latitude": float(location["lat"]),
                    "longitude": float(location["lon"]),
                    "display_name": location.get("display_name", "")
                }
    except Exception as e:
        # Best-effort fallback: log and return None rather than abort the batch.
        print(f" ⚠️ Geocoding error: {e}")
    finally:
        # Rate limit: 1 request per second, enforced on EVERY exit path.
        time.sleep(1.1)
    return None
def load_georgia_institutions(yaml_path: Path) -> List[Dict[str, Any]]:
    """
    Load Georgian institutions from the unified YAML dataset.

    Parameters:
        yaml_path: path to a YAML file containing a list of institution
                   records, each with an optional 'locations' list.

    Returns:
        Records whose first location has country == 'GE'. Records with a
        missing or empty 'locations' list are skipped. (The previous
        version indexed locations[0] unconditionally and raised
        IndexError on `locations: []`; it also crashed on an empty file
        where safe_load returns None.)
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or []
    georgian = []
    for inst in data:
        # `or [{}]` covers both a missing key and an explicit empty list.
        locations = inst.get('locations') or [{}]
        if locations[0].get('country') == 'GE':
            georgian.append(inst)
    return georgian
def enrich_institution(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    fuzzy_threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """
    Find the best Wikidata record for an institution.

    An already-present Wikidata identifier on the record wins outright.
    Otherwise the candidates are fuzzy-matched by name — restricted to
    type-compatible ones — and the highest-scoring hit is returned if its
    score reaches the threshold (returned as a copy with an extra
    'match_score' key). Returns None when nothing qualifies.
    """
    inst_name = inst.get('name', '')
    inst_type = inst.get('institution_type', '')

    # 1) Exact match through an existing Wikidata identifier.
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') != 'Wikidata':
            continue
        known_qid = identifier.get('identifier_value', '')
        if known_qid in wikidata_results:
            return wikidata_results[known_qid]

    # 2) Fuzzy name matching over type-compatible candidates.
    top_score, top_candidate = 0.0, None
    for candidate in wikidata_results.values():
        candidate_name = candidate.get('name', '')
        if not institution_type_compatible(
            inst_type, inst_name, candidate_name, candidate.get('description', '')
        ):
            continue
        score = similarity_score(inst_name, candidate_name)
        if score > top_score:
            top_score, top_candidate = score, candidate

    # 3) Accept only above-threshold matches; annotate with the score.
    if top_candidate is not None and top_score >= fuzzy_threshold:
        match = dict(top_candidate)
        match["match_score"] = top_score
        return match
    return None
def main():
    """
    Run the Georgia batch-1 enrichment pipeline end to end:

    1. Load Georgian institutions from the unified YAML dataset.
    2. Query Wikidata for GLAM institutions located in Georgia.
    3. Fuzzy-match each record against Wikidata and enrich it
       (identifiers, coordinates, founding date, description,
       provenance); fall back to Nominatim geocoding for misses.
    4. Save the enriched batch and report coverage statistics.
    """
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 1")
    print("=" * 80)
    print()
    print("Target: 14 institutions with 0% Wikidata coverage")
    print("Goal: Achieve 50%+ coverage (7+ institutions)")
    print()
    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "instances"
    input_file = data_dir / "all" / "globalglam-20251111.yaml"
    output_file = data_dir / "georgia" / "georgian_institutions_enriched_batch1.yaml"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # Step 1: Load Georgian institutions
    print("📂 Loading Georgian institutions...")
    institutions = load_georgia_institutions(input_file)
    print(f" ✅ Loaded {len(institutions)} Georgian institutions")
    print()
    if not institutions:
        # Guard: the statistics below divide by len(institutions);
        # bail out early instead of raising ZeroDivisionError.
        print("⚠️ No Georgian institutions found in the dataset - nothing to enrich.")
        return
    # Step 2: Query Wikidata
    print("🌐 Querying Wikidata for Georgian GLAM institutions...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" ✅ Found {len(wikidata_results)} institutions in Wikidata")
    print()
    # Step 3: Fuzzy matching and enrichment
    print("🔍 Matching institutions with Wikidata (threshold: 0.85)...")
    print()
    enriched_count = 0
    geocoded_count = 0
    for i, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')
        print(f"{i:2d}. {inst_name} ({inst_type})")
        # Try Wikidata enrichment
        enrichment = enrich_institution(inst, wikidata_results)
        if enrichment:
            match_score = enrichment.get('match_score', 0.0)
            qid = enrichment.get('qid', '')
            print(f" ✅ Matched: {enrichment.get('name')} ({qid}) - Score: {match_score:.2f}")
            # Add Wikidata identifier
            if 'identifiers' not in inst:
                inst['identifiers'] = []
            inst['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': qid,
                'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
            })
            # Add other identifiers (Website carries its own URL)
            for scheme, value in enrichment.get('identifiers', {}).items():
                if scheme == 'Website':
                    inst['identifiers'].append({
                        'identifier_scheme': 'Website',
                        'identifier_value': value,
                        'identifier_url': value
                    })
                else:
                    inst['identifiers'].append({
                        'identifier_scheme': scheme,
                        'identifier_value': value
                    })
            # Add coordinates if available (first location, creating it if needed)
            if 'latitude' in enrichment and 'longitude' in enrichment:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]
                inst['locations'][0]['latitude'] = enrichment['latitude']
                inst['locations'][0]['longitude'] = enrichment['longitude']
                print(f" 📍 Coordinates: {enrichment['latitude']:.4f}, {enrichment['longitude']:.4f}")
            # Add founding date if available
            if 'founding_date' in enrichment:
                inst['founding_date'] = enrichment['founding_date']
                print(f" 📅 Founded: {enrichment['founding_date']}")
            # Add description from Wikidata — never overwrite an existing one
            if enrichment.get('description'):
                if not inst.get('description'):
                    inst['description'] = enrichment['description']
                    print(f" 📝 Description: {enrichment['description'][:60]}...")
            # Update provenance with an unverified enrichment event
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
            inst['provenance']['enrichment_history'].append({
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
                'match_score': match_score,
                'verified': False
            })
            enriched_count += 1
        else:
            print(f" ⚠️ No Wikidata match found")
            # Try geocoding as fallback (coordinates only, no identifiers)
            geocode_result = geocode_institution(inst_name)
            if geocode_result:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]
                inst['locations'][0]['latitude'] = geocode_result['latitude']
                inst['locations'][0]['longitude'] = geocode_result['longitude']
                print(f" 📍 Geocoded: {geocode_result['latitude']:.4f}, {geocode_result['longitude']:.4f}")
                geocoded_count += 1
        print()
    # Step 4: Save enriched data
    print("💾 Saving enriched dataset...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved to: {output_file}")
    print()
    # Step 5: Report results
    print("=" * 80)
    print("📊 ENRICHMENT RESULTS")
    print("=" * 80)
    print()
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata matches: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Geocoded (fallback): {geocoded_count}")
    print(f"Still need enrichment: {len(institutions) - enriched_count}")
    print()
    if enriched_count >= 7:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"⚠️ Below target: {7 - enriched_count} more matches needed for 50% coverage")
    print()
    print("Next steps:")
    print("1. Review matches manually (verify institution identities)")
    print("2. Update unified dataset with enriched Georgian records")
    print("3. Proceed with other critical countries (GB, BE, US, LU)")
    print()


if __name__ == "__main__":
    main()