#!/usr/bin/env python3
"""
Phase 2 Enrichment: Netherlands (NL)

Target: 622 institutions, 31.0% Wikidata coverage → 62%+ (385+ institutions)
Strategy: SPARQL batch query + fuzzy name matching (Dutch normalization)
Based on: Mexico Phase 2 methodology (achieved 50.0% coverage from 17.7%)

GLAM Data Extraction Project - Phase 2: High-Volume Country Enrichment
"""
from __future__ import annotations

import re
import sys
import time
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Dict, List, Tuple

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# NOTE: third-party dependencies (PyYAML, SPARQLWrapper) are imported lazily
# inside main() so the pure matching helpers below stay importable — and
# testable — without those packages installed. The SPARQLWrapper type hint
# below is lazy thanks to `from __future__ import annotations`.


def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching (Dutch + English + German).

    Lower-cases the name, strips common GLAM-sector prefixes/suffixes in
    Dutch, English and German, removes parenthesised abbreviations and
    punctuation, and collapses runs of whitespace.
    """
    name = name.lower()
    # Remove common prefixes/suffixes (Dutch + English + German).
    # Each substitution is applied once, so e.g. "stichting museum x"
    # only loses the leading "stichting".
    name = re.sub(r'^(stichting|museum|bibliotheek|archief|centrum|galerie|verzameling)\s+', '', name)
    name = re.sub(r'\s+(museum|bibliotheek|archief|nationaal|regionaal|gemeentelijk|provinciaal|stedelijk)$', '', name)
    name = re.sub(r'^(foundation|museum|library|archive|center|centre|gallery|collection)\s+', '', name)
    name = re.sub(r'\s+(museum|library|archive|national|regional|municipal|provincial|city)$', '', name)
    name = re.sub(r'^(stiftung|bibliothek|archiv|zentrum|galerie|sammlung)\s+', '', name)
    name = re.sub(r'\s+(national|regional|städtisch)$', '', name)
    # Remove abbreviations in parentheses
    name = re.sub(r'\s*\([^)]*\)\s*', ' ', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)
    # Normalize whitespace
    name = ' '.join(name.split())
    return name


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) on their normalized forms."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def query_wikidata_dutch_institutions(sparql: "SPARQLWrapper") -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for ALL heritage institutions in the Netherlands.

    Institution types: museums, libraries, archives, galleries, universities
    with collections.

    Args:
        sparql: A configured SPARQLWrapper pointed at the Wikidata endpoint
            (JSON return format expected).

    Returns:
        Dict mapping QID -> parsed record with name, description, type,
        identifiers (ISIL/VIAF/Website), and optional founding date and
        coordinates. Empty dict if the query fails.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q207694 wd:Q473972 wd:Q641635 }
      ?item wdt:P31/wdt:P279* ?type .  # instance of (or subclass of) institution type
      ?item wdt:P17 wd:Q55 .  # country = Netherlands (Q55)
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en,de" . }
    }
    LIMIT 5000
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        # Parse results into dict keyed by QID
        results = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "identifiers": {}
            }
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    # WKT point literals are "Point(lon lat)" — longitude first.
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[qid] = result
        return results
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}


def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """Check if institution types are compatible (prevent museum → library mismatches).

    Local type is inferred from keywords in the institution name OR the
    explicit `inst_type` code; the Wikidata type is inferred from keywords
    in its (nl/en/de) type label. MIXED/OFFICIAL_INSTITUTION/
    EDUCATION_PROVIDER institutions are allowed to match anything.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()

    museum_kw = ['museum', 'museu', 'museo']
    archive_kw = ['archief', 'archive', 'archivo', 'arquivo']
    library_kw = ['bibliotheek', 'library', 'biblioteca', 'bibliothèque', 'bibliothek']
    gallery_kw = ['galerie', 'gallery', 'galería', 'galeria']

    inst_is_museum = any(kw in inst_lower for kw in museum_kw) or inst_type == "MUSEUM"
    inst_is_archive = any(kw in inst_lower for kw in archive_kw) or inst_type == "ARCHIVE"
    inst_is_library = any(kw in inst_lower for kw in library_kw) or inst_type == "LIBRARY"
    inst_is_gallery = any(kw in inst_lower for kw in gallery_kw) or inst_type == "GALLERY"

    wd_is_museum = any(kw in wd_lower for kw in museum_kw)
    wd_is_archive = any(kw in wd_lower for kw in archive_kw)
    wd_is_library = any(kw in wd_lower for kw in library_kw)
    wd_is_gallery = any(kw in wd_lower for kw in gallery_kw)

    # MIXED, OFFICIAL_INSTITUTION, EDUCATION_PROVIDER can match any type
    if inst_type in ["MIXED", "OFFICIAL_INSTITUTION", "EDUCATION_PROVIDER"]:
        return True

    # If both have explicit types, they must match
    if inst_is_museum and not wd_is_museum:
        return False
    if inst_is_archive and not wd_is_archive:
        return False
    if inst_is_library and not wd_is_library:
        return False
    if inst_is_gallery and not wd_is_gallery:
        return False
    return True


def fuzzy_match_institutions(
    institutions: List[Dict[str, Any]],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> List[Tuple[int, str, float, Dict[str, Any]]]:
    """
    Fuzzy match Dutch institutions with Wikidata results.

    Institutions that already carry a real (Q-prefixed) Wikidata identifier
    are skipped. For each remaining institution the single best
    type-compatible candidate is kept if it reaches `threshold`.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data)
    """
    matches = []
    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")
        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        has_wikidata = any(
            id_obj.get("identifier_scheme") == "Wikidata"
            and id_obj.get("identifier_value", "").startswith("Q")
            for id_obj in inst.get("identifiers", []) or []
        )
        if has_wikidata:
            continue

        # Find best match
        best_score = 0.0
        best_qid = None
        best_data = None
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue
            # Check type compatibility
            if not institution_type_compatible(inst_name, inst_type, wd_type):
                continue
            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score = score
                best_qid = qid
                best_data = wd_data

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))
    return matches


def enrich_institution(inst: Dict[str, Any], wd_data: Dict[str, Any], match_score: float) -> bool:
    """Enrich an institution with Wikidata data. Returns True if enriched.

    Adds the Wikidata QID plus any ISIL/VIAF/Website identifiers not already
    present, and records the enrichment in the institution's provenance.
    """
    enriched = False
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True

    # Update provenance with enrichment metadata
    if enriched:
        # BUG FIX: was `inst.get("provenance", {})`, which silently discarded
        # the enrichment history whenever the institution had no provenance
        # dict yet. setdefault attaches the dict to the record.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Phase 2 Netherlands Wikidata enrichment"
            else:
                prov["extraction_method"] = "Phase 2 Netherlands Wikidata enrichment"

            # Add enrichment history
            if "enrichment_history" not in prov:
                prov["enrichment_history"] = []
            prov["enrichment_history"].append({
                "enrichment_date": datetime.now(timezone.utc).isoformat(),
                "enrichment_method": "SPARQL query + fuzzy name matching (Dutch normalization, 70% threshold)",
                "enrichment_source": [f"https://www.wikidata.org/wiki/{wd_data['qid']}"],
                "match_score": match_score,
                "enrichment_notes": f"Phase 2: Fuzzy matched '{inst.get('name')}' to Wikidata '{wd_data.get('name')}'"
            })
    return enriched


def main():
    """Run the Phase 2 Netherlands enrichment pipeline end to end.

    Loads the master YAML dataset, backs it up, queries Wikidata for Dutch
    heritage institutions, fuzzy-matches them against local records, applies
    the matches in place, and rewrites the master file.
    """
    # Third-party imports deferred here (see module-level NOTE).
    import yaml
    from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON

    base_dir = Path(__file__).parent.parent
    master_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml"
    backup_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml.phase2_netherlands_backup"

    print("="*80)
    print("🇳🇱 PHASE 2 NETHERLANDS WIKIDATA ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading master dataset: {master_file.name}\n")
    start_time = time.time()

    # Load master dataset
    with open(master_file, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Locate Netherlands institutions in a single pass; keep both the master
    # indices (for writing back) and the record references (for matching).
    # (Previously this predicate was duplicated in two list comprehensions.)
    netherlands_indices = [
        i for i, inst in enumerate(all_institutions)
        if inst.get('locations') and any(loc.get('country') == 'NL' for loc in inst['locations'])
    ]
    netherlands_institutions = [all_institutions[i] for i in netherlands_indices]

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(all_institutions):,} total institutions in {load_time:.1f}s")
    print(f"✅ Found {len(netherlands_institutions):,} Dutch institutions\n")

    # Count Wikidata coverage
    with_wikidata = sum(
        1 for inst in netherlands_institutions
        if inst.get('identifiers') and any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in inst['identifiers']
        )
    )
    without_wikidata = len(netherlands_institutions) - with_wikidata
    current_coverage = (with_wikidata / len(netherlands_institutions) * 100) if netherlands_institutions else 0
    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {without_wikidata:,}\n")

    if without_wikidata == 0:
        print("✨ All Dutch institutions already have Wikidata IDs!")
        return

    # Create backup
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)
    print(f"✅ Backup created\n")

    # Query Wikidata
    print("🔍 Querying Wikidata for Dutch heritage institutions...")
    print("   (This may take 30-60 seconds)\n")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.setTimeout(120)  # 2 minute timeout
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Phase 2 Netherlands Enrichment)")

    query_start = time.time()
    wikidata_results = query_wikidata_dutch_institutions(sparql)
    query_time = time.time() - query_start
    print(f"✅ Found {len(wikidata_results):,} Dutch institutions in Wikidata (query took {query_time:.1f}s)\n")

    if not wikidata_results:
        print("⚠️ No Wikidata results, aborting enrichment")
        return

    # Fuzzy match
    print("🔗 Fuzzy matching names (threshold: 0.70, Dutch normalization)...\n")
    matches = fuzzy_match_institutions(netherlands_institutions, wikidata_results, threshold=0.70)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if not matches:
        print("❌ No matches found. Try lowering threshold below 0.70.\n")
        return

    # Show sample matches
    print(f"{'='*80}")
    print(f"📋 SAMPLE MATCHES (Top 10)")
    print(f"{'='*80}")
    sorted_matches = sorted(matches, key=lambda x: x[2], reverse=True)
    for i, (idx, qid, score, wd_data) in enumerate(sorted_matches[:10], 1):
        inst = netherlands_institutions[idx]
        loc = inst.get("locations", [{}])[0]
        city = loc.get("city", loc.get("region", "Unknown"))
        print(f"\n{i}. Confidence: {score:.3f}")
        print(f"   Local:    {inst.get('name')} ({city})")
        print(f"   Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f"   Type:     {wd_data.get('type', 'Unknown')}")
    print(f"\n{'='*80}\n")

    # Apply matches to master dataset
    print("✅ Applying matches to master dataset...\n")
    enriched_count = 0
    for local_idx, qid, score, wd_data in matches:
        master_idx = netherlands_indices[local_idx]
        if enrich_institution(all_institutions[master_idx], wd_data, score):
            enriched_count += 1

    new_coverage = (with_wikidata + enriched_count) / len(netherlands_institutions) * 100
    print(f"✨ Enriched {enriched_count:,} institutions")
    print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
    print(f"   (+{new_coverage - current_coverage:.1f} percentage points)\n")

    # Write updated master dataset
    print("="*80)
    print("💾 WRITING UPDATED MASTER DATASET")
    print("="*80 + "\n")
    print(f"📝 Writing {len(all_institutions):,} institutions to disk...")
    print("   (This may take 2-3 minutes for large datasets)\n")
    write_start = time.time()
    with open(master_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)
    write_time = time.time() - write_start
    print(f"✅ Updated: {master_file} (write took {write_time:.1f}s)\n")

    # Final summary
    print("="*80)
    print("📊 ENRICHMENT COMPLETE")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Dutch institutions enriched: {enriched_count:,}")
    print(f"   Coverage increase: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}pp)")
    print(f"   Remaining without Wikidata: {without_wikidata - enriched_count:,}")
    print(f"   Overall dataset: {len(all_institutions):,} institutions")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print(f"\n🎯 Phase 2 Target: 62%+ coverage (385+ institutions)")
    if new_coverage >= 62:
        print(f"   ✅ TARGET ACHIEVED!")
    else:
        print(f"   ⏳ In progress... ({new_coverage:.1f}% / 62%)")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()