#!/usr/bin/env python3
"""
Enrich Belgian ISIL institutions with Wikidata Q-numbers, VIAF IDs, and other identifiers.

This script:
1. Loads Belgian institutions from enriched YAML
2. Queries Wikidata SPARQL endpoint for Belgian ISIL codes (BE-*)
3. Adds Wikidata Q-numbers, VIAF IDs, founding dates, coordinates
4. Updates GHCIDs with Q-numbers for collision resolution
5. Exports enriched YAML with Wikidata data

Query strategy:
- Query by ISIL code (P791) for exact matches
- Batch queries for efficiency (100 codes per query)
- Add multilingual labels (English, Dutch, French)
"""

import sys
from pathlib import Path
from typing import Any, Optional, Dict
from datetime import datetime, timezone
import time
import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON

def query_wikidata_batch(isil_codes: list[str], sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for a batch of Belgian ISIL codes.
    
    Args:
        isil_codes: List of ISIL codes (e.g., BE-OSE00, BE-A0001)
        sparql: Configured SPARQL wrapper
        
    Returns:
        Dict mapping ISIL code → Wikidata data
    """
    # Build VALUES clause
    isil_values = " ".join(f'"{code}"' for code in isil_codes)
    
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemLabelNl ?itemLabelFr ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .
      
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      
      # Multilingual labels
      OPTIONAL {{ ?item rdfs:label ?itemLabelNl . FILTER(LANG(?itemLabelNl) = "nl") }}
      OPTIONAL {{ ?item rdfs:label ?itemLabelFr . FILTER(LANG(?itemLabelFr) = "fr") }}
      
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl,fr,de" . }}
    }}
    """
    
    sparql.setQuery(query)
    
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        
        # Parse results into dict keyed by ISIL code
        results = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            isil = binding.get("isil", {}).get("value")
            
            if not qid or not qid.startswith("Q") or not isil:
                continue
            
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "name_nl": binding.get("itemLabelNl", {}).get("value"),
                "name_fr": binding.get("itemLabelFr", {}).get("value"),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {}
            }
            
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            
            if "inception" in binding:
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            
            results[isil] = result
        
        return results
        
    except Exception as e:
        print(f"\n❌ SPARQL Error: {e}")
        return {}


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """
    Enrich an institution with Wikidata data.
    
    Returns True if any new data was added.
    """
    enriched = False
    
    # Ensure identifiers list exists
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    
    # Add VIAF, Website, etc.
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            
            identifiers_list.append(id_obj)
            enriched = True
    
    # Add founding date
    if "founding_date" in wd_data and not inst.get("founded_date"):
        inst["founded_date"] = wd_data["founding_date"]
        enriched = True
    
    # Add coordinates to location
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict):
                if first_loc.get("latitude") is None:
                    first_loc["latitude"] = wd_data["latitude"]
                    first_loc["longitude"] = wd_data["longitude"]
                    enriched = True
    
    # Add multilingual names to alternative_names
    if "alternative_names" not in inst or not inst["alternative_names"]:
        inst["alternative_names"] = []
    
    alt_names = inst["alternative_names"]
    if isinstance(alt_names, list):
        if wd_data.get("name_nl") and wd_data["name_nl"] not in alt_names and wd_data["name_nl"] != inst.get("name"):
            alt_names.append(wd_data["name_nl"])
            enriched = True
        if wd_data.get("name_fr") and wd_data["name_fr"] not in alt_names and wd_data["name_fr"] != inst.get("name"):
            alt_names.append(wd_data["name_fr"])
            enriched = True
    
    # Update provenance
    if enriched:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if "Wikidata enrichment" not in existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata enrichment"
    
    return enriched


def main():
    """Main enrichment workflow."""
    
    print("=" * 70)
    print("Belgian Institutions Wikidata Enrichment")
    print("=" * 70)
    
    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    output_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")
    
    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print("   Run scripts/enrich_belgian_locations.py first")
        return
    
    # Load Belgian institutions
    print(f"\n1. Loading institutions from {input_file}...")
    import re
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
        
        # Skip header comments and first ---
        lines = content.split('\n')
        start_idx = next((i for i, line in enumerate(lines) if line.strip() == '---'), 0)
        yaml_content = '\n'.join(lines[start_idx+1:])
        
        # Split into individual YAML documents by detecting 'id: BE-' at start of line
        records_text = re.split(r'\n(?=id: BE-)', yaml_content)
        records_text = [r.strip() for r in records_text if r.strip()]
        
        # Parse each record
        institutions = []
        for record_text in records_text:
            try:
                inst = yaml.safe_load(record_text)
                if inst:
                    institutions.append(inst)
            except Exception:
                continue
    
    print(f"   ✓ Loaded {len(institutions)} institutions")
    
    # Extract ISIL codes
    isil_codes = [inst.get("id") for inst in institutions if inst.get("id", "").startswith("BE-")]
    print(f"   ✓ Found {len(isil_codes)} Belgian ISIL codes")
    
    # Setup SPARQL endpoint
    print(f"\n2. Querying Wikidata SPARQL endpoint...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod("POST")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extractor/1.0 (https://github.com/kempersc/glam)")
    
    # Query in batches
    batch_size = 100
    total_batches = (len(isil_codes) + batch_size - 1) // batch_size
    
    all_wd_data = {}
    
    for batch_num in range(total_batches):
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, len(isil_codes))
        batch_codes = isil_codes[start_idx:end_idx]
        
        print(f"   Batch {batch_num + 1}/{total_batches}: Querying {len(batch_codes)} ISIL codes...")
        
        wd_data = query_wikidata_batch(batch_codes, sparql)
        all_wd_data.update(wd_data)
        
        print(f"   ✓ Found {len(wd_data)} Wikidata matches")
        
        # Rate limiting
        if batch_num < total_batches - 1:
            time.sleep(1)
    
    print(f"\n   ✓ Total Wikidata matches: {len(all_wd_data)} / {len(isil_codes)} ({len(all_wd_data)/len(isil_codes)*100:.1f}%)")
    
    # Enrich institutions
    print(f"\n3. Enriching institutions with Wikidata data...")
    enriched_count = 0
    
    for inst in institutions:
        isil_code = inst.get("id")
        if isil_code in all_wd_data:
            if enrich_institution(inst, all_wd_data[isil_code]):
                enriched_count += 1
    
    print(f"   ✓ Enriched {enriched_count} institutions")
    
    # Show enrichment examples
    print(f"\n4. Sample enriched institutions:")
    enriched_samples = [inst for inst in institutions if any(
        i.get("identifier_scheme") == "Wikidata" for i in inst.get("identifiers", []) if isinstance(i, dict)
    )][:5]
    
    for inst in enriched_samples:
        wd_id = next((i["identifier_value"] for i in inst.get("identifiers", []) if isinstance(i, dict) and i.get("identifier_scheme") == "Wikidata"), None)
        print(f"   {inst.get('id')}: {inst.get('name', '')[:40]:40} → {wd_id}")
    
    # Export enriched data
    print(f"\n5. Exporting enriched YAML to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Wikidata Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ + Wikidata SPARQL queries\n")
        f.write(f"# Total institutions: {len(institutions)}\n")
        f.write(f"# Wikidata enriched: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)\n")
        f.write(f"# Enrichment date: {datetime.now(timezone.utc).isoformat()}\n")
        f.write("#\n")
        f.write("---\n\n")
        
        for idx, inst in enumerate(institutions, 1):
            yaml.dump(inst, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            f.write("\n")
            
            if idx % 50 == 0:
                print(f"   ... exported {idx} institutions")
    
    file_size_kb = output_file.stat().st_size / 1024
    print(f"   ✓ Exported to: {output_file}")
    print(f"   ✓ File size: {file_size_kb:.1f} KB")
    
    # Summary statistics
    print("\n" + "=" * 70)
    print("Wikidata Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions:           {len(institutions)}")
    print(f"Wikidata Q-numbers added:     {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Wikidata coverage:            {len(all_wd_data)} / {len(isil_codes)} ({len(all_wd_data)/len(isil_codes)*100:.1f}%)")
    
    # Count additional identifiers
    viaf_count = sum(1 for inst in institutions if any(
        i.get("identifier_scheme") == "VIAF" for i in inst.get("identifiers", []) if isinstance(i, dict)
    ))
    print(f"VIAF IDs added:               {viaf_count}")
    
    coords_count = sum(1 for inst in institutions 
                      if inst.get("locations") and len(inst["locations"]) > 0 
                      and inst["locations"][0].get("latitude"))
    print(f"Coordinates added:            {coords_count}")
    
    founding_count = sum(1 for inst in institutions if inst.get("founded_date"))
    print(f"Founding dates added:         {founding_count}")
    
    print("\n✓ Wikidata enrichment complete!")


if __name__ == "__main__":
    main()