# glam/scripts/enrich_belgian_wikidata.py — 2025-11-19 23:25:22 +01:00 (341 lines, 13 KiB, Python)
#!/usr/bin/env python3
"""
Enrich Belgian ISIL institutions with Wikidata Q-numbers, VIAF IDs, and other identifiers.
This script:
1. Loads Belgian institutions from enriched YAML
2. Queries Wikidata SPARQL endpoint for Belgian ISIL codes (BE-*)
3. Adds Wikidata Q-numbers, VIAF IDs, founding dates, coordinates
4. Updates GHCIDs with Q-numbers for collision resolution
5. Exports enriched YAML with Wikidata data
Query strategy:
- Query by ISIL code (P791) for exact matches
- Batch queries for efficiency (100 codes per query)
- Add multilingual labels (English, Dutch, French)
"""
import sys
from pathlib import Path
from typing import Any, Optional, Dict
from datetime import datetime, timezone
import time
import yaml
# Make the project's src/ package importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
def query_wikidata_batch(isil_codes: list[str], sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Look up a batch of Belgian ISIL codes (P791) on Wikidata.
    Args:
        isil_codes: ISIL codes to resolve (e.g. BE-OSE00, BE-A0001)
        sparql: Configured SPARQL wrapper pointed at the WDQS endpoint
    Returns:
        Dict mapping ISIL code → extracted Wikidata fields; empty on query error
    """
    # Inline every code into a VALUES clause so one request covers the batch.
    values_clause = " ".join(f'"{code}"' for code in isil_codes)
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemLabelNl ?itemLabelFr ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
    VALUES ?isil {{ {values_clause} }}
    ?item wdt:P791 ?isil .
    OPTIONAL {{ ?item wdt:P214 ?viaf . }}
    OPTIONAL {{ ?item wdt:P625 ?coords . }}
    OPTIONAL {{ ?item wdt:P856 ?website . }}
    OPTIONAL {{ ?item wdt:P571 ?inception . }}
    # Multilingual labels
    OPTIONAL {{ ?item rdfs:label ?itemLabelNl . FILTER(LANG(?itemLabelNl) = "nl") }}
    OPTIONAL {{ ?item rdfs:label ?itemLabelFr . FILTER(LANG(?itemLabelFr) = "fr") }}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl,fr,de" . }}
    }}
    """
    sparql.setQuery(query)
    try:
        raw = sparql.query().convert()
        rows = raw.get("results", {}).get("bindings", []) if isinstance(raw, dict) else []

        def field(row: dict, key: str) -> Optional[str]:
            # One SPARQL-JSON binding cell → its plain string value (or None).
            return row.get(key, {}).get("value")

        by_isil: dict[str, dict[str, Any]] = {}
        for row in rows:
            uri = field(row, "item") or ""
            qid = uri.rsplit("/", 1)[-1] if uri else None
            code = field(row, "isil")
            # Require a real Q-number and an ISIL key to file the row under.
            if not (qid and qid.startswith("Q") and code):
                continue
            entry: dict[str, Any] = {
                "qid": qid,
                "name": field(row, "itemLabel") or "",
                "name_nl": field(row, "itemLabelNl"),
                "name_fr": field(row, "itemLabelFr"),
                "description": field(row, "itemDescription") or "",
                "identifiers": {},
            }
            if "viaf" in row:
                entry["identifiers"]["VIAF"] = row["viaf"]["value"]
            if "website" in row:
                entry["identifiers"]["Website"] = row["website"]["value"]
            if "inception" in row:
                # Keep only the date part of the xsd:dateTime literal.
                entry["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                point = row["coords"]["value"]
                # WKT literal looks like "Point(lon lat)" — note the order.
                if point.startswith("Point("):
                    lon_s, lat_s = point[6:-1].split()
                    entry["latitude"] = float(lat_s)
                    entry["longitude"] = float(lon_s)
            by_isil[code] = entry
        return by_isil
    except Exception as e:
        # Best-effort: report and let the caller carry on with other batches.
        print(f"\n❌ SPARQL Error: {e}")
        return {}
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """
    Merge Wikidata data for one institution into its record, in place.
    Adds Wikidata/VIAF/Website identifiers, a founding date, coordinates on
    the first location, and Dutch/French labels as alternative names — each
    only when the record does not already carry that piece of data.
    Args:
        inst: Mutable institution record (parsed YAML mapping)
        wd_data: One value from query_wikidata_batch() ('qid' is required)
    Returns True if any new data was added.
    """
    enriched = False
    # Normalise the identifiers container to a list we can append to
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    # Add VIAF, Website, etc.
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True
    # Add founding date (only if the record has none)
    if "founding_date" in wd_data and not inst.get("founded_date"):
        inst["founded_date"] = wd_data["founding_date"]
        enriched = True
    # Add coordinates to the first location if it lacks them
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and locations:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True
    # Add multilingual names to alternative_names (skip duplicates and the
    # primary name itself)
    if "alternative_names" not in inst or not inst["alternative_names"]:
        inst["alternative_names"] = []
    alt_names = inst["alternative_names"]
    if isinstance(alt_names, list):
        for label_key in ("name_nl", "name_fr"):
            label = wd_data.get(label_key)
            if label and label not in alt_names and label != inst.get("name"):
                alt_names.append(label)
                enriched = True
    # Update provenance.
    # BUG FIX: the original used inst.get("provenance", {}) and mutated the
    # default dict, which was never stored back — records without a
    # 'provenance' key silently lost this update. setdefault persists it.
    if enriched:
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if "Wikidata enrichment" not in existing_method:
                # Avoid a dangling " + " prefix when no prior method was set.
                prov["extraction_method"] = (
                    f"{existing_method} + Wikidata enrichment" if existing_method
                    else "Wikidata enrichment"
                )
    return enriched
def main():
    """Run the Wikidata enrichment workflow for Belgian ISIL institutions.

    Loads the location-enriched YAML, queries Wikidata in batches of 100
    ISIL codes, merges the results into each record, and writes a new YAML
    file. Adds guards so the summary maths never divides by zero when the
    input parses to no institutions or no BE-* codes.
    """
    print("=" * 70)
    print("Belgian Institutions Wikidata Enrichment")
    print("=" * 70)
    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    output_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")
    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print(" Run scripts/enrich_belgian_locations.py first")
        return
    # 1. Load Belgian institutions ---------------------------------------
    print(f"\n1. Loading institutions from {input_file}...")
    import re  # local import: only needed for the record-splitting regex
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # Skip header comments up to and including the first '---' marker
    lines = content.split('\n')
    start_idx = next((i for i, line in enumerate(lines) if line.strip() == '---'), 0)
    yaml_content = '\n'.join(lines[start_idx + 1:])
    # One mapping per institution; records are delimited by lines starting a
    # new 'id: BE-...' key rather than by '---' document separators
    records_text = re.split(r'\n(?=id: BE-)', yaml_content)
    records_text = [r.strip() for r in records_text if r.strip()]
    institutions = []
    for record_text in records_text:
        try:
            inst = yaml.safe_load(record_text)
            if inst:
                institutions.append(inst)
        except Exception:
            # Best effort: skip malformed records instead of aborting
            continue
    print(f" ✓ Loaded {len(institutions)} institutions")
    if not institutions:
        # FIX: the original later divided by len(institutions) and would
        # crash with ZeroDivisionError on an empty/unparsable input file.
        print("\n❌ No institutions parsed - nothing to enrich")
        return
    # Record ids double as ISIL codes; str(... or "") also guards against a
    # stored 'id: null' (dict.get's default only applies when the key is absent)
    isil_codes = [inst.get("id") for inst in institutions
                  if str(inst.get("id") or "").startswith("BE-")]
    print(f" ✓ Found {len(isil_codes)} Belgian ISIL codes")
    # 2. Query Wikidata in batches ---------------------------------------
    print(f"\n2. Querying Wikidata SPARQL endpoint...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod("POST")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extractor/1.0 (https://github.com/kempersc/glam)")
    batch_size = 100  # one VALUES clause of up to 100 codes per request
    total_batches = (len(isil_codes) + batch_size - 1) // batch_size
    all_wd_data = {}
    for batch_num in range(total_batches):
        start = batch_num * batch_size
        batch_codes = isil_codes[start:start + batch_size]
        print(f" Batch {batch_num + 1}/{total_batches}: Querying {len(batch_codes)} ISIL codes...")
        wd_data = query_wikidata_batch(batch_codes, sparql)
        all_wd_data.update(wd_data)
        print(f" ✓ Found {len(wd_data)} Wikidata matches")
        # Rate limiting: be polite to the public endpoint between batches
        if batch_num < total_batches - 1:
            time.sleep(1)
    # FIX: guard the coverage percentage against len(isil_codes) == 0
    coverage_pct = len(all_wd_data) / len(isil_codes) * 100 if isil_codes else 0.0
    print(f"\n ✓ Total Wikidata matches: {len(all_wd_data)} / {len(isil_codes)} ({coverage_pct:.1f}%)")
    # 3. Merge Wikidata data into the records ----------------------------
    print(f"\n3. Enriching institutions with Wikidata data...")
    enriched_count = 0
    for inst in institutions:
        isil_code = inst.get("id")
        if isil_code in all_wd_data:
            if enrich_institution(inst, all_wd_data[isil_code]):
                enriched_count += 1
    print(f" ✓ Enriched {enriched_count} institutions")
    # 4. Show a few enriched examples ------------------------------------
    print(f"\n4. Sample enriched institutions:")
    enriched_samples = [inst for inst in institutions if any(
        i.get("identifier_scheme") == "Wikidata"
        for i in inst.get("identifiers", []) if isinstance(i, dict)
    )][:5]
    for inst in enriched_samples:
        wd_id = next((i["identifier_value"] for i in inst.get("identifiers", [])
                      if isinstance(i, dict) and i.get("identifier_scheme") == "Wikidata"), None)
        print(f" {inst.get('id')}: {inst.get('name', '')[:40]:40}{wd_id}")
    # 5. Export enriched data --------------------------------------------
    print(f"\n5. Exporting enriched YAML to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Wikidata Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ + Wikidata SPARQL queries\n")
        f.write(f"# Total institutions: {len(institutions)}\n")
        f.write(f"# Wikidata enriched: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)\n")
        f.write(f"# Enrichment date: {datetime.now(timezone.utc).isoformat()}\n")
        f.write("#\n")
        f.write("---\n\n")
        for idx, inst in enumerate(institutions, 1):
            yaml.dump(inst, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            f.write("\n")
            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")
    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")
    # Summary statistics --------------------------------------------------
    print("\n" + "=" * 70)
    print("Wikidata Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata Q-numbers added: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Wikidata coverage: {len(all_wd_data)} / {len(isil_codes)} ({coverage_pct:.1f}%)")
    # Count additional identifiers
    viaf_count = sum(1 for inst in institutions if any(
        i.get("identifier_scheme") == "VIAF"
        for i in inst.get("identifiers", []) if isinstance(i, dict)
    ))
    print(f"VIAF IDs added: {viaf_count}")
    coords_count = sum(1 for inst in institutions
                       if inst.get("locations") and len(inst["locations"]) > 0
                       and inst["locations"][0].get("latitude"))
    print(f"Coordinates added: {coords_count}")
    founding_count = sum(1 for inst in institutions if inst.get("founded_date"))
    print(f"Founding dates added: {founding_count}")
    print("\n✓ Wikidata enrichment complete!")
# Run the workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()