glam/scripts/enrich_kb_libraries_wikidata.py
kempersc 30162e6526 Add script to validate KB library entries and generate enrichment report
- Implemented a Python script to validate KB library YAML files for required fields and data quality.
- Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
- Created a comprehensive markdown report summarizing validation results and enrichment quality.
- Included error handling for file loading and validation processes.
- Generated JSON statistics for further analysis.
2025-11-28 14:48:33 +01:00

524 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with Wikidata data.
This script reads the KB ISIL library entries from data/nde/enriched/entries/
and enriches them with Wikidata data by:
1. Searching for Dutch public libraries in Wikidata by ISIL code
2. Falling back to fuzzy name matching for libraries not found by ISIL
3. Adding Wikidata IDs, coordinates, founding dates, etc.
Usage:
python scripts/enrich_kb_libraries_wikidata.py [--dry-run] [--limit N]
"""
import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import logging
import argparse
from difflib import SequenceMatcher
import re
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"  # REST API base (not used by the functions below)
SPARQL_URL = "https://query.wikidata.org/sparql"  # Wikidata Query Service SPARQL endpoint
# Identify this tool to Wikimedia, per their User-Agent policy
USER_AGENT = "GLAM-KB-Library-Enricher/1.0 (https://github.com/sst/glam)"
# Rate limiting
REQUEST_DELAY = 0.5  # seconds to pause between consecutive SPARQL requests
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lower-cases the name, strips one common Dutch library prefix
    ("stichting", "openbare bibliotheek", "bibliotheek", "ob") and one
    common suffix, replaces punctuation with spaces, and collapses
    whitespace.
    """
    name = name.lower()
    # Remove common prefixes. Longest alternative first, and require a word
    # boundary (\b) so that e.g. "ob" does not eat the start of unrelated
    # words such as "obstakel" (the original pattern turned it into "stakel").
    name = re.sub(r'^(stichting|openbare bibliotheek|bibliotheek|ob)\b\s*', '', name)
    # Remove common suffixes
    name = re.sub(r'\s*(bibliotheek|library|bieb|bibl\.?)$', '', name)
    # Replace punctuation with spaces
    name = re.sub(r'[^\w\s]', ' ', name)
    # Collapse runs of whitespace
    name = ' '.join(name.split())
    return name.strip()
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0..1 similarity ratio between two normalized institution names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
def query_dutch_libraries_by_isil(client: httpx.Client, isil_codes: List[str]) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata (one batched SPARQL request) for libraries by ISIL code (P791).

    Args:
        client: shared httpx client (connection reuse across calls).
        isil_codes: ISIL codes to look up; duplicates are collapsed.

    Returns:
        Dict mapping ISIL code -> record with qid, name, description,
        identifiers, and optional coordinates / founding date.
        Empty dict when nothing was requested or on any error.
    """
    if not isil_codes:
        return {}

    def _literal(code: str) -> str:
        # Escape backslashes and double quotes so a malformed ISIL value
        # cannot break (or inject into) the SPARQL query string.
        return '"' + code.replace('\\', '\\\\').replace('"', '\\"') + '"'

    # De-duplicate while preserving order, then build the VALUES clause
    isil_values = " ".join(_literal(code) for code in dict.fromkeys(isil_codes))
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    """
    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }
    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()
        results = {}
        for binding in data.get("results", {}).get("bindings", []):
            isil = binding.get("isil", {}).get("value", "")
            if not isil:
                continue
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "isil": isil,
                "identifiers": {}
            }
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part ("1900-01-01T00:00:00Z" -> "1900-01-01")
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    # WKT point is "Point(lon lat)" -- longitude comes first
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[isil] = result
        return results
    except Exception as e:
        logger.error(f"Error querying Wikidata by ISIL: {e}")
        return {}
def query_dutch_public_libraries(client: httpx.Client) -> Dict[str, Dict[str, Any]]:
    """
    Fetch Dutch libraries (instance of library or any subclass, country
    Netherlands) from Wikidata via SPARQL, capped at 1000 rows.

    Returns a dict mapping QID -> library record; empty dict on any error.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Libraries in Netherlands
      ?item wdt:P31/wdt:P279* wd:Q7075 . # instance of library (or subclass)
      ?item wdt:P17 wd:Q55 . # country: Netherlands
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en" . }
    }
    LIMIT 1000
    """
    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }
    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        payload = response.json()
        libraries: Dict[str, Dict[str, Any]] = {}
        for row in payload.get("results", {}).get("bindings", []):
            uri = row.get("item", {}).get("value", "")
            qid = uri.rsplit("/", 1)[-1] if uri else None
            if not (qid and qid.startswith("Q")):
                continue
            record: Dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "identifiers": {},
            }
            if "isil" in row:
                record["isil"] = row["isil"]["value"]
            if "viaf" in row:
                record["identifiers"]["VIAF"] = row["viaf"]["value"]
            if "website" in row:
                record["identifiers"]["Website"] = row["website"]["value"]
            if "inception" in row:
                # Date part only ("1900-01-01T00:00:00Z" -> "1900-01-01")
                record["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                point = row["coords"]["value"]
                if point.startswith("Point("):
                    # WKT point is "Point(lon lat)" -- longitude first
                    lon, lat = point[6:-1].split()
                    record["latitude"] = float(lat)
                    record["longitude"] = float(lon)
            libraries[qid] = record
        return libraries
    except Exception as e:
        logger.error(f"Error querying Wikidata for Dutch libraries: {e}")
        return {}
def find_best_match(
    name: str,
    city: str,
    libraries: Dict[str, Dict[str, Any]],
    threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """
    Find the best fuzzy name match among `libraries`.

    The score is the normalized-name similarity plus a small boost when the
    city appears in the candidate's label (0.15) or description (0.1), so a
    boosted score can exceed 1.0.

    Returns a shallow copy of the winning record with a "match_score" key
    added, or None when no candidate reaches `threshold`. A copy is returned
    so the shared `libraries` dicts are never mutated (the previous
    implementation wrote "match_score" into the caller's data).
    """
    best_score = 0.0
    best_match: Optional[Dict[str, Any]] = None
    for lib_data in libraries.values():
        lib_name = lib_data.get("name", "")
        if not lib_name:
            continue
        # Calculate name similarity
        name_score = similarity_score(name, lib_name)
        # Boost score if city appears in library name or description
        city_boost = 0.0
        if city:
            city_lower = city.lower()
            if city_lower in lib_name.lower():
                city_boost = 0.15
            elif city_lower in lib_data.get("description", "").lower():
                city_boost = 0.1
        total_score = name_score + city_boost
        if total_score > best_score:
            best_score = total_score
            best_match = lib_data
    if best_match is not None and best_score >= threshold:
        result = dict(best_match)  # shallow copy: leave the shared record untouched
        result["match_score"] = best_score
        return result
    return None
def enrich_entry_with_wikidata(
    entry: Dict[str, Any],
    wikidata: Dict[str, Any],
    match_method: str
) -> Dict[str, Any]:
    """
    Attach a "wikidata_enrichment" section to `entry` (mutated in place)
    built from the matched Wikidata record, and return the entry.

    Optional fields (coordinates, inception date, identifiers, match score)
    are only included when present in `wikidata`.
    """
    block: Dict[str, Any] = {
        "wikidata_entity_id": wikidata["qid"],
        "wikidata_label": wikidata.get("name"),
        "wikidata_description": wikidata.get("description"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "match_method": match_method,
    }
    # Coordinates only when both halves are present
    if "latitude" in wikidata and "longitude" in wikidata:
        block["wikidata_coordinates"] = {
            "latitude": wikidata["latitude"],
            "longitude": wikidata["longitude"],
        }
    # Founding date (already trimmed to YYYY-MM-DD upstream)
    if "founding_date" in wikidata:
        block["wikidata_inception"] = wikidata["founding_date"]
    # External identifiers (VIAF, website, ...), skipped when empty
    if wikidata.get("identifiers"):
        block["wikidata_identifiers"] = wikidata["identifiers"]
    # Fuzzy-match confidence, rounded for readability
    if "match_score" in wikidata:
        block["match_confidence"] = round(wikidata["match_score"], 3)
    entry["wikidata_enrichment"] = block
    return entry
def process_kb_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """
    Process all KB ISIL library entry YAML files in `entries_dir`.

    For every entry without an existing Wikidata enrichment, try an exact
    ISIL-code match first, then fall back to fuzzy name matching. Updated
    entries are written back in place unless `dry_run` is set.

    Args:
        entries_dir: directory containing "*_kb_isil.yaml" files.
        dry_run: when True, nothing is written back.
        limit: optional cap on the number of files processed.

    Returns:
        Stats dict with counts: total_files, isil_matches, fuzzy_matches,
        not_found, already_enriched, errors.
    """
    stats = {
        "total_files": 0,
        "isil_matches": 0,
        "fuzzy_matches": 0,
        "not_found": 0,
        "already_enriched": 0,
        "errors": 0,
    }
    # Find all KB ISIL files (total_files counts everything, before the limit)
    kb_files = sorted(entries_dir.glob("*_kb_isil.yaml"))
    stats["total_files"] = len(kb_files)
    if limit:
        kb_files = kb_files[:limit]
    logger.info(f"Found {stats['total_files']} KB library entries")
    logger.info(f"Processing {len(kb_files)} files (limit: {limit or 'none'})")
    # First pass: load every entry and collect ISIL codes for one batched
    # SPARQL query.
    entries_data = []
    isil_codes = []
    for yaml_file in kb_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                continue
            # Skip entries that already carry a Wikidata enrichment
            if entry.get("wikidata_enrichment"):
                stats["already_enriched"] += 1
                continue
            # Prefer the ISIL code from the KB enrichment, falling back to
            # the original NDE entry.
            kb_enrichment = entry.get("kb_enrichment", {})
            isil_code = kb_enrichment.get("isil_code") or entry.get("original_entry", {}).get("isil_code_kb")
            if isil_code:
                isil_codes.append(isil_code)
            # FIX: entries without an ISIL code used to be dropped here, so
            # the fuzzy-name fallback never applied to them and they were
            # not counted. Keep them so they go through matching as well.
            entries_data.append({
                "file": yaml_file,
                "entry": entry,
                "isil_code": isil_code,
                "name": kb_enrichment.get("name") or entry.get("original_entry", {}).get("organisatie", ""),
                "city": kb_enrichment.get("city") or entry.get("original_entry", {}).get("plaatsnaam_bezoekadres", ""),
            })
        except Exception as e:
            logger.error(f"Error loading {yaml_file.name}: {e}")
            stats["errors"] += 1
    if not entries_data:
        logger.info("No entries to process")
        return stats
    logger.info(f"Collected {len(isil_codes)} ISIL codes for SPARQL query")
    with httpx.Client(timeout=60.0) as client:
        # Step 1: one batched Wikidata lookup for all ISIL codes
        logger.info("Querying Wikidata for libraries by ISIL codes...")
        isil_results = query_dutch_libraries_by_isil(client, isil_codes)
        logger.info(f"Found {len(isil_results)} libraries by ISIL code")
        time.sleep(REQUEST_DELAY)
        # Step 2: full list of Dutch libraries for the fuzzy fallback
        logger.info("Querying Wikidata for all Dutch libraries (for fuzzy matching)...")
        all_libraries = query_dutch_public_libraries(client)
        logger.info(f"Found {len(all_libraries)} Dutch libraries in Wikidata")
        time.sleep(REQUEST_DELAY)
        # Step 3: match and enrich each entry
        for entry_data in entries_data:
            yaml_file = entry_data["file"]
            entry = entry_data["entry"]
            isil_code = entry_data["isil_code"]
            name = entry_data["name"]
            city = entry_data["city"]
            logger.info(f"\nProcessing: {name} ({isil_code})")
            matched = False
            # Exact ISIL match takes precedence
            if isil_code and isil_code in isil_results:
                wikidata = isil_results[isil_code]
                logger.info(f" -> ISIL match: {wikidata['name']} ({wikidata['qid']})")
                entry = enrich_entry_with_wikidata(entry, wikidata, "isil_code_match")
                stats["isil_matches"] += 1
                matched = True
            # Fuzzy name fallback (needs a non-empty name; lower threshold)
            if not matched and name:
                fuzzy_match = find_best_match(name, city, all_libraries, threshold=0.75)
                if fuzzy_match:
                    logger.info(f" -> Fuzzy match: {fuzzy_match['name']} ({fuzzy_match['qid']}) [score: {fuzzy_match['match_score']:.3f}]")
                    entry = enrich_entry_with_wikidata(entry, fuzzy_match, "fuzzy_name_match")
                    stats["fuzzy_matches"] += 1
                    matched = True
            if not matched:
                logger.info(f" -> No match found")
                # Record the miss so a future run can skip or retry deliberately
                entry["wikidata_enrichment_status"] = "NOT_FOUND"
                entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                stats["not_found"] += 1
            # Persist the (possibly updated) entry back to its YAML file
            if not dry_run:
                try:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                except Exception as e:
                    logger.error(f"Error saving {yaml_file.name}: {e}")
                    stats["errors"] += 1
    return stats
def main():
    """Parse CLI arguments, run the enrichment, and report a summary.

    Returns a process exit code (0 on success, 1 when the entries
    directory is missing).
    """
    parser = argparse.ArgumentParser(
        description="Enrich KB library entries with Wikidata data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )
    args = parser.parse_args()
    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")
    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1
    # Process entries
    stats = process_kb_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
    )
    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("WIKIDATA ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total KB library files: {stats['total_files']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"ISIL code matches: {stats['isil_matches']}")
    logger.info(f"Fuzzy name matches: {stats['fuzzy_matches']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Errors: {stats['errors']}")
    total_enriched = stats["isil_matches"] + stats["fuzzy_matches"]
    # FIX: the denominator must be the number of entries actually run through
    # matching. The old formula (total_files - already_enriched - errors)
    # overcounts whenever --limit restricts the run, skewing the rate.
    total_processed = total_enriched + stats["not_found"]
    if total_processed > 0:
        success_rate = total_enriched / total_processed * 100
        logger.info(f"Success rate: {success_rate:.1f}%")
    # Save stats alongside the entries directory (skipped in dry-run mode)
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"kb_wikidata_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "dry_run": args.dry_run,
                "limit": args.limit,
                **stats
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")
    return 0
# Script entry point: propagate main()'s status code so shell callers
# (and CI) can detect failures.
if __name__ == "__main__":
    sys.exit(main())