#!/usr/bin/env python3
|
|
"""
|
|
Enrich Czech custodian files using Sigla identifier matching against Wikidata.
|
|
|
|
Czech libraries have Sigla codes (e.g., "BEG501") which are stored in Wikidata
|
|
as property P9559. This script:
|
|
1. Fetches all Sigla→Wikidata mappings from Wikidata
|
|
2. Matches our CZ files by Sigla code
|
|
3. Enriches matched files with Wikidata metadata
|
|
|
|
Usage:
|
|
python scripts/enrich_czech_sigla.py [--dry-run] [--limit N]
|
|
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import json
|
|
import logging
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import yaml
|
|
|
|
# Configure logging: mirror every message to a log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('cz_sigla_enrichment.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Wikidata endpoints: SPARQL for the bulk Sigla query, REST API for
# per-entity detail lookups.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1/entities/items"

# Rate limiting
REQUEST_DELAY = 0.5  # seconds between requests
def fetch_all_sigla_mappings() -> dict[str, dict]:
    """
    Fetch every Czech Sigla → Wikidata QID mapping in a single SPARQL query.

    Returns:
        Dict keyed by Sigla code; each value carries qid, label,
        description, and (possibly empty) coordinates. Empty dict on
        any request failure.
    """
    logger.info("Fetching all Czech Sigla mappings from Wikidata...")

    # P9559 is the Sigla identifier property; P625 are coordinates.
    query = """
    SELECT ?item ?itemLabel ?sigla ?itemDescription ?coord WHERE {
      ?item wdt:P9559 ?sigla .
      OPTIONAL { ?item wdt:P625 ?coord . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "cs,en" }
    }
    """

    request_headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": "GLAM-Heritage-Custodian-Project/1.0 (https://github.com/heritage-custodian; contact@example.org) Python/httpx",
    }

    try:
        with httpx.Client(timeout=60.0) as client:
            resp = client.post(
                WIKIDATA_SPARQL_ENDPOINT,
                data={"query": query, "format": "json"},
                headers=request_headers,
            )
            resp.raise_for_status()
            payload = resp.json()
    except Exception as exc:
        logger.error(f"Failed to fetch Sigla mappings: {exc}")
        return {}

    sigla_index: dict[str, dict] = {}
    for row in payload.get("results", {}).get("bindings", []):
        code = row.get("sigla", {}).get("value", "")
        # The ?item binding is a full entity URI; the QID is its last segment.
        qid = row.get("item", {}).get("value", "").split("/")[-1]
        if not (code and qid):
            continue
        sigla_index[code] = {
            "qid": qid,
            "label": row.get("itemLabel", {}).get("value", ""),
            "description": row.get("itemDescription", {}).get("value", ""),
            "coordinates": row.get("coord", {}).get("value", ""),
        }

    logger.info(f"Fetched {len(sigla_index)} Sigla→Wikidata mappings")
    return sigla_index
def fetch_wikidata_details(qid: str) -> dict | None:
    """Fetch detailed entity data from the Wikidata REST API.

    Returns the parsed JSON document, or None when the entity is missing
    (HTTP 404) or the request fails for any other reason.
    """
    entity_url = f"{WIKIDATA_REST_API}/{qid}"
    request_headers = {
        "Accept": "application/json",
        "User-Agent": "GLAM-Heritage-Custodian-Project/1.0 (https://github.com/heritage-custodian; contact@example.org) Python/httpx",
    }

    try:
        with httpx.Client(timeout=30.0) as client:
            resp = client.get(entity_url, headers=request_headers)
            # A 404 simply means the entity does not exist — not an error.
            if resp.status_code == 404:
                return None
            resp.raise_for_status()
            return resp.json()
    except Exception as exc:
        logger.warning(f"Failed to fetch details for {qid}: {exc}")
        return None
def extract_sigla_from_file(filepath: Path) -> str | None:
    """Extract the Sigla identifier from a custodian YAML file.

    Searches ``original_entry.identifiers`` first, then the top-level
    ``identifiers`` list, returning the first value whose scheme is "Sigla".

    Args:
        filepath: Path to the custodian YAML file.

    Returns:
        The Sigla code, or None when absent or the file cannot be parsed.
    """
    try:
        with open(filepath) as f:
            data = yaml.safe_load(f)

        # Both locations a Sigla identifier may live, in priority order;
        # the duplicated scan loops are unified into one pass.
        candidate_lists = (
            data.get("original_entry", {}).get("identifiers", []),
            data.get("identifiers", []),
        )
        for identifiers in candidate_lists:
            for ident in identifiers:
                if ident.get("identifier_scheme") == "Sigla":
                    return ident.get("identifier_value")

    except Exception as e:
        # Best-effort: a malformed/unreadable file is logged, not fatal.
        logger.warning(f"Error reading {filepath}: {e}")

    return None
def is_already_enriched(filepath: Path) -> bool:
    """Check whether a custodian file already carries Wikidata enrichment.

    A file counts as enriched when it has a ``wikidata_enrichment`` section
    or a "Wikidata" entry in its top-level identifiers list.

    Args:
        filepath: Path to the custodian YAML file.

    Returns:
        True if already enriched; False otherwise (including on read
        errors, so unreadable files remain candidates and surface later).
    """
    try:
        with open(filepath) as f:
            data = yaml.safe_load(f)

        # An existing enrichment section is the primary marker.
        if data.get("wikidata_enrichment"):
            return True

        # A Wikidata identifier also counts as prior enrichment.
        for ident in data.get("identifiers", []):
            if ident.get("identifier_scheme") == "Wikidata":
                return True

    except Exception as e:
        # Was a bare `except: pass`, which silently swallowed everything
        # (even KeyboardInterrupt). Keep the best-effort semantics but
        # narrow the catch and log what happened.
        logger.warning(f"Error checking enrichment status of {filepath}: {e}")

    return False
def _parse_point(coord_str: str) -> dict | None:
    """Parse a WKT "Point(lon lat)" literal into {longitude, latitude}, or None."""
    if not coord_str.startswith("Point("):
        return None
    try:
        lon, lat = coord_str.replace("Point(", "").replace(")", "").split()
        return {"longitude": float(lon), "latitude": float(lat)}
    except ValueError:
        # Malformed literal — skip coordinates rather than fail enrichment.
        return None


def _first_statement_value(statements: dict, prop: str):
    """Return the first non-empty `value.content` for a Wikidata property, or None."""
    for stmt in statements.get(prop, []):
        val = stmt.get("value", {}).get("content")
        if val:
            return val
    return None


def enrich_file(filepath: Path, wikidata_info: dict, fetch_details: bool = True) -> bool:
    """
    Enrich a custodian file with Wikidata data.

    Args:
        filepath: Path to YAML file
        wikidata_info: Dict with qid, label, description from SPARQL
        fetch_details: Whether to fetch additional details via REST API

    Returns:
        True if file was enriched, False otherwise
    """
    try:
        with open(filepath) as f:
            data = yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Error reading {filepath}: {e}")
        return False

    qid = wikidata_info["qid"]

    # Optionally fetch additional details (rate-limited to stay polite).
    details = None
    if fetch_details:
        time.sleep(REQUEST_DELAY)
        details = fetch_wikidata_details(qid)

    # Build the enrichment record from the SPARQL match.
    enrichment = {
        "wikidata_id": qid,
        "wikidata_url": f"https://www.wikidata.org/wiki/{qid}",
        "matched_by": "sigla_identifier",
        "matched_sigla": extract_sigla_from_file(filepath),
        "wikidata_label": wikidata_info.get("label", ""),
        "wikidata_description": wikidata_info.get("description", ""),
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_version": "2.1.0"
    }

    # Add coordinates if available (WKT "Point(lon lat)" format).
    coords = _parse_point(wikidata_info.get("coordinates", ""))
    if coords:
        enrichment["wikidata_coordinates"] = coords

    # Extract additional info from the REST API response.
    if details:
        statements = details.get("statements", {})

        # P856 - official website
        website = _first_statement_value(statements, "P856")
        if website:
            enrichment["official_website"] = website

        # P18 - image: build a Commons FilePath URL from the file name.
        image = _first_statement_value(statements, "P18")
        if image:
            enrichment["image"] = f"https://commons.wikimedia.org/wiki/Special:FilePath/{image.replace(' ', '_')}"

        # P31 - instance of (to get institution type); keep every value.
        instance_types = [
            stmt.get("value", {}).get("content")
            for stmt in statements.get("P31", [])
            if stmt.get("value", {}).get("content")
        ]
        if instance_types:
            enrichment["instance_of"] = instance_types

        # P571 - inception date (value content is a time dict, not a scalar).
        for stmt in statements.get("P571", []):
            val = stmt.get("value", {}).get("content", {})
            if isinstance(val, dict) and "time" in val:
                enrichment["inception"] = val["time"]
                break

        # P131 - located in administrative entity
        located_in = _first_statement_value(statements, "P131")
        if located_in:
            enrichment["located_in"] = located_in

    # Update the in-memory document.
    data["wikidata_enrichment"] = enrichment

    # Also add Wikidata identifier to identifiers list if not present.
    identifiers = data.get("identifiers", [])
    if not any(i.get("identifier_scheme") == "Wikidata" for i in identifiers):
        identifiers.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": qid,
            "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
        })
        data["identifiers"] = identifiers

    # Write back.
    try:
        with open(filepath, "w") as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        return True
    except Exception as e:
        logger.error(f"Error writing {filepath}: {e}")
        return False
def main() -> int:
    """Run Sigla-based Wikidata enrichment over all Czech custodian files.

    Returns:
        Process exit code: 0 on success, 1 when the Sigla mappings could
        not be fetched from Wikidata.
    """
    parser = argparse.ArgumentParser(description="Enrich Czech custodian files using Sigla matching")
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes, just report matches")
    parser.add_argument("--limit", type=int, default=0, help="Limit number of files to process (0=all)")
    parser.add_argument("--no-details", action="store_true", help="Skip fetching detailed entity data")
    # The data directory was previously hard-coded to one developer's
    # machine; expose it as an option with that path as the default so
    # existing invocations keep working.
    parser.add_argument(
        "--data-dir",
        default="/Users/kempersc/apps/glam/data/custodian",
        help="Directory containing CZ-*.yaml custodian files",
    )
    args = parser.parse_args()

    # Find all CZ files (sorted for a deterministic processing order).
    cz_files = sorted(glob.glob(str(Path(args.data_dir) / "CZ-*.yaml")))
    logger.info(f"Found {len(cz_files)} Czech custodian files")

    # Fetch all Sigla mappings up front; everything else is local matching.
    sigla_mappings = fetch_all_sigla_mappings()
    if not sigla_mappings:
        logger.error("Failed to fetch Sigla mappings, aborting")
        return 1

    # Track statistics
    stats = {
        "total_files": len(cz_files),
        "files_with_sigla": 0,
        "already_enriched": 0,
        "matches_found": 0,
        "files_enriched": 0,
        "no_match": 0,
        "errors": 0
    }

    processed = 0
    for filepath in cz_files:
        filepath = Path(filepath)

        # Check limit (counts only files that actually have a Sigla).
        if args.limit > 0 and processed >= args.limit:
            logger.info(f"Reached limit of {args.limit} files")
            break

        # Extract Sigla from file; files without one are skipped silently.
        sigla = extract_sigla_from_file(filepath)
        if not sigla:
            continue

        stats["files_with_sigla"] += 1
        processed += 1

        # Check if already enriched
        if is_already_enriched(filepath):
            stats["already_enriched"] += 1
            continue

        # Look up in Wikidata mappings
        if sigla not in sigla_mappings:
            stats["no_match"] += 1
            if processed % 500 == 0:
                logger.info(f"Processed {processed} files, {stats['matches_found']} matches so far")
            continue

        wikidata_info = sigla_mappings[sigla]
        stats["matches_found"] += 1

        logger.info(f"Match: {filepath.name} (Sigla: {sigla}) → {wikidata_info['qid']} ({wikidata_info['label']})")

        if args.dry_run:
            continue

        # Enrich the file
        if enrich_file(filepath, wikidata_info, fetch_details=not args.no_details):
            stats["files_enriched"] += 1
            # Progress update on every 50th successful enrichment only.
            # (Previously this check ran on every iteration, so a stalled
            # count — e.g. 0 with repeated errors — logged repeatedly.)
            if stats["files_enriched"] % 50 == 0:
                logger.info(f"Progress: {stats['files_enriched']} files enriched")
        else:
            stats["errors"] += 1

    # Final report
    logger.info("=" * 60)
    logger.info("Czech Sigla Enrichment Complete")
    logger.info("=" * 60)
    logger.info(f"Total CZ files: {stats['total_files']}")
    logger.info(f"Files with Sigla: {stats['files_with_sigla']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"Sigla matches found: {stats['matches_found']}")
    logger.info(f"Files enriched: {stats['files_enriched']}")
    logger.info(f"No Wikidata match: {stats['no_match']}")
    logger.info(f"Errors: {stats['errors']}")

    match_rate = (stats['matches_found'] / stats['files_with_sigla'] * 100) if stats['files_with_sigla'] > 0 else 0
    logger.info(f"Match rate: {match_rate:.1f}%")

    return 0
# Script entry point: propagate main()'s exit status to the shell.
if __name__ == "__main__":
    sys.exit(main())
|