# glam/scripts/enrich_nde_fast.py
# (file-listing metadata: 567 lines, 21 KiB, Python, executable file)
#!/usr/bin/env python3
"""
Fast Wikidata enrichment for NDE Register NL entries.
This script:
- Stores each enriched entry as a separate YAML file
- Allows resuming from last enriched entry
- Uses faster rate limiting (with exponential backoff on errors)
- Adds comprehensive API metadata to each entry
Usage:
python scripts/enrich_nde_fast.py
Output:
data/nde/enriched/entries/{index}_{wikidata_id}.yaml
"""
import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import logging
import re
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"  # Wikibase REST API base URL
SPARQL_URL = "https://query.wikidata.org/sparql"  # SPARQL endpoint (appears unused in this file)

# Rate limiting - FASTER but with backoff
# Wikimedia guidelines: be polite, use User-Agent, back off on errors
# Anonymous: 500 req/hr = ~8.3 req/min, we'll try 5 req/min (12 sec delay)
# If we get 429s, we back off exponentially
BASE_DELAY = 2.0  # Start with 2 seconds between requests
MAX_DELAY = 60.0  # Max backoff delay
BACKOFF_FACTOR = 2.0  # Double delay on each error

# Credentials and contact info come from the environment so they never live in code.
WIKIDATA_API_TOKEN = os.getenv("WIKIDATA_API_TOKEN", "")
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-data@example.com")
# Wikimedia etiquette: identify the client and include a contact address.
USER_AGENT = f"GLAMDataExtractor/1.0 ({WIKIMEDIA_CONTACT_EMAIL}) Python/httpx"

# Headers sent with every request; Authorization is added only when a token is set.
HEADERS = {
    "Accept": "application/json",
    "User-Agent": USER_AGENT,
}
if WIKIDATA_API_TOKEN:
    HEADERS["Authorization"] = f"Bearer {WIKIDATA_API_TOKEN}"
    logger.info("Using authenticated mode")
else:
    logger.info("Using anonymous mode")
@dataclass
class APIMetadata:
    """Metadata about a single API call, recorded for provenance/auditing."""
    api_endpoint: str      # base URL of the API used
    entity_id: str         # Wikidata Q-number that was requested
    request_url: str       # full URL of the GET request
    response_status: int   # HTTP status code of the response
    response_time_ms: float   # wall-clock latency in milliseconds
    fetch_timestamp: str   # ISO-8601 UTC timestamp taken just before the request
    user_agent: str        # User-Agent header that was sent
    authenticated: bool    # whether a bearer token was used
    rate_limit_delay_used: float  # inter-request delay (seconds) in effect at fetch time
@dataclass
class WikidataEnrichment:
    """Container for all Wikidata data extracted for an entity."""
    entity_id: str  # Q-number this enrichment describes
    labels: Dict[str, str] = field(default_factory=dict)         # language code -> label
    descriptions: Dict[str, str] = field(default_factory=dict)   # language code -> description
    aliases: Dict[str, List[str]] = field(default_factory=dict)  # language code -> alternative names
    sitelinks: Dict[str, str] = field(default_factory=dict)      # wiki site key -> page title
    claims: Dict[str, Any] = field(default_factory=dict)         # catch-all for properties without a dedicated field
    identifiers: Dict[str, str] = field(default_factory=dict)    # external IDs (VIAF, GND, ISIL, ...)
    instance_of: List[Dict[str, str]] = field(default_factory=list)  # P31 values, normalized to {"id": ...}
    country: Optional[Dict[str, str]] = None        # P17 as {"id": ...}
    location: Optional[Dict[str, str]] = None       # P131 as {"id": ...}
    coordinates: Optional[Dict[str, float]] = None  # P625 latitude/longitude/precision
    inception: Optional[str] = None     # P571 value (time string)
    dissolution: Optional[str] = None   # P576 value (time string)
    official_website: Optional[str] = None  # P856 value
    image: Optional[str] = None  # P18 value
    logo: Optional[str] = None   # P154 value
    api_metadata: Optional[Dict[str, Any]] = None  # provenance of the fetch (see APIMetadata)
# Property IDs for heritage institutions.
# Maps Wikidata P-numbers to the readable names used as keys in the output;
# properties absent from this map keep their raw P-number.
PROPERTY_LABELS = {
    "P31": "instance_of", "P17": "country", "P131": "located_in",
    "P625": "coordinates", "P571": "inception", "P576": "dissolution",
    "P856": "official_website", "P18": "image", "P154": "logo",
    "P791": "isil", "P214": "viaf", "P227": "gnd", "P244": "lcnaf",
    "P268": "bnf", "P269": "idref", "P213": "isni", "P1566": "geonames",
    "P2427": "grid", "P3500": "ringgold", "P5785": "museofile",
    "P8168": "factgrid", "P361": "part_of", "P355": "subsidiaries",
    "P749": "parent_org", "P127": "owned_by", "P1037": "director",
    "P159": "headquarters", "P463": "member_of", "P1435": "heritage_status",
    "P910": "topic_category", "P373": "commons_category", "P2044": "elevation",
    "P6375": "street_address", "P281": "postal_code", "P1329": "phone",
    "P968": "email", "P973": "described_at_url", "P8402": "kvk_number",
}
def fetch_entity_with_metadata(entity_id: str, client: httpx.Client, delay_used: float) -> tuple[Optional[Dict], Optional[APIMetadata]]:
    """
    Fetch entity data from the Wikibase REST API with comprehensive metadata tracking.

    Args:
        entity_id: Wikidata Q-number (e.g. "Q42").
        client: shared httpx client (connection pooling / timeout).
        delay_used: inter-request delay currently in effect, recorded for auditing.

    Returns:
        Tuple of (entity_data, api_metadata). entity_data is None on any failure;
        api_metadata is None only for unexpected non-HTTP errors (timeouts, DNS, ...).
    """
    url = f"{WIKIDATA_REST_API}/entities/items/{entity_id}"
    fetch_timestamp = datetime.now(timezone.utc).isoformat()
    start_time = time.perf_counter()

    def _metadata(status: int) -> APIMetadata:
        # Single place to assemble APIMetadata (was duplicated verbatim in the
        # success and HTTPStatusError paths).
        return APIMetadata(
            api_endpoint=WIKIDATA_REST_API,
            entity_id=entity_id,
            request_url=url,
            response_status=status,
            response_time_ms=round((time.perf_counter() - start_time) * 1000, 2),
            fetch_timestamp=fetch_timestamp,
            user_agent=USER_AGENT,
            authenticated=bool(WIKIDATA_API_TOKEN),
            rate_limit_delay_used=delay_used,
        )

    try:
        response = client.get(url, headers=HEADERS)
        metadata = _metadata(response.status_code)
        if response.status_code == 429:
            # Rate limited: report it via metadata so the caller can back off.
            logger.warning(f"Rate limited on {entity_id}")
            return None, metadata
        response.raise_for_status()
        return response.json(), metadata
    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error fetching {entity_id}: {e}")
        return None, _metadata(e.response.status_code)
    except Exception as e:
        # Network-level failure: no HTTP status exists, so no metadata either.
        logger.error(f"Error fetching {entity_id}: {e}")
        return None, None
def extract_value_from_statement(statement: Dict) -> Any:
    """Extract the plain value from a Wikidata statement structure.

    Expects the REST-API shape ``{"value": {"type": ..., "content": ...}}``.
    For type "value" the content is returned verbatim; otherwise dict content
    is unwrapped: entity references collapse to their Q-id, timestamps to their
    time string, and coordinates to a lat/lon/precision dict. Returns None
    when the statement cannot be interpreted.
    """
    try:
        payload = statement.get("value", {})
        content = payload.get("content")

        # Regular "value" statements carry their content directly.
        if payload.get("type") == "value":
            return content
        # Non-dict content (scalars, None) passes through unchanged.
        if not isinstance(content, dict):
            return content

        # Fallback unwrapping for dict-shaped content on other statement types.
        if "entity-type" in content or "id" in content:
            return content.get("id", content)
        if "time" in content:
            return content.get("time")
        if "latitude" in content and "longitude" in content:
            return {
                "latitude": content.get("latitude"),
                "longitude": content.get("longitude"),
                "precision": content.get("precision"),
            }
        return content
    except Exception:
        # A malformed statement should not abort the whole entity.
        return None
def parse_entity_data(entity_id: str, data: Dict, api_metadata: APIMetadata) -> WikidataEnrichment:
    """Parse the full entity payload into a WikidataEnrichment object."""
    enrichment = WikidataEnrichment(entity_id=entity_id)

    # Record provenance of the API call alongside the data itself.
    enrichment.api_metadata = {
        "api_endpoint": api_metadata.api_endpoint,
        "request_url": api_metadata.request_url,
        "response_status": api_metadata.response_status,
        "response_time_ms": api_metadata.response_time_ms,
        "fetch_timestamp": api_metadata.fetch_timestamp,
        "user_agent": api_metadata.user_agent,
        "authenticated": api_metadata.authenticated,
        "rate_limit_delay_used": api_metadata.rate_limit_delay_used,
    }

    # Multilingual terms pass through unchanged.
    enrichment.labels = data.get("labels", {})
    enrichment.descriptions = data.get("descriptions", {})
    enrichment.aliases = data.get("aliases", {})

    # Sitelinks: keep only the page title.
    for site, link in data.get("sitelinks", {}).items():
        enrichment.sitelinks[site] = link.get("title", str(link)) if isinstance(link, dict) else str(link)

    # Dispatch tables for properties that map onto dedicated attributes.
    scalar_attrs = {
        "P571": "inception",
        "P576": "dissolution",
        "P856": "official_website",
        "P18": "image",
        "P154": "logo",
    }
    entity_ref_attrs = {"P17": "country", "P131": "location"}
    identifier_props = {
        "P791", "P214", "P227", "P244", "P268", "P269", "P213",
        "P1566", "P2427", "P3500", "P5785", "P8168", "P8402",
    }

    for prop_id, prop_statements in data.get("statements", {}).items():
        prop_name = PROPERTY_LABELS.get(prop_id, prop_id)
        values = [
            v
            for v in (extract_value_from_statement(s) for s in (prop_statements or []))
            if v is not None
        ]
        if not values:
            continue
        if prop_id == "P31":
            # Normalize instance-of values to {"id": ...} dicts.
            enrichment.instance_of = [{"id": v} if isinstance(v, str) else v for v in values]
        elif prop_id in entity_ref_attrs:
            setattr(enrichment, entity_ref_attrs[prop_id], {"id": values[0]})
        elif prop_id == "P625":
            # Coordinates are kept only when already in dict form.
            if isinstance(values[0], dict):
                enrichment.coordinates = values[0]
        elif prop_id in scalar_attrs:
            setattr(enrichment, scalar_attrs[prop_id], values[0])
        elif prop_id in identifier_props:
            enrichment.identifiers[prop_name] = values[0]
        else:
            # Everything else lands in the generic claims bucket;
            # single values are unwrapped from their list.
            enrichment.claims[prop_name] = values[0] if len(values) == 1 else values

    return enrichment
def enrichment_to_dict(enrichment: WikidataEnrichment) -> Dict[str, Any]:
    """Convert WikidataEnrichment to a clean dictionary for YAML output.

    Key insertion order is deliberate (identity, provenance, terms,
    identifiers, location, dates, web/media, sitelinks, misc claims)
    because the YAML writer preserves it.
    """
    result: Dict[str, Any] = {"wikidata_entity_id": enrichment.entity_id}

    # Provenance of the API call comes first.
    if enrichment.api_metadata:
        result["api_metadata"] = enrichment.api_metadata

    # Terms: full language maps plus convenient nl/en shortcuts.
    if enrichment.labels:
        result["wikidata_labels"] = enrichment.labels
        for lang in ("nl", "en"):
            if lang in enrichment.labels:
                result[f"wikidata_label_{lang}"] = enrichment.labels[lang]
    if enrichment.descriptions:
        result["wikidata_descriptions"] = enrichment.descriptions
        for lang in ("nl", "en"):
            if lang in enrichment.descriptions:
                result[f"wikidata_description_{lang}"] = enrichment.descriptions[lang]
    if enrichment.aliases:
        result["wikidata_aliases"] = enrichment.aliases

    # External identifiers, dropping empty values.
    if enrichment.identifiers:
        result["wikidata_identifiers"] = {
            name: value for name, value in enrichment.identifiers.items() if value
        }

    # Remaining attributes map 1:1 onto output keys (order matters, see above).
    simple_fields = (
        ("instance_of", "wikidata_instance_of"),
        ("country", "wikidata_country"),
        ("location", "wikidata_located_in"),
        ("coordinates", "wikidata_coordinates"),
        ("inception", "wikidata_inception"),
        ("dissolution", "wikidata_dissolution"),
        ("official_website", "wikidata_official_website"),
        ("image", "wikidata_image"),
        ("logo", "wikidata_logo"),
        ("sitelinks", "wikidata_sitelinks"),
        ("claims", "wikidata_claims"),
    )
    for attr, key in simple_fields:
        value = getattr(enrichment, attr)
        if value:
            result[key] = value

    return result
def get_processed_entries(entries_dir: Path) -> set:
    """Return the set of entry indices that already have a YAML file on disk.

    Filenames look like "0001_Q12345.yaml"; the leading digits are the index.
    Files whose names don't start with digits-underscore are ignored.
    """
    index_pattern = re.compile(r"(\d+)_")
    return {
        int(match.group(1))
        for match in (index_pattern.match(path.name) for path in entries_dir.glob("*.yaml"))
        if match
    }
def save_entry(entries_dir: Path, index: int, entry: Dict, enrichment_data: Optional[Dict], error: Optional[str] = None):
    """Save a single entry to its own YAML file and return the written path.

    The filename embeds the zero-padded entry index plus (when well-formed)
    the Q-number, so finished work can be detected on restart. The file
    records the original entry, the enrichment result or error, and a status.
    """
    raw_id = entry.get("wikidata_id", "unknown")
    # Only a plain Q-number is safe to embed in the filename; URLs, free text
    # and non-strings get a placeholder instead.
    if isinstance(raw_id, str) and raw_id:
        safe_id = raw_id if re.match(r'^Q\d+$', raw_id) else "invalid_id"
    else:
        safe_id = "unknown"

    output = {
        "original_entry": entry,
        "entry_index": index,
        "processing_timestamp": datetime.now(timezone.utc).isoformat(),
    }
    if enrichment_data:
        output["wikidata_enrichment"] = enrichment_data
        output["enrichment_status"] = "success"
    elif error:
        output["enrichment_error"] = error
        output["enrichment_status"] = "error"
    else:
        output["enrichment_status"] = "skipped"
        output["skip_reason"] = "no_wikidata_id"

    filepath = entries_dir / f"{index:04d}_{safe_id}.yaml"
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(output, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return filepath
def main():
    """Main entry point with per-entry storage and resume capability.

    Loads the NDE register YAML, fetches Wikidata data for every entry that
    has a wikidata_id, writes one YAML file per entry, and adapts the
    inter-request delay based on rate-limit responses. Returns 0.
    """
    # Paths (data lives at <repo>/data/nde relative to this script)
    script_dir = Path(__file__).parent
    data_dir = script_dir.parent / "data" / "nde"
    input_file = data_dir / "nde_register_nl.yaml"
    entries_dir = data_dir / "enriched" / "entries"
    # Ensure entries directory exists
    entries_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Input file: {input_file}")
    logger.info(f"Entries directory: {entries_dir}")
    # Load input YAML
    logger.info("Loading input YAML file...")
    with open(input_file, 'r', encoding='utf-8') as f:
        entries = yaml.safe_load(f)
    # NOTE(review): assumes the YAML root is a list of dicts — confirm against
    # the register file (an empty file would make entries None and len() fail).
    total_entries = len(entries)
    logger.info(f"Loaded {total_entries} entries")
    # Check for already processed entries (this is what enables resuming)
    processed_indices = get_processed_entries(entries_dir)
    logger.info(f"Found {len(processed_indices)} already processed entries")
    # Count entries with wikidata_id
    entries_with_wikidata = [i for i, e in enumerate(entries) if e.get("wikidata_id")]
    remaining_to_process = [i for i in entries_with_wikidata if i not in processed_indices]
    logger.info(f"Entries with wikidata_id: {len(entries_with_wikidata)}")
    logger.info(f"Remaining to process: {len(remaining_to_process)}")
    # Stats
    success_count = 0
    skip_count = 0
    error_count = 0
    rate_limit_count = 0
    # Adaptive delay: starts at BASE_DELAY, grows on 429s, shrinks after successes
    current_delay = BASE_DELAY
    consecutive_successes = 0
    with httpx.Client(timeout=30.0) as client:
        for i, entry in enumerate(entries):
            # Skip already processed
            if i in processed_indices:
                continue
            wikidata_id = entry.get("wikidata_id")
            org_name = entry.get("organisatie", "Unknown")
            if not wikidata_id:
                # Save as skipped
                save_entry(entries_dir, i, entry, None)
                skip_count += 1
                continue
            # Validate and normalize wikidata_id
            wikidata_id_str = str(wikidata_id).strip()
            # Check if it's a valid Q-number or can be converted to one
            if re.match(r'^Q\d+$', wikidata_id_str):
                # Already valid Q-number
                pass
            elif re.match(r'^\d+$', wikidata_id_str):
                # Just a number, add Q prefix
                wikidata_id_str = f"Q{wikidata_id_str}"
            else:
                # Invalid format (URL, text, etc.) - skip with error
                logger.warning(f"[{i+1}/{total_entries}] Invalid wikidata_id format: {wikidata_id_str} for {org_name}")
                save_entry(entries_dir, i, entry, None, f"Invalid wikidata_id format: {wikidata_id_str}")
                error_count += 1
                continue
            # Log progress
            progress_pct = (i + 1) / total_entries * 100
            logger.info(f"[{i+1}/{total_entries}] ({progress_pct:.1f}%) Enriching: {org_name} ({wikidata_id_str}) [delay={current_delay:.1f}s]")
            # Fetch and enrich
            try:
                data, metadata = fetch_entity_with_metadata(wikidata_id_str, client, current_delay)
                if data and metadata:
                    enrichment = parse_entity_data(wikidata_id_str, data, metadata)
                    enrichment_dict = enrichment_to_dict(enrichment)
                    save_entry(entries_dir, i, entry, enrichment_dict)
                    success_count += 1
                    consecutive_successes += 1
                    # Reduce delay after consecutive successes (min BASE_DELAY)
                    if consecutive_successes >= 5 and current_delay > BASE_DELAY:
                        current_delay = max(BASE_DELAY, current_delay / 1.5)
                        consecutive_successes = 0
                        logger.info(f"Reducing delay to {current_delay:.1f}s after successful requests")
                elif metadata and metadata.response_status == 429:
                    # Rate limited - back off
                    rate_limit_count += 1
                    consecutive_successes = 0
                    current_delay = min(MAX_DELAY, current_delay * BACKOFF_FACTOR)
                    logger.warning(f"Rate limited! Increasing delay to {current_delay:.1f}s")
                    # Save as error but allow retry later
                    # NOTE(review): writing a file here makes get_processed_entries()
                    # count this index as done, so a resumed run will NOT retry it
                    # unless the error file is deleted first — confirm this is intended.
                    save_entry(entries_dir, i, entry, None, f"Rate limited (429)")
                    error_count += 1
                    # Extra wait on rate limit
                    time.sleep(current_delay * 2)
                else:
                    save_entry(entries_dir, i, entry, None, "Failed to fetch from Wikidata")
                    error_count += 1
                    consecutive_successes = 0
            except Exception as e:
                logger.error(f"Error processing {org_name}: {e}")
                save_entry(entries_dir, i, entry, None, str(e))
                error_count += 1
                consecutive_successes = 0
            # Rate limiting delay between requests
            time.sleep(current_delay)
    # Final summary
    logger.info("=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total entries: {total_entries}")
    logger.info(f"Entries with wikidata_id: {len(entries_with_wikidata)}")
    logger.info(f"Already processed (skipped): {len(processed_indices)}")
    logger.info(f"Successfully enriched this run: {success_count}")
    logger.info(f"Skipped (no wikidata_id): {skip_count}")
    logger.info(f"Errors: {error_count}")
    logger.info(f"Rate limit hits: {rate_limit_count}")
    logger.info(f"Entries directory: {entries_dir}")
    # Create summary log (one JSON file per run, timestamped)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    log_file = data_dir / "enriched" / f"enrichment_log_{timestamp}.json"
    log_data = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "input_file": str(input_file),
        "entries_directory": str(entries_dir),
        "total_entries": total_entries,
        "entries_with_wikidata_id": len(entries_with_wikidata),
        "previously_processed": len(processed_indices),
        "successfully_enriched_this_run": success_count,
        "skipped_no_wikidata_id": skip_count,
        "errors": error_count,
        "rate_limit_hits": rate_limit_count,
        "authenticated": bool(WIKIDATA_API_TOKEN),
        "base_delay_seconds": BASE_DELAY,
        "final_delay_seconds": current_delay,
    }
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(log_data, f, indent=2)
    logger.info(f"Log file: {log_file}")
    return 0
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())