#!/usr/bin/env python3
"""
Resolve Q and P numbers to human-readable labels in enriched entries.

This script:
- Scans enriched YAML files for Q-numbers (entities) and P-numbers (properties)
- Fetches labels and basic info from Wikidata
- Updates entries IN-PLACE, keeping original Q/P numbers and adding labels
- Caches resolved entities to avoid duplicate API calls

Usage:
    python scripts/resolve_qp_labels.py

Output:
    Updates files in data/nde/enriched/entries/ with resolved labels
"""
# Standard library
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

# Third-party
import httpx
import yaml
# Logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# --- Configuration ----------------------------------------------------------

# Wikidata endpoints.
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
WIKIDATA_API = "https://www.wikidata.org/w/api.php"

# Rate limiting between batched requests.
BASE_DELAY = 0.5  # Faster since we batch requests
MAX_BATCH_SIZE = 50  # Wikidata API limit for wbgetentities

# A contact address is embedded in the User-Agent, per Wikimedia etiquette;
# override via the WIKIMEDIA_CONTACT_EMAIL environment variable.
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-data@example.com")
USER_AGENT = f"GLAMDataExtractor/1.0 ({WIKIMEDIA_CONTACT_EMAIL}) Python/httpx"

HEADERS = {
    "Accept": "application/json",
    "User-Agent": USER_AGENT,
}
@dataclass
class EntityInfo:
    """Basic info about a Wikidata entity.

    Carries the English/Dutch labels and descriptions fetched from Wikidata,
    plus either the P31 classification (Q entities) or the property datatype
    (P entities). All fields except `id` are optional.
    """
    id: str  # Q or P identifier, e.g. "Q55" or "P276"
    label_en: Optional[str] = None
    label_nl: Optional[str] = None
    description_en: Optional[str] = None
    description_nl: Optional[str] = None
    instance_of: Optional[List[str]] = None  # For Q entities: Q-ids from the first few P31 claims
    property_type: Optional[str] = None  # For P entities (e.g., "external-id", "wikibase-item")
# Cache for resolved entities, keyed by Q/P id. Populated by
# fetch_entities_batch() and pre-loaded from entity_cache.json in main()
# so repeated runs avoid duplicate API calls.
entity_cache: Dict[str, EntityInfo] = {}
def extract_qp_numbers(obj: Any, found: Optional[Set[str]] = None) -> Set[str]:
    """Recursively extract all Q and P numbers from a nested structure.

    Strings are scanned for word-delimited Q/P identifiers; dict keys are
    checked too, because claims dicts use keys like "P276".

    Args:
        obj: Arbitrary nested structure of dicts / lists / strings.
        found: Accumulator used during recursion; a fresh set is created
            when omitted.

    Returns:
        The set of all matched identifiers, e.g. {"Q55", "P276"}.
    """
    if found is None:
        found = set()

    if isinstance(obj, str):
        # Match Q or P followed by digits, on word boundaries.
        found.update(re.findall(r'\b([QP]\d+)\b', obj))
    elif isinstance(obj, dict):
        for key, value in obj.items():
            # Check keys too (like "P276"). YAML allows non-string keys,
            # which would make re.match raise TypeError, so guard first.
            if isinstance(key, str) and re.match(r'^[QP]\d+$', key):
                found.add(key)
            extract_qp_numbers(value, found)
    elif isinstance(obj, list):
        for item in obj:
            extract_qp_numbers(item, found)

    return found
def _parse_entity(entity_id: str, entity_data: Dict[str, Any]) -> EntityInfo:
    """Build an EntityInfo from one wbgetentities result record."""
    info = EntityInfo(id=entity_id)

    # Labels / descriptions, restricted to the en|nl languages we requested.
    labels = entity_data.get("labels", {})
    if "en" in labels:
        info.label_en = labels["en"].get("value")
    if "nl" in labels:
        info.label_nl = labels["nl"].get("value")

    descriptions = entity_data.get("descriptions", {})
    if "en" in descriptions:
        info.description_en = descriptions["en"].get("value")
    if "nl" in descriptions:
        info.description_nl = descriptions["nl"].get("value")

    # For properties, record the datatype (e.g. "external-id").
    if entity_id.startswith("P"):
        info.property_type = entity_data.get("datatype")

    # For items, record what the entity is an instance of (P31).
    if entity_id.startswith("Q"):
        claims = entity_data.get("claims", {})
        instance_of_ids = []
        for claim in claims.get("P31", [])[:3]:  # Limit to first 3
            datavalue = claim.get("mainsnak", {}).get("datavalue", {})
            if datavalue.get("type") == "wikibase-entityid":
                qid = datavalue.get("value", {}).get("id")
                if qid:
                    instance_of_ids.append(qid)
        if instance_of_ids:
            info.instance_of = instance_of_ids

    return info


def fetch_entities_batch(entity_ids: List[str], client: httpx.Client) -> Dict[str, EntityInfo]:
    """Fetch multiple entities in a single API call.

    Uses the wbgetentities action to resolve up to MAX_BATCH_SIZE ids at
    once (the caller is responsible for batching).

    Args:
        entity_ids: Q/P identifiers to resolve.
        client: Shared httpx client used for the request.

    Returns:
        Mapping of id -> EntityInfo for every id that exists; ids flagged
        "missing" by Wikidata are skipped. Each result is also stored in
        the module-level entity_cache. On a request/HTTP/JSON error the
        error is logged and an empty dict is returned (best-effort).
    """
    if not entity_ids:
        return {}

    params = {
        "action": "wbgetentities",
        "ids": "|".join(entity_ids),
        "props": "labels|descriptions|claims|datatype",
        "languages": "en|nl",
        "format": "json",
    }

    # Keep the try narrow: only the network round-trip and JSON decode can
    # legitimately fail; parsing below works on plain dicts via .get().
    try:
        response = client.get(WIKIDATA_API, params=params, headers=HEADERS)
        response.raise_for_status()
        entities = response.json().get("entities", {})
    except Exception as e:
        logger.error(f"Error fetching batch: {e}")
        return {}

    results: Dict[str, EntityInfo] = {}
    for entity_id, entity_data in entities.items():
        if "missing" in entity_data:
            continue
        info = _parse_entity(entity_id, entity_data)
        results[entity_id] = info
        entity_cache[entity_id] = info

    return results
def entity_info_to_dict(info: EntityInfo) -> Dict[str, Any]:
    """Convert EntityInfo to a dictionary for YAML output.

    "id" is always present; the remaining fields are emitted only when
    they hold a truthy value, keeping the YAML compact.
    """
    optional_fields = (
        "label_en",
        "label_nl",
        "description_en",
        "description_nl",
        "instance_of",
        "property_type",
    )

    result: Dict[str, Any] = {"id": info.id}
    for name in optional_fields:
        value = getattr(info, name)
        if value:
            result[name] = value

    return result
def resolve_value(value: Any, resolved_entities: Dict[str, EntityInfo]) -> Any:
    """
    Resolve Q/P numbers in a value, keeping originals and adding labels.

    Transforms:
        "Q33506" -> {"id": "Q33506", "label_en": "museum", ...}
        {"id": "Q55"} -> {"id": "Q55", "label_en": "Netherlands", ...}
    """
    if isinstance(value, str):
        # Bare Q/P number with a known resolution -> expand into a dict;
        # anything else passes through untouched.
        if re.match(r'^[QP]\d+$', value) and value in resolved_entities:
            return entity_info_to_dict(resolved_entities[value])
        return value

    if isinstance(value, list):
        return [resolve_value(item, resolved_entities) for item in value]

    if isinstance(value, dict):
        entity_id = value.get("id")
        if (
            isinstance(entity_id, str)
            and re.match(r'^[QP]\d+$', entity_id)
            and entity_id in resolved_entities
        ):
            # Expand the reference, then carry over any extra fields the
            # original dict had that the resolution did not provide.
            merged = entity_info_to_dict(resolved_entities[entity_id])
            for extra_key, extra_value in value.items():
                merged.setdefault(extra_key, extra_value)
            return merged

        # Ordinary dict: resolve each value recursively.
        return {k: resolve_value(v, resolved_entities) for k, v in value.items()}

    return value
def resolve_claims_keys(claims: Dict[str, Any], resolved_entities: Dict[str, EntityInfo]) -> Dict[str, Any]:
    """
    Resolve P-number keys in claims dict to include labels.

    Transforms:
        {"P276": "Q3028083", ...}
    To:
        {"P276_location": {"property": {"id": "P276", "label_en": "location"}, "value": {...}}, ...}
    """
    out: Dict[str, Any] = {}

    for key, value in claims.items():
        resolved_val = resolve_value(value, resolved_entities)
        prop_info = resolved_entities.get(key) if re.match(r'^P\d+$', key) else None

        if prop_info is None:
            # Not a P-number key, or not resolved: keep the key, but still
            # resolve the value.
            out[key] = resolved_val
            continue

        # Build a descriptive key such as "P276_location", preferring the
        # English label, then Dutch, then the raw P-number.
        label = prop_info.label_en or prop_info.label_nl or key
        safe_label = re.sub(r'[^a-zA-Z0-9_]', '_', label.lower())
        out[f"{key}_{safe_label}"] = {
            "property": entity_info_to_dict(prop_info),
            "value": resolved_val,
        }

    return out
def process_enrichment(enrichment: Dict[str, Any], resolved_entities: Dict[str, EntityInfo]) -> Dict[str, Any]:
    """Process the wikidata_enrichment section to add resolved labels.

    Returns a shallow copy of `enrichment` with:
      - a "_resolved_entities" index of every resolvable Q/P number found
        anywhere in the section (also the marker that a file is processed),
      - label-resolved values for the simple wikidata_* entity fields,
      - claims with both their P-number keys and their values resolved.
    The input dict itself is not mutated.
    """
    result = dict(enrichment)

    # Add a resolved_entities section with all entity info.
    qp_in_enrichment = extract_qp_numbers(enrichment)
    result["_resolved_entities"] = {
        qp: entity_info_to_dict(resolved_entities[qp])
        for qp in sorted(qp_in_enrichment)
        if qp in resolved_entities
    }

    # Resolve the simple entity-valued sections uniformly (previously three
    # copy-pasted stanzas).
    for section in ("wikidata_instance_of", "wikidata_country", "wikidata_located_in"):
        if section in result:
            result[section] = resolve_value(result[section], resolved_entities)

    # Claims need both keys and values resolved.
    if "wikidata_claims" in result:
        result["wikidata_claims"] = resolve_claims_keys(
            result["wikidata_claims"], resolved_entities
        )

    return result
def process_file(filepath: Path, resolved_entities: Dict[str, EntityInfo]) -> bool:
    """Process a single enriched entry file.

    Loads the YAML file, resolves Q/P labels in its wikidata_enrichment
    section, stamps the resolution time, and rewrites the file in place.

    Returns True when the file was updated; False when it is empty, has no
    enrichment, was already processed, or any error occurred (logged).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            data = yaml.safe_load(handle)

        # Nothing to do for empty files or files without enrichment data.
        if not data or not data.get("wikidata_enrichment"):
            return False
        enrichment = data["wikidata_enrichment"]

        # The "_resolved_entities" key marks an already-processed file.
        if "_resolved_entities" in enrichment:
            return False

        data["wikidata_enrichment"] = process_enrichment(enrichment, resolved_entities)
        data["qp_resolution_timestamp"] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as handle:
            yaml.dump(data, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)

        return True

    except Exception as e:
        logger.error(f"Error processing {filepath}: {e}")
        return False
def _load_entity_cache(cache_file: Path) -> None:
    """Pre-populate the module-level entity_cache from a JSON cache file (best-effort)."""
    if not cache_file.exists():
        return
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cache_data = json.load(f)
        for entity_id, info_dict in cache_data.items():
            entity_cache[entity_id] = EntityInfo(
                id=info_dict["id"],
                label_en=info_dict.get("label_en"),
                label_nl=info_dict.get("label_nl"),
                description_en=info_dict.get("description_en"),
                description_nl=info_dict.get("description_nl"),
                instance_of=info_dict.get("instance_of"),
                property_type=info_dict.get("property_type"),
            )
        logger.info(f"Loaded {len(entity_cache)} cached entities")
    except Exception as e:
        # A corrupt cache is not fatal; everything will simply be re-fetched.
        logger.warning(f"Could not load cache: {e}")


def _save_entity_cache(cache_file: Path) -> None:
    """Write the module-level entity_cache out as pretty-printed JSON."""
    cache_data = {
        entity_id: entity_info_to_dict(info)
        for entity_id, info in entity_cache.items()
    }
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(cache_data, f, indent=2, ensure_ascii=False)
    logger.info(f"Saved cache to {cache_file}")


def _scan_entries(yaml_files: List[Path]) -> tuple:
    """First pass: collect unprocessed files and every Q/P number they mention.

    Returns (files_to_process, all_qp_numbers). Per-file errors are logged
    and the file is skipped.
    """
    all_qp_numbers: Set[str] = set()
    files_to_process: List[Path] = []

    for filepath in yaml_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if not data:
                continue

            enrichment = data.get("wikidata_enrichment")
            if not enrichment:
                continue

            # Skip files already stamped with resolved entities.
            if "_resolved_entities" in enrichment:
                continue

            files_to_process.append(filepath)
            all_qp_numbers.update(extract_qp_numbers(enrichment))

        except Exception as e:
            logger.warning(f"Error scanning {filepath}: {e}")

    return files_to_process, all_qp_numbers


def main():
    """Main entry point: scan entries, fetch missing entities, rewrite files."""
    script_dir = Path(__file__).parent
    entries_dir = script_dir.parent / "data" / "nde" / "enriched" / "entries"
    cache_file = entries_dir.parent / "entity_cache.json"

    logger.info(f"Entries directory: {entries_dir}")
    if not entries_dir.is_dir():
        # Without the entries directory there is nothing to process, and the
        # cache file could not be written either — fail fast with a clear error
        # instead of crashing later on the cache write.
        logger.error(f"Entries directory does not exist: {entries_dir}")
        return 1

    _load_entity_cache(cache_file)

    yaml_files = sorted(entries_dir.glob("*.yaml"))
    logger.info(f"Found {len(yaml_files)} entry files")

    # First pass: collect all Q/P numbers.
    logger.info("Scanning files for Q/P numbers...")
    files_to_process, all_qp_numbers = _scan_entries(yaml_files)
    logger.info(f"Found {len(all_qp_numbers)} unique Q/P numbers in {len(files_to_process)} files to process")

    # Fetch only what the cache does not already have, in API-sized batches.
    to_fetch = [qp for qp in all_qp_numbers if qp not in entity_cache]
    logger.info(f"Need to fetch {len(to_fetch)} entities (have {len(entity_cache)} cached)")

    n_batches = (len(to_fetch) + MAX_BATCH_SIZE - 1) // MAX_BATCH_SIZE
    with httpx.Client(timeout=30.0) as client:
        for i in range(0, len(to_fetch), MAX_BATCH_SIZE):
            batch = to_fetch[i:i + MAX_BATCH_SIZE]
            logger.info(f"Fetching batch {i // MAX_BATCH_SIZE + 1}/{n_batches} ({len(batch)} entities)")
            fetch_entities_batch(batch, client)
            time.sleep(BASE_DELAY)  # polite rate limiting between batches

    logger.info(f"Total cached entities: {len(entity_cache)}")
    _save_entity_cache(cache_file)

    # Second pass: rewrite each file with resolved labels.
    logger.info("Processing files...")
    processed_count = 0
    for i, filepath in enumerate(files_to_process):
        if (i + 1) % 50 == 0:
            logger.info(f"Processing file {i + 1}/{len(files_to_process)}")
        if process_file(filepath, entity_cache):
            processed_count += 1

    logger.info("=" * 60)
    logger.info("Q/P RESOLUTION COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Files processed: {processed_count}")
    logger.info(f"Entities resolved: {len(entity_cache)}")
    logger.info(f"Cache file: {cache_file}")

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())