glam/scripts/enrich_custodians_generic.py
2025-12-21 00:01:54 +01:00

908 lines
38 KiB
Python

#!/usr/bin/env python3
"""
Generic enrichment script for custodian YAML files.
This script can process any country's custodian files and enrich them from
multiple sources based on available identifiers:
1. **Wikidata ID available** → Full Wikidata enrichment (labels, temporal, identifiers, location, etc.)
2. **ISIL code available** → Resolve to Wikidata via P791, then full enrichment
3. **Website URL available** → Could use for web scraping/validation (future)
Sources for Wikidata ID detection:
- wikidata_enrichment.wikidata_entity_id
- identifiers[].identifier_scheme == "Wikidata"
- original_entry.wikidata_id
- original_entry.identifiers[].identifier_scheme == "Wikidata"
Usage:
python scripts/enrich_custodians_generic.py [--country XX] [--dry-run] [--limit N] [--force]
Examples:
# Dry run on Japan files (first 10)
python scripts/enrich_custodians_generic.py --country JP --limit 10 --dry-run
# Enrich all Czech files
python scripts/enrich_custodians_generic.py --country CZ
# Force re-enrichment of already enriched files
python scripts/enrich_custodians_generic.py --country JP --force
# Resume from last checkpoint
python scripts/enrich_custodians_generic.py --country JP --resume
Options:
--country XX Only process files for country code XX (e.g., JP, CZ, NL)
--dry-run Show what would be enriched without modifying files
--limit N Process only first N files (for testing)
--force Re-enrich even if already has wikidata_enrichment
--resume Resume from last checkpoint
--verbose Show detailed progress information
Environment Variables:
WIKIDATA_API_TOKEN - Optional OAuth2 token for increased rate limits (5,000 req/hr)
See AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
"""
import argparse
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
import httpx
import yaml
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration: Wikibase REST API + SPARQL endpoints and repo data paths.
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
PROGRESS_FILE = Path(__file__).parent.parent / "data" / "custodian" / ".generic_enrichment_progress.json"

# Rate limiting: with an OAuth2 token Wikimedia grants a higher quota, so the
# inter-request delay is sized to stay just under the applicable limit.
WIKIDATA_API_TOKEN = os.getenv("WIKIDATA_API_TOKEN", "")
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-data@example.com")
USER_AGENT = f"GLAMDataExtractor/1.2 ({WIKIMEDIA_CONTACT_EMAIL}) Python/httpx"
if WIKIDATA_API_TOKEN:
    REQUEST_DELAY = 0.75  # ~4800 requests per hour
    logger.info("Using authenticated mode: 5,000 req/hr limit")
else:
    REQUEST_DELAY = 7.5  # ~480 requests per hour
    logger.info("Using anonymous mode: 500 req/hr limit")

# HTTP Headers sent on every Wikibase REST API request.
HEADERS = {
    "Accept": "application/json",
    "User-Agent": USER_AGENT,
}
if WIKIDATA_API_TOKEN:
    HEADERS["Authorization"] = f"Bearer {WIKIDATA_API_TOKEN}"
# COMPREHENSIVE Property mapping - capturing ALL useful properties for heritage institutions
# Organized by category for clarity
#
# Each entry maps a Wikidata property id to:
#   name     - key used in the output YAML
#   type     - how parse_entity_data_full stores the value(s)
#   category - which FullWikidataEnrichment bucket the value lands in
#              (must match a dataclass field name)
PROPERTY_MAPPING = {
    # === TEMPORAL PROPERTIES ===
    "P571": {"name": "inception", "type": "time", "category": "temporal"},
    "P576": {"name": "dissolution", "type": "time", "category": "temporal"},
    "P1619": {"name": "date_of_official_opening", "type": "time", "category": "temporal"},
    "P580": {"name": "start_time", "type": "time", "category": "temporal"},
    "P582": {"name": "end_time", "type": "time", "category": "temporal"},
    # === ORGANIZATIONAL PROPERTIES ===
    "P31": {"name": "instance_of", "type": "entity_list", "category": "classification"},
    "P17": {"name": "country", "type": "entity", "category": "location"},
    "P131": {"name": "located_in_admin_entity", "type": "entity", "category": "location"},
    "P276": {"name": "location", "type": "entity", "category": "location"},
    "P159": {"name": "headquarters_location", "type": "entity", "category": "location"},
    "P625": {"name": "coordinates", "type": "coordinates", "category": "location"},
    "P969": {"name": "located_at_street_address", "type": "string", "category": "location"},
    "P281": {"name": "postal_code", "type": "string", "category": "location"},
    "P749": {"name": "parent_organization", "type": "entity", "category": "organization"},
    "P355": {"name": "subsidiary", "type": "entity_list", "category": "organization"},
    "P361": {"name": "part_of", "type": "entity", "category": "organization"},
    "P527": {"name": "has_parts", "type": "entity_list", "category": "organization"},
    "P463": {"name": "member_of", "type": "entity_list", "category": "organization"},
    "P101": {"name": "field_of_work", "type": "entity_list", "category": "classification"},
    "P921": {"name": "main_subject", "type": "entity_list", "category": "classification"},
    "P3032": {"name": "adjacent_building", "type": "entity", "category": "location"},
    "P1435": {"name": "heritage_designation", "type": "entity_list", "category": "classification"},
    "P112": {"name": "founded_by", "type": "entity_list", "category": "organization"},
    "P169": {"name": "chief_executive_officer", "type": "entity", "category": "organization"},
    "P488": {"name": "chairperson", "type": "entity", "category": "organization"},
    # === IDENTIFIERS === (external authority-file and registry ids)
    "P791": {"name": "isil", "type": "string", "category": "identifier"},
    "P214": {"name": "viaf", "type": "string", "category": "identifier"},
    "P227": {"name": "gnd", "type": "string", "category": "identifier"},
    "P244": {"name": "lcnaf", "type": "string", "category": "identifier"},
    "P268": {"name": "bnf", "type": "string", "category": "identifier"},
    "P269": {"name": "idref", "type": "string", "category": "identifier"},
    "P213": {"name": "isni", "type": "string", "category": "identifier"},
    "P1566": {"name": "geonames_id", "type": "string", "category": "identifier"},
    "P349": {"name": "ndl_authority_id", "type": "string", "category": "identifier"},
    "P271": {"name": "nacsis_cat_id", "type": "string", "category": "identifier"},
    "P2671": {"name": "google_knowledge_graph_id", "type": "string", "category": "identifier"},
    "P3134": {"name": "tripadvisor_id", "type": "string", "category": "identifier"},
    "P11693": {"name": "openstreetmap_node_id", "type": "string", "category": "identifier"},
    "P11496": {"name": "cinii_research_id", "type": "string", "category": "identifier"},
    "P5587": {"name": "libris_uri", "type": "string", "category": "identifier"},
    "P496": {"name": "orcid", "type": "string", "category": "identifier"},
    "P1015": {"name": "noraf_id", "type": "string", "category": "identifier"},
    "P1006": {"name": "nta_id", "type": "string", "category": "identifier"},
    "P409": {"name": "nla_id", "type": "string", "category": "identifier"},
    "P950": {"name": "bne_id", "type": "string", "category": "identifier"},
    "P906": {"name": "selibr", "type": "string", "category": "identifier"},
    "P1017": {"name": "bac_id", "type": "string", "category": "identifier"},
    "P7859": {"name": "worldcat_identities_id", "type": "string", "category": "identifier"},
    "P3500": {"name": "ringgold_id", "type": "string", "category": "identifier"},
    "P2427": {"name": "grid_id", "type": "string", "category": "identifier"},
    "P6782": {"name": "ror_id", "type": "string", "category": "identifier"},
    "P3153": {"name": "crossref_funder_id", "type": "string", "category": "identifier"},
    # === WEB PRESENCE ===
    "P856": {"name": "official_website", "type": "url", "category": "web"},
    "P1581": {"name": "official_blog_url", "type": "url", "category": "web"},
    "P973": {"name": "described_at_url", "type": "url", "category": "web"},
    "P2013": {"name": "facebook_id", "type": "string", "category": "social"},
    "P2002": {"name": "twitter_username", "type": "string", "category": "social"},
    "P2003": {"name": "instagram_username", "type": "string", "category": "social"},
    "P2397": {"name": "youtube_channel_id", "type": "string", "category": "social"},
    "P4264": {"name": "linkedin_company_id", "type": "string", "category": "social"},
    "P4003": {"name": "facebook_page_id", "type": "string", "category": "social"},
    "P8687": {"name": "social_media_followers", "type": "quantity", "category": "social"},
    # === MEDIA === (Commons file names / category names, not URLs)
    "P18": {"name": "image", "type": "commons_media", "category": "media"},
    "P154": {"name": "logo", "type": "commons_media", "category": "media"},
    "P41": {"name": "flag_image", "type": "commons_media", "category": "media"},
    "P94": {"name": "coat_of_arms", "type": "commons_media", "category": "media"},
    "P373": {"name": "commons_category", "type": "string", "category": "media"},
    "P935": {"name": "commons_gallery", "type": "string", "category": "media"},
    # === CONTACT ===
    "P968": {"name": "email", "type": "string", "category": "contact"},
    "P1329": {"name": "phone_number", "type": "string", "category": "contact"},
    "P3740": {"name": "number_of_works", "type": "quantity", "category": "collection"},
    "P1436": {"name": "collection_items_count", "type": "quantity", "category": "collection"},
    # === AWARDS & RECOGNITION ===
    "P166": {"name": "award_received", "type": "entity_list", "category": "recognition"},
    # === ARCHITECTURE ===
    "P149": {"name": "architectural_style", "type": "entity_list", "category": "architecture"},
    "P84": {"name": "architect", "type": "entity_list", "category": "architecture"},
    "P631": {"name": "structural_engineer", "type": "entity_list", "category": "architecture"},
}

# CRITICAL KEYS that must NEVER be deleted during enrichment
# See AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
# (main() compares the set of these keys present before/after modification
# and refuses to write a file that lost any of them.)
PROTECTED_KEYS = {
    'location', 'original_entry', 'ghcid', 'custodian_name', 'identifiers',
    'provenance', 'ch_annotator', 'google_maps_enrichment', 'osm_enrichment',
    'unesco_mow_enrichment', 'web_enrichment', 'linkedin_enrichment',
    'zcbs_enrichment', 'person_observations'
}
@dataclass
class FullWikidataEnrichment:
    """Container for comprehensive Wikidata enrichment data.

    One instance holds everything parsed from a single Wikidata item:
    multilingual terms, sitelinks, and all mapped property values bucketed
    by the "category" field of PROPERTY_MAPPING. The bucket field names
    below must stay in sync with the category names used in that mapping
    (parse_entity_data_full looks them up via getattr).
    """
    entity_id: str
    # Multilingual terms: language code -> text (aliases: -> list of texts).
    labels: Dict[str, str] = field(default_factory=dict)
    descriptions: Dict[str, str] = field(default_factory=dict)
    aliases: Dict[str, List[str]] = field(default_factory=dict)
    # Sitelinks: wiki site key -> page title.
    sitelinks: Dict[str, str] = field(default_factory=dict)
    # All extracted properties organized by category
    temporal: Dict[str, Any] = field(default_factory=dict)
    classification: Dict[str, Any] = field(default_factory=dict)
    location: Dict[str, Any] = field(default_factory=dict)
    organization: Dict[str, Any] = field(default_factory=dict)
    identifiers: Dict[str, str] = field(default_factory=dict)
    web: Dict[str, str] = field(default_factory=dict)
    social: Dict[str, str] = field(default_factory=dict)
    media: Dict[str, str] = field(default_factory=dict)
    contact: Dict[str, str] = field(default_factory=dict)
    collection: Dict[str, Any] = field(default_factory=dict)
    recognition: Dict[str, Any] = field(default_factory=dict)
    architecture: Dict[str, Any] = field(default_factory=dict)
    # Metadata about the fetch itself (UTC timestamp, property ids seen).
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    properties_found: List[str] = field(default_factory=list)
def extract_value_from_statement(statement: Dict) -> Any:
    """Pull the plain value out of one Wikidata REST-API statement.

    Recognised content shapes:
      * entity reference  -> the bare Q-id (or the raw dict if no "id")
      * point in time     -> "YYYY-MM-DD" (leading +/- sign and the time
                             portion stripped)
      * globe coordinate  -> {"latitude", "longitude", "precision"} dict
      * quantity          -> amount string without its leading "+"
      * anything else     -> returned unchanged (strings, URLs, filenames)

    Returns None when the statement is malformed in any way.
    """
    try:
        content = statement.get("value", {}).get("content")
        if not isinstance(content, dict):
            # Plain scalar values (strings, URLs, media names) pass through.
            return content
        if "entity-type" in content or "id" in content:
            return content.get("id", content)
        if "time" in content:
            # Normalise e.g. "+1850-01-01T00:00:00Z" down to "1850-01-01".
            raw = content.get("time", "")
            if raw[:1] in ("+", "-"):
                raw = raw[1:]
            if "T" in raw:
                raw = raw.split("T")[0]
            return raw
        if "latitude" in content and "longitude" in content:
            return {
                "latitude": content.get("latitude"),
                "longitude": content.get("longitude"),
                "precision": content.get("precision"),
            }
        if "amount" in content:
            return content.get("amount", "").lstrip("+")
        return content
    except Exception:
        # Best-effort parser: an unexpected structure means "no value".
        return None
def fetch_entity_labels_batch(entity_ids: Set[str], client: httpx.Client) -> Dict[str, Dict[str, str]]:
    """Resolve human-readable labels for a batch of Q-ids via SPARQL.

    Returns a mapping of Q-id -> {"id", "label", "description"}. On any
    request failure the ids themselves are used as fallback labels so the
    caller always receives an entry per requested id. At most 50 ids are
    resolved per call.
    """
    if not entity_ids:
        return {}
    # Cap the VALUES clause; very large batches make the query sluggish.
    batch = list(entity_ids)[:50]
    values_clause = " ".join(f"wd:{qid}" for qid in batch)
    query = f"""
    SELECT ?entity ?entityLabel ?entityDescription WHERE {{
      VALUES ?entity {{ {values_clause} }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,ja,nl,de,fr,cs,zh". }}
    }}
    """
    try:
        resp = client.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={"query": query, "format": "json"},
            headers={"User-Agent": USER_AGENT, "Accept": "application/sparql-results+json"},
        )
        resp.raise_for_status()
        payload = resp.json()
        resolved: Dict[str, Dict[str, str]] = {}
        for row in payload.get("results", {}).get("bindings", []):
            uri = row.get("entity", {}).get("value", "")
            if not uri:
                continue
            qid = uri.split("/")[-1]
            resolved[qid] = {
                "id": qid,
                "label": row.get("entityLabel", {}).get("value", qid),
                "description": row.get("entityDescription", {}).get("value", ""),
            }
        return resolved
    except Exception as e:
        logger.warning(f"SPARQL label fetch failed: {e}")
        # Degrade gracefully: fall back to the raw ids as their own labels.
        return {qid: {"id": qid, "label": qid} for qid in batch}
def fetch_entity_data(entity_id: str, client: httpx.Client) -> Optional[Dict]:
    """Fetch one item's full JSON document from the Wikibase REST API.

    A 403 response triggers one anonymous retry without the Authorization
    header (the configured token may not be accepted by this endpoint).
    Returns the parsed JSON dict, or None when the entity is missing or
    any request error occurs (errors are logged, never raised).
    """
    url = f"{WIKIDATA_REST_API}/entities/items/{entity_id}"
    try:
        response = client.get(url, headers=HEADERS)
        if response.status_code == 403:
            # Token rejected: drop the Authorization header and retry once.
            anon_headers = dict(HEADERS)
            anon_headers.pop("Authorization", None)
            response = client.get(url, headers=anon_headers)
        response.raise_for_status()
        return response.json()
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 404:
            logger.warning(f"Entity {entity_id} not found")
        else:
            logger.error(f"HTTP error fetching {entity_id}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error fetching {entity_id}: {e}")
        return None
def resolve_isil_to_wikidata(isil_code: str, client: httpx.Client) -> Optional[str]:
    """Resolve an ISIL code to a Wikidata entity ID via P791.

    The code is interpolated into a SPARQL string literal, so backslashes
    and double quotes are escaped first; an unescaped quote in dirty source
    data would otherwise break the query (or allow query injection).

    Returns the Q-id, or None when no item carries this ISIL or the
    SPARQL request fails (failures are logged, never raised).
    """
    # Escape per SPARQL string-literal rules: backslash first, then quote.
    safe_isil = isil_code.replace("\\", "\\\\").replace('"', '\\"')
    query = f"""
    SELECT ?item WHERE {{
      ?item wdt:P791 "{safe_isil}" .
    }}
    LIMIT 1
    """
    try:
        response = client.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={"query": query, "format": "json"},
            headers={"User-Agent": USER_AGENT, "Accept": "application/sparql-results+json"},
        )
        response.raise_for_status()
        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])
        if bindings:
            entity_uri = bindings[0].get("item", {}).get("value", "")
            entity_id = entity_uri.split("/")[-1] if entity_uri else None
            if entity_id and entity_id.startswith("Q"):
                logger.info(f"  Resolved ISIL {isil_code} → {entity_id}")
                return entity_id
        return None
    except Exception as e:
        logger.warning(f"SPARQL ISIL resolution failed for {isil_code}: {e}")
        return None
def parse_entity_data_full(entity_id: str, data: Dict, client: httpx.Client) -> FullWikidataEnrichment:
    """Parse the full entity data with label resolution.

    Walks every statement group in *data*, keeps only properties listed in
    PROPERTY_MAPPING, stores the parsed values in the enrichment bucket
    named by the property's "category", then resolves every referenced
    Q-id to a human-readable label with one batched SPARQL query.
    """
    enrichment = FullWikidataEnrichment(entity_id=entity_id)
    # Extract multilingual terms (passed through from the API unchanged)
    enrichment.labels = data.get("labels", {})
    enrichment.descriptions = data.get("descriptions", {})
    enrichment.aliases = data.get("aliases", {})
    # Extract sitelinks: keep only the page title per site key
    sitelinks = data.get("sitelinks", {})
    enrichment.sitelinks = {k: v.get("title", "") for k, v in sitelinks.items() if isinstance(v, dict)}
    # Collect entity IDs that need label resolution
    entity_ids_to_resolve: Set[str] = set()
    # Process all statements
    statements = data.get("statements", {})
    for prop_id, prop_statements in statements.items():
        if not prop_statements:
            continue
        prop_config = PROPERTY_MAPPING.get(prop_id)
        if not prop_config:
            continue  # Skip unknown properties
        enrichment.properties_found.append(prop_id)
        prop_name: str = prop_config["name"]
        prop_type: str = prop_config["type"]
        category: str = prop_config["category"]
        values: List[Any] = []
        for stmt in prop_statements:
            value = extract_value_from_statement(stmt)
            if value is not None:
                values.append(value)
                # Collect entity IDs for later batch label resolution
                if prop_type in ("entity", "entity_list") and isinstance(value, str) and value.startswith("Q"):
                    entity_ids_to_resolve.add(value)
        if not values:
            continue
        # Store values in the dataclass bucket matching the category name;
        # silently skip if the category has no corresponding field.
        target_dict = getattr(enrichment, category, None)
        if target_dict is None:
            continue
        # Single-valued types keep the first statement; *_list types keep all.
        if prop_type == "entity":
            target_dict[prop_name] = values[0]
        elif prop_type == "entity_list":
            target_dict[prop_name] = values
        elif prop_type in ("string", "url"):
            # Collapse to a scalar when there is exactly one value.
            target_dict[prop_name] = values[0] if len(values) == 1 else values
        elif prop_type == "time":
            target_dict[prop_name] = values[0]
        elif prop_type == "coordinates":
            target_dict[prop_name] = values[0]
        elif prop_type == "commons_media":
            target_dict[prop_name] = values[0]
        elif prop_type == "quantity":
            target_dict[prop_name] = values[0]
    # Resolve entity labels in one batched SPARQL call
    if entity_ids_to_resolve:
        time.sleep(0.2)  # Small delay before SPARQL query
        labels_map = fetch_entity_labels_batch(entity_ids_to_resolve, client)
        # Replace bare Q-ids with resolved {id, label, description} dicts
        # in every bucket that can hold entity references.
        for category_name in ["classification", "location", "organization", "recognition", "architecture"]:
            category_dict = getattr(enrichment, category_name, {})
            for key, value in list(category_dict.items()):
                if isinstance(value, str) and value in labels_map:
                    category_dict[key] = labels_map[value]
                elif isinstance(value, list):
                    category_dict[key] = [
                        labels_map.get(v, {"id": v, "label": v}) if isinstance(v, str) and v.startswith("Q") else v
                        for v in value
                    ]
    return enrichment
def enrichment_to_dict(enrichment: FullWikidataEnrichment) -> Dict:
    """Convert FullWikidataEnrichment to a dictionary for YAML output.

    Key insertion order is deliberate: main() dumps YAML with
    sort_keys=False, so this order becomes the on-disk order. Frequently
    used values (key dates, country, website, image, ...) are additionally
    promoted to flat ``wikidata_*`` top-level keys for easy access.
    """
    result = {
        "wikidata_entity_id": enrichment.entity_id,
        "api_metadata": {
            "api_endpoint": WIKIDATA_REST_API,
            "fetch_timestamp": enrichment.fetch_timestamp,
            "user_agent": USER_AGENT,
            "enrichment_version": "2.1_generic",
            "properties_found": enrichment.properties_found,
        }
    }
    # Add labels (plus per-language convenience keys for common languages)
    if enrichment.labels:
        result["wikidata_labels"] = enrichment.labels
        for lang in ["en", "nl", "ja", "de", "fr", "es", "cs", "zh"]:
            if lang in enrichment.labels:
                result[f"wikidata_label_{lang}"] = enrichment.labels[lang]
    # Add descriptions
    if enrichment.descriptions:
        result["wikidata_descriptions"] = enrichment.descriptions
        if "en" in enrichment.descriptions:
            result["wikidata_description_en"] = enrichment.descriptions["en"]
    # Add aliases
    if enrichment.aliases:
        result["wikidata_aliases"] = enrichment.aliases
    # Add sitelinks (Wikipedia articles)
    if enrichment.sitelinks:
        result["wikidata_sitelinks"] = enrichment.sitelinks
    # Add all category data with readable prefixes
    if enrichment.temporal:
        result["wikidata_temporal"] = enrichment.temporal
        # Promote key dates to top level for easy access
        if "inception" in enrichment.temporal:
            result["wikidata_inception"] = enrichment.temporal["inception"]
        if "dissolution" in enrichment.temporal:
            result["wikidata_dissolution"] = enrichment.temporal["dissolution"]
        if "date_of_official_opening" in enrichment.temporal:
            result["wikidata_opening_date"] = enrichment.temporal["date_of_official_opening"]
    if enrichment.classification:
        result["wikidata_classification"] = enrichment.classification
        if "instance_of" in enrichment.classification:
            result["wikidata_instance_of"] = enrichment.classification["instance_of"]
        if "field_of_work" in enrichment.classification:
            result["wikidata_field_of_work"] = enrichment.classification["field_of_work"]
    if enrichment.location:
        result["wikidata_location"] = enrichment.location
        if "country" in enrichment.location:
            result["wikidata_country"] = enrichment.location["country"]
        if "located_in_admin_entity" in enrichment.location:
            result["wikidata_located_in"] = enrichment.location["located_in_admin_entity"]
        if "coordinates" in enrichment.location:
            result["wikidata_coordinates"] = enrichment.location["coordinates"]
    if enrichment.organization:
        result["wikidata_organization"] = enrichment.organization
    if enrichment.identifiers:
        result["wikidata_identifiers"] = enrichment.identifiers
    if enrichment.web:
        result["wikidata_web"] = enrichment.web
        if "official_website" in enrichment.web:
            result["wikidata_official_website"] = enrichment.web["official_website"]
    if enrichment.social:
        result["wikidata_social_media"] = enrichment.social
    if enrichment.media:
        result["wikidata_media"] = enrichment.media
        if "image" in enrichment.media:
            result["wikidata_image"] = enrichment.media["image"]
        if "logo" in enrichment.media:
            result["wikidata_logo"] = enrichment.media["logo"]
    if enrichment.contact:
        result["wikidata_contact"] = enrichment.contact
    if enrichment.collection:
        result["wikidata_collection"] = enrichment.collection
    if enrichment.recognition:
        result["wikidata_recognition"] = enrichment.recognition
    if enrichment.architecture:
        result["wikidata_architecture"] = enrichment.architecture
    return result
def get_wikidata_entity_id(data: Dict) -> Optional[str]:
    """
    Extract Wikidata entity ID from a custodian YAML file.

    Checks multiple locations where Wikidata ID might be stored:
    1. wikidata_enrichment.wikidata_entity_id
    2. identifiers[].identifier_scheme == "Wikidata"
    3. original_entry.wikidata_id
    4. original_entry.identifiers[].identifier_scheme == "Wikidata"

    Tolerates explicit YAML nulls in any container (``identifiers: ~``
    etc.) — ``dict.get(key, default)`` returns None for those, which the
    original chained lookups would crash on.
    """
    # Check wikidata_enrichment (``or {}`` guards an explicit null value)
    wd = data.get("wikidata_enrichment") or {}
    if wd.get("wikidata_entity_id"):
        return wd.get("wikidata_entity_id")
    # Check top-level identifiers list
    for ident in data.get("identifiers") or []:
        if isinstance(ident, dict):
            scheme = ident.get("identifier_scheme") or ""
            if scheme.lower() == "wikidata":
                return ident.get("identifier_value")
    # Check original_entry
    original = data.get("original_entry") or {}
    # Direct wikidata_id in original_entry
    if original.get("wikidata_id"):
        return original.get("wikidata_id")
    # Identifiers in original_entry
    for ident in original.get("identifiers") or []:
        if isinstance(ident, dict):
            scheme = ident.get("identifier_scheme") or ""
            if scheme.lower() == "wikidata":
                return ident.get("identifier_value")
    return None
def get_isil_code(data: Dict) -> Optional[str]:
    """
    Extract ISIL code from a custodian YAML file.

    Checks multiple locations where ISIL might be stored:
    1. identifiers[].identifier_scheme == "ISIL"
    2. original_entry.isil_code
    3. original_entry.identifiers[].identifier_scheme == "ISIL"
    4. wikidata_enrichment.wikidata_identifiers.isil

    Tolerates explicit YAML nulls in any container — ``dict.get`` with a
    default still returns None for keys present with a null value.
    """
    # Check top-level identifiers list
    for ident in data.get("identifiers") or []:
        if isinstance(ident, dict):
            scheme = ident.get("identifier_scheme") or ""
            if scheme.lower() == "isil":
                return ident.get("identifier_value")
    # Check original_entry
    original = data.get("original_entry") or {}
    if original.get("isil_code"):
        return original.get("isil_code")
    for ident in original.get("identifiers") or []:
        if isinstance(ident, dict):
            scheme = ident.get("identifier_scheme") or ""
            if scheme.lower() == "isil":
                return ident.get("identifier_value")
    # Fall back to an ISIL captured by a previous Wikidata enrichment
    wd = data.get("wikidata_enrichment") or {}
    wd_ids = wd.get("wikidata_identifiers") or {}
    if wd_ids.get("isil"):
        return wd_ids.get("isil")
    return None
def is_fully_enriched(data: Dict) -> bool:
    """Return True if the file already carries a v2.x enrichment.

    Tolerates an explicit YAML null under ``wikidata_enrichment`` or
    ``api_metadata`` (dict.get returns None, not the {} default, for
    those), and a non-string version value.
    """
    wd = data.get("wikidata_enrichment") or {}
    api_meta = wd.get("api_metadata") or {}
    # str() guards a version accidentally parsed as a YAML number.
    version = str(api_meta.get("enrichment_version") or "")
    return version.startswith("2.")
def load_progress(progress_file: Path) -> Dict:
    """Load the enrichment checkpoint, or a fresh structure when absent.

    Only I/O and JSON-decoding failures are treated as "no checkpoint";
    the original bare ``except Exception: pass`` also swallowed genuine
    bugs, which now propagate.
    """
    if progress_file.exists():
        try:
            with open(progress_file, 'r') as f:
                return json.load(f)
        except (OSError, ValueError):
            # Unreadable or corrupt checkpoint: start over rather than crash.
            # (json.JSONDecodeError is a ValueError subclass.)
            pass
    return {"processed_files": [], "stats": {}, "isil_resolutions": {}}
def save_progress(progress: Dict, progress_file: Path):
    """Persist the checkpoint dict as pretty-printed JSON (best effort).

    Failures are logged and otherwise ignored — losing a checkpoint only
    costs re-scanning, never data.
    """
    try:
        progress_file.write_text(json.dumps(progress, indent=2))
    except Exception as e:
        logger.error(f"Failed to save progress: {e}")
def main():
    """Command-line entry point.

    Two passes over the custodian YAML files:
      1. Scan: read every matching file, decide which need enrichment
         (using cached ISIL→Wikidata resolutions when --resolve-isil is
         set), and report totals.
      2. Process: fetch each entity from the Wikibase REST API, merge the
         result under the ``wikidata_enrichment`` key, verify no protected
         top-level keys were lost, write the file back, and checkpoint
         progress every 10 files.
    """
    parser = argparse.ArgumentParser(
        description="Generic Wikidata enrichment for custodian files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dry run on Japan files (first 10)
  python scripts/enrich_custodians_generic.py --country JP --limit 10 --dry-run
  # Enrich all Czech files
  python scripts/enrich_custodians_generic.py --country CZ
  # Force re-enrichment of already enriched files
  python scripts/enrich_custodians_generic.py --country JP --force
  # Resume from last checkpoint
  python scripts/enrich_custodians_generic.py --country JP --resume
"""
    )
    parser.add_argument("--country", type=str, help="Only process files for country code XX (e.g., JP, CZ, NL)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be enriched without modifying files")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N files (0 = no limit)")
    parser.add_argument("--force", action="store_true", help="Re-enrich even if already has v2.0+ enrichment")
    parser.add_argument("--resume", action="store_true", help="Resume from last checkpoint")
    parser.add_argument("--verbose", action="store_true", help="Show detailed progress information")
    parser.add_argument("--resolve-isil", action="store_true", help="Try to resolve ISIL codes to Wikidata IDs")
    args = parser.parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    # Use country-specific progress file if country is specified
    progress_file = PROGRESS_FILE
    if args.country:
        progress_file = CUSTODIAN_DIR / f".enrichment_progress_{args.country}.json"
    progress = load_progress(progress_file) if args.resume else {"processed_files": [], "stats": {}, "isil_resolutions": {}}
    processed_files = set(progress.get("processed_files", []))
    isil_resolutions = progress.get("isil_resolutions", {})  # Cache ISIL → Wikidata mappings
    stats = {
        "total_scanned": 0,
        "needs_enrichment": 0,
        "already_enriched_v2": 0,
        "no_wikidata_id": 0,
        "has_isil_only": 0,
        "isil_resolved": 0,
        "enriched_successfully": 0,
        "errors": 0,
        "skipped_already_processed": 0,
        "properties_counts": {},  # property id -> number of files carrying it
    }
    pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
    yaml_files = sorted(CUSTODIAN_DIR.glob(pattern))
    logger.info(f"Found {len(yaml_files)} YAML files matching pattern '{pattern}'")
    files_to_process = []
    # First pass: scan files and identify candidates
    logger.info("Scanning files to identify enrichment candidates...")
    for yaml_file in yaml_files:
        stats["total_scanned"] += 1
        if args.resume and yaml_file.name in processed_files:
            stats["skipped_already_processed"] += 1
            continue
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                continue
            entity_id = get_wikidata_entity_id(data)
            if not entity_id and args.resolve_isil:
                # Try to find ISIL code
                isil_code = get_isil_code(data)
                if isil_code:
                    stats["has_isil_only"] += 1
                    # Check cache first (avoids re-querying SPARQL on resume)
                    if isil_code in isil_resolutions:
                        entity_id = isil_resolutions[isil_code]
                        if entity_id:
                            stats["isil_resolved"] += 1
                    else:
                        # Will resolve later during processing
                        pass
            if not entity_id:
                stats["no_wikidata_id"] += 1
                # Still add to process list if we might resolve ISIL
                if args.resolve_isil:
                    isil_code = get_isil_code(data)
                    if isil_code and isil_code not in isil_resolutions:
                        files_to_process.append((yaml_file, None, isil_code))
                continue
            if not args.force and is_fully_enriched(data):
                stats["already_enriched_v2"] += 1
                continue
            stats["needs_enrichment"] += 1
            files_to_process.append((yaml_file, entity_id, None))
        except Exception as e:
            logger.error(f"Error reading {yaml_file}: {e}")
            stats["errors"] += 1
    # Scan summary
    logger.info(f"\n{'='*60}")
    logger.info(f"SCAN COMPLETE")
    logger.info(f"{'='*60}")
    logger.info(f"Total files scanned: {stats['total_scanned']}")
    logger.info(f"Files needing enrichment: {stats['needs_enrichment']}")
    logger.info(f"Files already enriched (v2.0+): {stats['already_enriched_v2']}")
    logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")
    if args.resolve_isil:
        logger.info(f"Files with ISIL only: {stats['has_isil_only']}")
    logger.info(f"Skipped (already processed): {stats['skipped_already_processed']}")
    logger.info(f"{'='*60}\n")
    if args.limit > 0:
        files_to_process = files_to_process[:args.limit]
        logger.info(f"Limited to first {args.limit} files")
    if args.dry_run:
        # Preview only: list up to 20 candidates and exit without writing.
        logger.info("DRY RUN - No files will be modified")
        logger.info(f"\nFiles to process ({len(files_to_process)}):")
        for yaml_file, entity_id, isil_code in files_to_process[:20]:
            if entity_id:
                logger.info(f"  {yaml_file.name} → Wikidata: {entity_id}")
            elif isil_code:
                logger.info(f"  {yaml_file.name} → ISIL: {isil_code} (needs resolution)")
        if len(files_to_process) > 20:
            logger.info(f"  ... and {len(files_to_process) - 20} more")
        return
    # Second pass: process files
    with httpx.Client(timeout=30.0) as client:
        for i, (yaml_file, entity_id, isil_code) in enumerate(files_to_process):
            try:
                # Resolve ISIL to Wikidata if needed
                if not entity_id and isil_code:
                    if isil_code in isil_resolutions:
                        entity_id = isil_resolutions[isil_code]
                    else:
                        logger.info(f"[{i+1}/{len(files_to_process)}] Resolving ISIL {isil_code}...")
                        entity_id = resolve_isil_to_wikidata(isil_code, client)
                        # Cache the outcome even when None, so failed codes
                        # are not re-queried on later runs.
                        isil_resolutions[isil_code] = entity_id
                        time.sleep(REQUEST_DELAY)
                        if entity_id:
                            stats["isil_resolved"] += 1
                        else:
                            logger.info(f"  Could not resolve ISIL {isil_code}")
                            continue
                if not entity_id:
                    continue
                logger.info(f"[{i+1}/{len(files_to_process)}] Enriching {yaml_file.name} ({entity_id})")
                # Re-read the file immediately before modifying
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    data = yaml.safe_load(f)
                if not data:
                    logger.warning(f"  File is empty or invalid: {yaml_file.name}")
                    stats["errors"] += 1
                    continue
                # Record which protected keys exist BEFORE modification
                keys_before = set(data.keys())
                protected_keys_before = keys_before & PROTECTED_KEYS
                entity_data = fetch_entity_data(entity_id, client)
                if entity_data is None:
                    logger.warning(f"  Could not fetch data for {entity_id}")
                    stats["errors"] += 1
                    continue
                enrichment = parse_entity_data_full(entity_id, entity_data, client)
                enrichment_dict = enrichment_to_dict(enrichment)
                data["wikidata_enrichment"] = enrichment_dict
                # SAFETY CHECK: Verify no protected keys were lost
                # (AGENTS.md Rule 5: enrichment must be strictly additive)
                keys_after = set(data.keys())
                protected_keys_after = keys_after & PROTECTED_KEYS
                lost_keys = protected_keys_before - protected_keys_after
                if lost_keys:
                    logger.error(f"  CRITICAL: Protected keys lost during enrichment: {lost_keys}")
                    logger.error(f"  Skipping file to prevent data loss!")
                    stats["errors"] += 1
                    continue
                # Track property statistics
                for prop in enrichment.properties_found:
                    stats["properties_counts"][prop] = stats["properties_counts"].get(prop, 0) + 1
                stats["enriched_successfully"] += 1
                # Log key findings
                findings = []
                if enrichment.temporal.get("inception"):
                    findings.append(f"inception: {enrichment.temporal['inception']}")
                if enrichment.temporal.get("date_of_official_opening"):
                    findings.append(f"opened: {enrichment.temporal['date_of_official_opening']}")
                if enrichment.classification.get("field_of_work"):
                    fow = enrichment.classification["field_of_work"]
                    if isinstance(fow, list) and fow:
                        label = fow[0].get("label", fow[0]) if isinstance(fow[0], dict) else fow[0]
                        findings.append(f"field: {label}")
                if enrichment.identifiers:
                    findings.append(f"{len(enrichment.identifiers)} identifiers")
                if findings:
                    logger.info(f"  Found: {', '.join(findings)}")
                # Write back (sort_keys=False keeps dict insertion order)
                with open(yaml_file, 'w', encoding='utf-8') as f:
                    yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
                processed_files.add(yaml_file.name)
                progress["processed_files"] = list(processed_files)
                progress["stats"] = stats
                progress["isil_resolutions"] = isil_resolutions
                # Checkpoint every 10 files
                if (i + 1) % 10 == 0:
                    save_progress(progress, progress_file)
                time.sleep(REQUEST_DELAY)
            except Exception as e:
                logger.error(f"Error processing {yaml_file.name}: {e}")
                stats["errors"] += 1
    # Final checkpoint and run summary
    save_progress(progress, progress_file)
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total files scanned: {stats['total_scanned']}")
    logger.info(f"Files needing enrichment: {stats['needs_enrichment']}")
    logger.info(f"Already enriched (v2.0+): {stats['already_enriched_v2']}")
    logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")
    if args.resolve_isil:
        logger.info(f"ISIL codes resolved: {stats['isil_resolved']}")
    logger.info(f"Successfully enriched: {stats['enriched_successfully']}")
    logger.info(f"Errors: {stats['errors']}")
    logger.info("")
    logger.info("Top properties found:")
    sorted_props = sorted(stats["properties_counts"].items(), key=lambda x: x[1], reverse=True)[:15]
    for prop, count in sorted_props:
        prop_name = PROPERTY_MAPPING.get(prop, {}).get("name", prop)
        logger.info(f"  {prop} ({prop_name}): {count}")
    logger.info("=" * 60)
# Script entry point: run the enrichment CLI when executed directly.
if __name__ == "__main__":
    main()