#!/usr/bin/env python3
"""
Enrich custodian YAML files with full Wikidata data, specifically targeting inception dates.

This script:
1. Scans all YAML files in data/custodian/
2. Finds records with wikidata_entity_id but missing wikidata_inception
3. Fetches full Wikidata data from REST API (including P571 inception)
4. Updates the YAML files with enriched wikidata_enrichment section
5. Generates a report of enriched records

The script respects Wikidata rate limits and supports resumable processing.

Usage:
    python scripts/enrich_custodians_wikidata_inception.py [--dry-run] [--limit N] [--country XX]

Options:
    --dry-run        Show what would be enriched without modifying files
    --limit N        Process only first N files (for testing)
    --country XX     Only process files for country code XX (e.g., JP, CZ, NL)
    --skip-existing  Skip files that already have wikidata_inception

Environment Variables:
    WIKIDATA_API_TOKEN - Optional OAuth2 token for increased rate limits (5,000 req/hr)
    WIKIMEDIA_CONTACT_EMAIL - Contact email for User-Agent (required by Wikimedia policy)
"""

import argparse
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import httpx
import yaml

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
# Checkpoint file lives alongside the data it tracks (reuse CUSTODIAN_DIR
# instead of rebuilding the same path).
PROGRESS_FILE = CUSTODIAN_DIR / ".inception_enrichment_progress.json"

# Rate limiting: 500 req/hr for anonymous, 5000 req/hr with token
WIKIDATA_API_TOKEN = os.getenv("WIKIDATA_API_TOKEN", "")
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-data@example.com")
USER_AGENT = f"GLAMDataExtractor/1.0 ({WIKIMEDIA_CONTACT_EMAIL}) Python/httpx"

# Request delay based on authentication status
if WIKIDATA_API_TOKEN:
    REQUEST_DELAY = 0.75  # ~4800 requests per hour (below 5000 limit)
    logger.info("Using authenticated mode: 5,000 req/hr limit")
else:
    REQUEST_DELAY = 7.5  # ~480 requests per hour (below 500 limit)
    logger.info("Using anonymous mode: 500 req/hr limit (use WIKIDATA_API_TOKEN for faster processing)")

# Headers sent with every API request (Wikimedia policy requires a
# descriptive User-Agent with a contact address).
HEADERS = {
    "Accept": "application/json",
    "User-Agent": USER_AGENT,
}
if WIKIDATA_API_TOKEN:
    HEADERS["Authorization"] = f"Bearer {WIKIDATA_API_TOKEN}"

# Property IDs for heritage institutions, mapped to the YAML-friendly
# field names used in the enrichment output.
PROPERTY_LABELS = {
    "P31": "instance_of",
    "P17": "country",
    "P131": "located_in",
    "P625": "coordinates",
    "P571": "inception",
    "P576": "dissolution",
    "P856": "official_website",
    "P18": "image",
    "P154": "logo",
    "P791": "isil",
    "P214": "viaf",
    "P227": "gnd",
    "P244": "lcnaf",
    "P268": "bnf",
    "P269": "idref",
    "P213": "isni",
    "P1566": "geonames",
}


@dataclass
class WikidataEnrichment:
    """Container for Wikidata enrichment data fetched for one entity."""

    entity_id: str
    # Language code -> label / description strings, as returned by the API.
    labels: Dict[str, str] = field(default_factory=dict)
    descriptions: Dict[str, str] = field(default_factory=dict)
    # P31 values, each as {"id": "Q..."} (or the raw dict for rich values).
    instance_of: List[Dict[str, str]] = field(default_factory=list)
    country: Optional[Dict[str, str]] = None
    location: Optional[Dict[str, str]] = None
    coordinates: Optional[Dict[str, float]] = None
    inception: Optional[str] = None       # P571, as "YYYY-MM-DD"
    dissolution: Optional[str] = None     # P576, as "YYYY-MM-DD"
    official_website: Optional[str] = None
    image: Optional[str] = None
    logo: Optional[str] = None
    # Remaining PROPERTY_LABELS entries (VIAF, GND, ISIL, ...), stringified.
    identifiers: Dict[str, str] = field(default_factory=dict)
    fetch_timestamp: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )


def extract_value_from_statement(statement: Dict) -> Any:
    """Extract the value from a Wikidata statement structure.

    Handles entity references (-> the Q-id string), time values
    (-> "YYYY-MM-DD"), and globe coordinates (-> lat/lon/precision dict).
    Any other content is returned as-is. Returns None when the statement
    cannot be parsed (e.g. novalue/somevalue snaks).
    """
    try:
        value_data = statement.get("value", {})
        content = value_data.get("content")

        # Handle dict content (time, coordinates, entity refs, etc.)
        # regardless of value_type
        if isinstance(content, dict):
            if "entity-type" in content or "id" in content:
                return content.get("id", content)
            elif "time" in content:
                # Parse time value to extract just the date.
                # Format: "+1854-11-28T00:00:00Z" -> "1854-11-28"
                time_val = content.get("time", "")
                if time_val.startswith("+") or time_val.startswith("-"):
                    time_val = time_val[1:]
                if "T" in time_val:
                    time_val = time_val.split("T")[0]
                return time_val
            elif "latitude" in content and "longitude" in content:
                return {
                    "latitude": content.get("latitude"),
                    "longitude": content.get("longitude"),
                    "precision": content.get("precision"),
                }
            else:
                return content
        else:
            return content
    except Exception as e:
        # Best-effort parsing: unparseable statements are skipped, but
        # log at debug level instead of swallowing silently.
        logger.debug(f"Could not parse statement value: {e}")
        return None


def fetch_entity_data(entity_id: str, client: httpx.Client) -> Optional[Dict]:
    """Fetch full entity data from the Wikibase REST API.

    Returns the parsed JSON payload, or None on any error (404s and
    HTTP/network failures are logged, not raised).
    """
    url = f"{WIKIDATA_REST_API}/entities/items/{entity_id}"
    try:
        response = client.get(url, headers=HEADERS)

        # Handle OAuth errors by retrying once without the Authorization
        # header (a bad/expired token should not block anonymous access).
        if response.status_code == 403:
            headers_no_auth = {k: v for k, v in HEADERS.items() if k != "Authorization"}
            response = client.get(url, headers=headers_no_auth)

        response.raise_for_status()
        return response.json()
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 404:
            logger.warning(f"Entity {entity_id} not found")
        else:
            logger.error(f"HTTP error fetching {entity_id}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error fetching {entity_id}: {e}")
        return None


def parse_entity_data(entity_id: str, data: Dict) -> WikidataEnrichment:
    """Parse the full entity payload into a WikidataEnrichment object."""
    enrichment = WikidataEnrichment(entity_id=entity_id)

    # Labels and descriptions are plain lang->string maps in the REST API.
    enrichment.labels = data.get("labels", {})
    enrichment.descriptions = data.get("descriptions", {})

    # Extract statements/claims
    statements = data.get("statements", {})
    for prop_id, prop_statements in statements.items():
        if not prop_statements:
            continue

        values = []
        for stmt in prop_statements:
            value = extract_value_from_statement(stmt)
            if value is not None:
                values.append(value)

        if not values:
            continue

        # Route well-known properties to dedicated fields; everything else
        # in PROPERTY_LABELS becomes a stringified identifier.
        if prop_id == "P31":  # Instance of
            enrichment.instance_of = [
                {"id": v} if isinstance(v, str) else v for v in values
            ]
        elif prop_id == "P17":  # Country
            enrichment.country = {"id": values[0]} if values else None
        elif prop_id == "P131":  # Located in
            enrichment.location = {"id": values[0]} if values else None
        elif prop_id == "P625":  # Coordinates
            if values and isinstance(values[0], dict):
                enrichment.coordinates = values[0]
        elif prop_id == "P571":  # Inception
            enrichment.inception = values[0] if values else None
        elif prop_id == "P576":  # Dissolution
            enrichment.dissolution = values[0] if values else None
        elif prop_id == "P856":  # Official website
            enrichment.official_website = values[0] if values else None
        elif prop_id == "P18":  # Image
            enrichment.image = values[0] if values else None
        elif prop_id == "P154":  # Logo
            enrichment.logo = values[0] if values else None
        elif prop_id in PROPERTY_LABELS:
            prop_name = PROPERTY_LABELS[prop_id]
            if prop_name not in ["instance_of", "country", "located_in",
                                 "coordinates", "inception", "dissolution",
                                 "official_website", "image", "logo"]:
                if values and values[0]:
                    enrichment.identifiers[prop_name] = str(values[0])

    return enrichment


def enrichment_to_dict(enrichment: WikidataEnrichment) -> Dict:
    """Convert a WikidataEnrichment to a dictionary for YAML output.

    Only non-empty fields are emitted, so the YAML stays compact.
    """
    result = {
        "wikidata_entity_id": enrichment.entity_id,
        "api_metadata": {
            "api_endpoint": WIKIDATA_REST_API,
            "fetch_timestamp": enrichment.fetch_timestamp,
            "user_agent": USER_AGENT,
        },
    }

    # Add labels (plus convenience copies for the languages this dataset
    # cares about).
    if enrichment.labels:
        result["wikidata_labels"] = enrichment.labels
        if "en" in enrichment.labels:
            result["wikidata_label_en"] = enrichment.labels["en"]
        if "nl" in enrichment.labels:
            result["wikidata_label_nl"] = enrichment.labels["nl"]
        if "ja" in enrichment.labels:
            result["wikidata_label_ja"] = enrichment.labels["ja"]

    # Add descriptions
    if enrichment.descriptions:
        result["wikidata_descriptions"] = enrichment.descriptions
        if "en" in enrichment.descriptions:
            result["wikidata_description_en"] = enrichment.descriptions["en"]

    # Add identifiers (drop empty values)
    if enrichment.identifiers:
        result["wikidata_identifiers"] = {
            k: v for k, v in enrichment.identifiers.items() if v
        }

    # Add instance types
    if enrichment.instance_of:
        result["wikidata_instance_of"] = enrichment.instance_of

    # Add location data
    if enrichment.country:
        result["wikidata_country"] = enrichment.country
    if enrichment.location:
        result["wikidata_located_in"] = enrichment.location
    if enrichment.coordinates:
        result["wikidata_coordinates"] = enrichment.coordinates

    # Add temporal data (the key field we're enriching!)
    if enrichment.inception:
        result["wikidata_inception"] = enrichment.inception
    if enrichment.dissolution:
        result["wikidata_dissolution"] = enrichment.dissolution

    # Add web presence
    if enrichment.official_website:
        result["wikidata_official_website"] = enrichment.official_website

    # Add media
    if enrichment.image:
        result["wikidata_image"] = enrichment.image
    if enrichment.logo:
        result["wikidata_logo"] = enrichment.logo

    return result


def _wikidata_id_from_identifiers(identifiers: List) -> Optional[str]:
    """Return the first identifier_value whose scheme is 'wikidata'."""
    for ident in identifiers:
        if isinstance(ident, dict):
            scheme = ident.get("identifier_scheme", "")
            if scheme.lower() == "wikidata":
                return ident.get("identifier_value")
    return None


def get_wikidata_entity_id(data: Dict) -> Optional[str]:
    """Extract the Wikidata entity ID from a custodian YAML record.

    Lookup order: wikidata_enrichment section, then the top-level
    identifiers array, then original_entry.identifiers.
    """
    # Check wikidata_enrichment section first
    wd = data.get("wikidata_enrichment", {})
    if wd and wd.get("wikidata_entity_id"):
        return wd.get("wikidata_entity_id")

    # Check identifiers array
    entity_id = _wikidata_id_from_identifiers(data.get("identifiers", []))
    if entity_id:
        return entity_id

    # Check original_entry identifiers
    original = data.get("original_entry", {})
    return _wikidata_id_from_identifiers(original.get("identifiers", []))


def has_wikidata_inception(data: Dict) -> bool:
    """Return True if the custodian record already has wikidata_inception."""
    wd = data.get("wikidata_enrichment", {})
    if wd and wd.get("wikidata_inception"):
        return True
    return False


def load_progress() -> Dict:
    """Load progress from the checkpoint file.

    A missing or unreadable checkpoint falls back to empty progress, but
    a corrupt file is logged so a wiped resume state is not silent.
    """
    if PROGRESS_FILE.exists():
        try:
            with open(PROGRESS_FILE, 'r') as f:
                return json.load(f)
        except Exception as e:
            logger.warning(f"Could not read progress file {PROGRESS_FILE}: {e}")
    return {"processed_files": [], "stats": {}}


def save_progress(progress: Dict):
    """Save progress to the checkpoint file (best-effort, errors logged)."""
    try:
        with open(PROGRESS_FILE, 'w') as f:
            json.dump(progress, f, indent=2)
    except Exception as e:
        logger.error(f"Failed to save progress: {e}")


def _collect_candidates(args, processed_files: set, stats: Dict) -> List:
    """Scan custodian YAML files and return (path, data, entity_id) tuples
    for every record that should be (re-)enriched. Updates stats in place."""
    pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
    yaml_files = sorted(CUSTODIAN_DIR.glob(pattern))
    logger.info(f"Found {len(yaml_files)} YAML files in {CUSTODIAN_DIR}")

    files_to_process = []
    for yaml_file in yaml_files:
        stats["total_scanned"] += 1

        # Skip if already processed in previous run
        if args.resume and yaml_file.name in processed_files:
            stats["skipped_already_processed"] += 1
            continue

        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                continue

            # Check if has wikidata_id
            entity_id = get_wikidata_entity_id(data)
            if not entity_id:
                stats["no_wikidata_id"] += 1
                continue

            # Check if already has inception; without --skip-existing the
            # record is still refreshed (counted in needs_enrichment too).
            if has_wikidata_inception(data):
                stats["already_has_inception"] += 1
                if args.skip_existing:
                    continue

            stats["needs_enrichment"] += 1
            files_to_process.append((yaml_file, data, entity_id))
        except Exception as e:
            logger.error(f"Error reading {yaml_file}: {e}")
            stats["errors"] += 1

    return files_to_process


def _enrich_files(files_to_process: List, progress: Dict,
                  processed_files: set, stats: Dict):
    """Fetch Wikidata data for each candidate and write the enriched YAML.

    Rate-limits every API request (including failed ones) and checkpoints
    progress every 10 files.
    """
    with httpx.Client(timeout=30.0) as client:
        for i, (yaml_file, data, entity_id) in enumerate(files_to_process):
            try:
                logger.info(f"[{i+1}/{len(files_to_process)}] Enriching {yaml_file.name} ({entity_id})")

                # Fetch Wikidata data
                entity_data = fetch_entity_data(entity_id, client)
                if entity_data is None:
                    logger.warning(f"  Could not fetch data for {entity_id}")
                    stats["errors"] += 1
                    # A failed request still consumed rate-limit budget:
                    # back off before trying the next entity (the original
                    # code skipped the delay here and could hammer the API).
                    time.sleep(REQUEST_DELAY)
                    continue

                # Parse enrichment
                enrichment = parse_entity_data(entity_id, entity_data)
                enrichment_dict = enrichment_to_dict(enrichment)

                # Update the YAML data
                data["wikidata_enrichment"] = enrichment_dict

                if enrichment.inception:
                    stats["enriched_with_inception"] += 1
                    logger.info(f"  Found inception: {enrichment.inception}")
                else:
                    stats["enriched_no_inception"] += 1
                    logger.info(f"  No inception found in Wikidata")

                # Write back to file
                with open(yaml_file, 'w', encoding='utf-8') as f:
                    yaml.dump(data, f, default_flow_style=False,
                              allow_unicode=True, sort_keys=False)

                # Update progress
                processed_files.add(yaml_file.name)
                progress["processed_files"] = list(processed_files)
                progress["stats"] = stats

                # Save progress periodically
                if (i + 1) % 10 == 0:
                    save_progress(progress)

                # Rate limiting
                time.sleep(REQUEST_DELAY)
            except Exception as e:
                logger.error(f"Error processing {yaml_file.name}: {e}")
                stats["errors"] += 1


def _log_summary(stats: Dict):
    """Print the final enrichment summary."""
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total files scanned: {stats['total_scanned']}")
    logger.info(f"Files needing enrichment: {stats['needs_enrichment']}")
    logger.info(f"Files already with inception: {stats['already_has_inception']}")
    logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")
    logger.info(f"Successfully enriched with inception: {stats['enriched_with_inception']}")
    logger.info(f"Enriched but no inception in Wikidata: {stats['enriched_no_inception']}")
    logger.info(f"Errors: {stats['errors']}")
    logger.info("=" * 60)


def main():
    """CLI entry point: parse arguments, collect candidates, enrich, report."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with Wikidata inception dates")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be enriched without modifying files")
    parser.add_argument("--limit", type=int, default=0,
                        help="Process only first N files (0 = no limit)")
    parser.add_argument("--country", type=str,
                        help="Only process files for country code XX (e.g., JP, CZ)")
    parser.add_argument("--skip-existing", action="store_true",
                        help="Skip files that already have wikidata_inception")
    parser.add_argument("--resume", action="store_true",
                        help="Resume from last checkpoint")
    args = parser.parse_args()

    # Load progress if resuming
    progress = load_progress() if args.resume else {"processed_files": [], "stats": {}}
    processed_files = set(progress.get("processed_files", []))

    # Statistics
    stats = {
        "total_scanned": 0,
        "needs_enrichment": 0,
        "already_has_inception": 0,
        "no_wikidata_id": 0,
        "enriched_with_inception": 0,
        "enriched_no_inception": 0,
        "errors": 0,
        "skipped_already_processed": 0,
    }

    files_to_process = _collect_candidates(args, processed_files, stats)

    logger.info(f"Files needing enrichment: {len(files_to_process)}")
    logger.info(f"Files already with inception: {stats['already_has_inception']}")
    logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")

    if args.limit > 0:
        files_to_process = files_to_process[:args.limit]
        logger.info(f"Limited to first {args.limit} files")

    if args.dry_run:
        logger.info("DRY RUN - No files will be modified")
        for yaml_file, _, entity_id in files_to_process[:20]:
            logger.info(f"  Would enrich: {yaml_file.name} ({entity_id})")
        if len(files_to_process) > 20:
            logger.info(f"  ... and {len(files_to_process) - 20} more")
        return

    _enrich_files(files_to_process, progress, processed_files, stats)

    # Save final progress
    save_progress(progress)

    _log_summary(stats)


if __name__ == "__main__":
    main()