glam/scripts/enrich_custodians_wikidata_full.py
2025-12-09 12:25:16 +01:00

714 lines
31 KiB
Python

#!/usr/bin/env python3
"""
Enrich custodian YAML files with FULL Wikidata data from detected entities.
This script captures ALL available Wikidata properties, not just a subset.
It resolves entity references (Q-numbers) to human-readable labels.
Enhanced from enrich_custodians_wikidata_inception.py to capture:
- All temporal data (P571 inception, P576 dissolution, P1619 official opening)
- All identifiers (ISIL, VIAF, GND, LCNAF, BnF, OSM, TripAdvisor, Google KG, etc.)
- All location data with resolved labels
- Field of work / subject areas
- Related entities (parent org, adjacent buildings, etc.)
- External URLs (blog, social media, etc.)
Usage:
python scripts/enrich_custodians_wikidata_full.py [--dry-run] [--limit N] [--country XX]
Options:
--dry-run Show what would be enriched without modifying files
--limit N Process only first N files (for testing)
--country XX Only process files for country code XX (e.g., JP, CZ, NL)
--force Re-enrich even if already has wikidata_enrichment
Environment Variables:
WIKIDATA_API_TOKEN - Optional OAuth2 token for increased rate limits (5,000 req/hr)
"""
import argparse
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
import httpx
import yaml
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
# Checkpoint file for --resume (list of processed file names + stats).
PROGRESS_FILE = Path(__file__).parent.parent / "data" / "custodian" / ".full_enrichment_progress.json"

# Rate limiting: per-request sleep is chosen to stay under the Wikimedia
# API quota (authenticated vs anonymous clients have different limits).
WIKIDATA_API_TOKEN = os.getenv("WIKIDATA_API_TOKEN", "")
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-data@example.com")
# Wikimedia etiquette asks for a descriptive User-Agent with contact info.
USER_AGENT = f"GLAMDataExtractor/1.1 ({WIKIMEDIA_CONTACT_EMAIL}) Python/httpx"
if WIKIDATA_API_TOKEN:
    REQUEST_DELAY = 0.75  # ~4800 requests per hour
    logger.info("Using authenticated mode: 5,000 req/hr limit")
else:
    REQUEST_DELAY = 7.5  # ~480 requests per hour
    logger.info("Using anonymous mode: 500 req/hr limit")

# HTTP Headers
HEADERS = {
    "Accept": "application/json",
    "User-Agent": USER_AGENT,
}
if WIKIDATA_API_TOKEN:
    HEADERS["Authorization"] = f"Bearer {WIKIDATA_API_TOKEN}"
# COMPREHENSIVE Property mapping - capturing ALL useful properties for heritage institutions
# Organized by category for clarity
#
# Schema per entry:
#   key       - Wikidata property id (P-number)
#   "name"    - human-readable key used in the output YAML
#   "type"    - how extract_value_from_statement's result is stored
#               (entity / entity_list / time / string / url / coordinates /
#                commons_media / quantity)
#   "category" - FullWikidataEnrichment attribute the value is stored into
PROPERTY_MAPPING = {
    # === TEMPORAL PROPERTIES ===
    "P571": {"name": "inception", "type": "time", "category": "temporal"},
    "P576": {"name": "dissolution", "type": "time", "category": "temporal"},
    "P1619": {"name": "date_of_official_opening", "type": "time", "category": "temporal"},
    "P580": {"name": "start_time", "type": "time", "category": "temporal"},
    "P582": {"name": "end_time", "type": "time", "category": "temporal"},
    # === ORGANIZATIONAL PROPERTIES ===
    "P31": {"name": "instance_of", "type": "entity_list", "category": "classification"},
    "P17": {"name": "country", "type": "entity", "category": "location"},
    "P131": {"name": "located_in_admin_entity", "type": "entity", "category": "location"},
    "P276": {"name": "location", "type": "entity", "category": "location"},
    "P159": {"name": "headquarters_location", "type": "entity", "category": "location"},
    "P625": {"name": "coordinates", "type": "coordinates", "category": "location"},
    "P969": {"name": "located_at_street_address", "type": "string", "category": "location"},
    "P281": {"name": "postal_code", "type": "string", "category": "location"},
    "P749": {"name": "parent_organization", "type": "entity", "category": "organization"},
    "P355": {"name": "subsidiary", "type": "entity_list", "category": "organization"},
    "P361": {"name": "part_of", "type": "entity", "category": "organization"},
    "P527": {"name": "has_parts", "type": "entity_list", "category": "organization"},
    "P463": {"name": "member_of", "type": "entity_list", "category": "organization"},
    "P101": {"name": "field_of_work", "type": "entity_list", "category": "classification"},
    "P921": {"name": "main_subject", "type": "entity_list", "category": "classification"},
    "P3032": {"name": "adjacent_building", "type": "entity", "category": "location"},
    "P1435": {"name": "heritage_designation", "type": "entity_list", "category": "classification"},
    "P112": {"name": "founded_by", "type": "entity_list", "category": "organization"},
    "P169": {"name": "chief_executive_officer", "type": "entity", "category": "organization"},
    "P488": {"name": "chairperson", "type": "entity", "category": "organization"},
    # === IDENTIFIERS ===
    "P791": {"name": "isil", "type": "string", "category": "identifier"},
    "P214": {"name": "viaf", "type": "string", "category": "identifier"},
    "P227": {"name": "gnd", "type": "string", "category": "identifier"},
    "P244": {"name": "lcnaf", "type": "string", "category": "identifier"},
    "P268": {"name": "bnf", "type": "string", "category": "identifier"},
    "P269": {"name": "idref", "type": "string", "category": "identifier"},
    "P213": {"name": "isni", "type": "string", "category": "identifier"},
    "P1566": {"name": "geonames_id", "type": "string", "category": "identifier"},
    "P349": {"name": "ndl_authority_id", "type": "string", "category": "identifier"},
    "P271": {"name": "nacsis_cat_id", "type": "string", "category": "identifier"},
    "P2671": {"name": "google_knowledge_graph_id", "type": "string", "category": "identifier"},
    "P3134": {"name": "tripadvisor_id", "type": "string", "category": "identifier"},
    "P11693": {"name": "openstreetmap_node_id", "type": "string", "category": "identifier"},
    "P11496": {"name": "cinii_research_id", "type": "string", "category": "identifier"},
    "P5587": {"name": "libris_uri", "type": "string", "category": "identifier"},
    "P496": {"name": "orcid", "type": "string", "category": "identifier"},
    "P1015": {"name": "noraf_id", "type": "string", "category": "identifier"},
    "P1006": {"name": "nta_id", "type": "string", "category": "identifier"},
    "P409": {"name": "nla_id", "type": "string", "category": "identifier"},
    "P950": {"name": "bne_id", "type": "string", "category": "identifier"},
    "P906": {"name": "selibr", "type": "string", "category": "identifier"},
    "P1017": {"name": "bac_id", "type": "string", "category": "identifier"},
    "P7859": {"name": "worldcat_identities_id", "type": "string", "category": "identifier"},
    "P3500": {"name": "ringgold_id", "type": "string", "category": "identifier"},
    "P2427": {"name": "grid_id", "type": "string", "category": "identifier"},
    "P6782": {"name": "ror_id", "type": "string", "category": "identifier"},
    "P3153": {"name": "crossref_funder_id", "type": "string", "category": "identifier"},
    # === WEB PRESENCE ===
    "P856": {"name": "official_website", "type": "url", "category": "web"},
    "P1581": {"name": "official_blog_url", "type": "url", "category": "web"},
    "P973": {"name": "described_at_url", "type": "url", "category": "web"},
    "P2013": {"name": "facebook_id", "type": "string", "category": "social"},
    "P2002": {"name": "twitter_username", "type": "string", "category": "social"},
    "P2003": {"name": "instagram_username", "type": "string", "category": "social"},
    "P2397": {"name": "youtube_channel_id", "type": "string", "category": "social"},
    "P4264": {"name": "linkedin_company_id", "type": "string", "category": "social"},
    "P4003": {"name": "facebook_page_id", "type": "string", "category": "social"},
    "P8687": {"name": "social_media_followers", "type": "quantity", "category": "social"},
    # === MEDIA ===
    "P18": {"name": "image", "type": "commons_media", "category": "media"},
    "P154": {"name": "logo", "type": "commons_media", "category": "media"},
    "P41": {"name": "flag_image", "type": "commons_media", "category": "media"},
    "P94": {"name": "coat_of_arms", "type": "commons_media", "category": "media"},
    "P373": {"name": "commons_category", "type": "string", "category": "media"},
    "P935": {"name": "commons_gallery", "type": "string", "category": "media"},
    # === CONTACT ===
    "P968": {"name": "email", "type": "string", "category": "contact"},
    "P1329": {"name": "phone_number", "type": "string", "category": "contact"},
    "P3740": {"name": "number_of_works", "type": "quantity", "category": "collection"},
    "P1436": {"name": "collection_items_count", "type": "quantity", "category": "collection"},
    # === AWARDS & RECOGNITION ===
    "P166": {"name": "award_received", "type": "entity_list", "category": "recognition"},
    # === ARCHITECTURE ===
    "P149": {"name": "architectural_style", "type": "entity_list", "category": "architecture"},
    "P84": {"name": "architect", "type": "entity_list", "category": "architecture"},
    "P631": {"name": "structural_engineer", "type": "entity_list", "category": "architecture"},
}
@dataclass
class FullWikidataEnrichment:
    """Container for comprehensive Wikidata enrichment data.

    Populated by parse_entity_data_full(); each category dict below is keyed
    by the human-readable "name" values from PROPERTY_MAPPING.
    """
    entity_id: str  # Wikidata Q-number, e.g. "Q190804"
    # Language code -> label / description text.
    labels: Dict[str, str] = field(default_factory=dict)
    descriptions: Dict[str, str] = field(default_factory=dict)
    # Language code -> list of alternative names.
    aliases: Dict[str, List[str]] = field(default_factory=dict)
    # Wiki site key (e.g. "enwiki") -> page title.
    sitelinks: Dict[str, str] = field(default_factory=dict)
    # All extracted properties organized by category.
    # NOTE: attribute names must match the "category" values in
    # PROPERTY_MAPPING, since parse_entity_data_full stores via getattr().
    temporal: Dict[str, Any] = field(default_factory=dict)
    classification: Dict[str, Any] = field(default_factory=dict)
    location: Dict[str, Any] = field(default_factory=dict)
    organization: Dict[str, Any] = field(default_factory=dict)
    identifiers: Dict[str, str] = field(default_factory=dict)
    web: Dict[str, str] = field(default_factory=dict)
    social: Dict[str, str] = field(default_factory=dict)
    media: Dict[str, str] = field(default_factory=dict)
    contact: Dict[str, str] = field(default_factory=dict)
    collection: Dict[str, Any] = field(default_factory=dict)
    recognition: Dict[str, Any] = field(default_factory=dict)
    architecture: Dict[str, Any] = field(default_factory=dict)
    # Metadata
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    # P-ids (e.g. "P571") that had at least one statement on the entity.
    properties_found: List[str] = field(default_factory=list)
def extract_value_from_statement(statement: Dict) -> Any:
    """Extract a plain Python value from a Wikibase REST API statement.

    Handles the "content" shapes the REST API produces:
    - entity references -> the Q-id string
    - time values       -> "YYYY-MM-DD" (leading "+" era sign stripped,
                           leading "-" kept so BCE dates stay distinguishable)
    - globe coordinates -> {"latitude", "longitude", "precision"}
    - quantities        -> amount string without the leading "+"
    - anything else     -> returned as-is

    Returns:
        The extracted value, or None when the statement has no usable
        value or an unexpected shape raises while parsing.
    """
    try:
        content = statement.get("value", {}).get("content")
        if not isinstance(content, dict):
            # Plain scalar (string/url/commons media title) or missing value.
            return content
        if "entity-type" in content or "id" in content:
            return content.get("id", content)
        if "time" in content:
            time_val = content.get("time", "")
            # BUGFIX: only strip the explicit "+" era sign. A leading "-"
            # marks a BCE date; stripping it (as before) silently turned
            # BCE dates into CE dates.
            if time_val.startswith("+"):
                time_val = time_val[1:]
            # Keep just the date portion of "YYYY-MM-DDTHH:MM:SSZ".
            if "T" in time_val:
                time_val = time_val.split("T")[0]
            return time_val
        if "latitude" in content and "longitude" in content:
            return {
                "latitude": content.get("latitude"),
                "longitude": content.get("longitude"),
                "precision": content.get("precision")
            }
        if "amount" in content:
            return content.get("amount", "").lstrip("+")
        return content
    except Exception:
        # Statements occasionally carry shapes we don't model; treat as no value.
        return None
def fetch_entity_labels_batch(entity_ids: Set[str], client: httpx.Client) -> Dict[str, Dict[str, str]]:
    """Resolve labels and descriptions for multiple Q-ids with one SPARQL query.

    Args:
        entity_ids: Q-ids to resolve. Only the first 50 are queried (WDQS
            query-size safety cap); any extras are silently dropped and the
            caller falls back to the raw Q-id.
        client: shared httpx client used for the request.

    Returns:
        Mapping of Q-id -> {"id", "label", "description"}. On any failure a
        degraded mapping is returned where label == Q-id, so callers always
        get the same dict shape.
    """
    if not entity_ids:
        return {}
    # Limit batch size
    entity_ids_list = list(entity_ids)[:50]
    entity_values = " ".join([f"wd:{eid}" for eid in entity_ids_list])
    # The label service picks the first available language from the list.
    query = f"""
    SELECT ?entity ?entityLabel ?entityDescription WHERE {{
    VALUES ?entity {{ {entity_values} }}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,ja,nl,de,fr". }}
    }}
    """
    try:
        response = client.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={"query": query, "format": "json"},
            headers={"User-Agent": USER_AGENT, "Accept": "application/sparql-results+json"}
        )
        response.raise_for_status()
        results = response.json()
        labels = {}
        for binding in results.get("results", {}).get("bindings", []):
            # ?entity binds to a full URI; the Q-id is the last path segment.
            entity_uri = binding.get("entity", {}).get("value", "")
            entity_id = entity_uri.split("/")[-1] if entity_uri else None
            if entity_id:
                labels[entity_id] = {
                    "id": entity_id,
                    "label": binding.get("entityLabel", {}).get("value", entity_id),
                    "description": binding.get("entityDescription", {}).get("value", "")
                }
        return labels
    except Exception as e:
        logger.warning(f"SPARQL label fetch failed: {e}")
        # BUGFIX: keep the same dict shape as the success path — the old
        # fallback omitted "description", so downstream code indexing it
        # would KeyError only on the failure path.
        return {eid: {"id": eid, "label": eid, "description": ""} for eid in entity_ids_list}
def fetch_entity_data(entity_id: str, client: httpx.Client) -> Optional[Dict]:
    """Fetch one item's full JSON from the Wikibase REST API.

    A 403 response triggers a single anonymous retry (the bearer token may
    be rejected). Returns the decoded JSON body, or None on any failure.
    """
    url = f"{WIKIDATA_REST_API}/entities/items/{entity_id}"
    try:
        resp = client.get(url, headers=HEADERS)
        if resp.status_code == 403:
            # Retry once without the Authorization header.
            anon_headers = dict(HEADERS)
            anon_headers.pop("Authorization", None)
            resp = client.get(url, headers=anon_headers)
        resp.raise_for_status()
        return resp.json()
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 404:
            logger.warning(f"Entity {entity_id} not found")
        else:
            logger.error(f"HTTP error fetching {entity_id}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error fetching {entity_id}: {e}")
        return None
def parse_entity_data_full(entity_id: str, data: Dict, client: httpx.Client) -> FullWikidataEnrichment:
    """Parse the full entity data with label resolution.

    Walks every statement group in ``data`` (REST API item JSON), extracts
    values for properties listed in PROPERTY_MAPPING, stores them on the
    matching category attribute of a FullWikidataEnrichment, then resolves
    referenced Q-ids to {"id", "label", ...} dicts via one SPARQL batch.

    Args:
        entity_id: the Q-id the data belongs to.
        data: JSON body from fetch_entity_data().
        client: httpx client, reused for the label-resolution query.
    """
    enrichment = FullWikidataEnrichment(entity_id=entity_id)
    # Extract labels
    enrichment.labels = data.get("labels", {})
    enrichment.descriptions = data.get("descriptions", {})
    enrichment.aliases = data.get("aliases", {})
    # Extract sitelinks (site key -> article title only; drop badges/urls)
    sitelinks = data.get("sitelinks", {})
    enrichment.sitelinks = {k: v.get("title", "") for k, v in sitelinks.items() if isinstance(v, dict)}
    # Collect entity IDs that need label resolution
    entity_ids_to_resolve: Set[str] = set()
    # Process all statements
    statements = data.get("statements", {})
    for prop_id, prop_statements in statements.items():
        if not prop_statements:
            continue
        prop_config = PROPERTY_MAPPING.get(prop_id)
        if not prop_config:
            continue  # Skip unknown properties
        enrichment.properties_found.append(prop_id)
        prop_name: str = prop_config["name"]
        prop_type: str = prop_config["type"]
        category: str = prop_config["category"]
        values: List[Any] = []
        for stmt in prop_statements:
            value = extract_value_from_statement(stmt)
            if value is not None:
                values.append(value)
                # Collect entity IDs for label resolution
                if prop_type in ("entity", "entity_list") and isinstance(value, str) and value.startswith("Q"):
                    entity_ids_to_resolve.add(value)
        if not values:
            continue
        # Store values in appropriate category. NOTE: "identifier" category
        # entries in PROPERTY_MAPPING have no matching attribute name
        # ("identifiers" is the attribute), so getattr returns None and
        # those values are dropped here — TODO confirm this is intended.
        target_dict = getattr(enrichment, category, None)
        if target_dict is None:
            continue
        # Single-valued types keep the first statement; list types keep all.
        if prop_type == "entity":
            target_dict[prop_name] = values[0]
        elif prop_type == "entity_list":
            target_dict[prop_name] = values
        elif prop_type in ("string", "url"):
            target_dict[prop_name] = values[0] if len(values) == 1 else values
        elif prop_type == "time":
            target_dict[prop_name] = values[0]
        elif prop_type == "coordinates":
            target_dict[prop_name] = values[0]
        elif prop_type == "commons_media":
            target_dict[prop_name] = values[0]
        elif prop_type == "quantity":
            target_dict[prop_name] = values[0]
    # Resolve entity labels
    if entity_ids_to_resolve:
        time.sleep(0.2)  # Small delay before SPARQL query
        labels_map = fetch_entity_labels_batch(entity_ids_to_resolve, client)
        # Replace entity IDs with resolved labels, in place, for the
        # categories that can contain entity references.
        for category_name in ["classification", "location", "organization", "recognition", "architecture"]:
            category_dict = getattr(enrichment, category_name, {})
            for key, value in list(category_dict.items()):
                if isinstance(value, str) and value in labels_map:
                    category_dict[key] = labels_map[value]
                elif isinstance(value, list):
                    # Lists may mix Q-ids and already-plain values.
                    category_dict[key] = [
                        labels_map.get(v, {"id": v, "label": v}) if isinstance(v, str) and v.startswith("Q") else v
                        for v in value
                    ]
    return enrichment
def enrichment_to_dict(enrichment: FullWikidataEnrichment) -> Dict:
    """Flatten a FullWikidataEnrichment into the YAML-ready mapping.

    Key insertion order is deliberate (the YAML is dumped with
    sort_keys=False): metadata first, then labels / descriptions / aliases /
    sitelinks, then one section per populated category with selected values
    promoted to top-level ``wikidata_*`` keys for easy access.
    """
    result: Dict[str, Any] = {
        "wikidata_entity_id": enrichment.entity_id,
        "api_metadata": {
            "api_endpoint": WIKIDATA_REST_API,
            "fetch_timestamp": enrichment.fetch_timestamp,
            "user_agent": USER_AGENT,
            "enrichment_version": "2.0_full",
            "properties_found": enrichment.properties_found,
        },
    }
    if enrichment.labels:
        result["wikidata_labels"] = enrichment.labels
        # Promote a fixed set of languages to flat keys.
        for lang in ("en", "nl", "ja", "de", "fr", "es"):
            if lang in enrichment.labels:
                result[f"wikidata_label_{lang}"] = enrichment.labels[lang]
    if enrichment.descriptions:
        result["wikidata_descriptions"] = enrichment.descriptions
        if "en" in enrichment.descriptions:
            result["wikidata_description_en"] = enrichment.descriptions["en"]
    if enrichment.aliases:
        result["wikidata_aliases"] = enrichment.aliases
    if enrichment.sitelinks:
        result["wikidata_sitelinks"] = enrichment.sitelinks
    # (attribute name, output key, [(sub-key, promoted top-level key), ...])
    sections = [
        ("temporal", "wikidata_temporal", [
            ("inception", "wikidata_inception"),
            ("dissolution", "wikidata_dissolution"),
            ("date_of_official_opening", "wikidata_opening_date"),
        ]),
        ("classification", "wikidata_classification", [
            ("instance_of", "wikidata_instance_of"),
            ("field_of_work", "wikidata_field_of_work"),
        ]),
        ("location", "wikidata_location", [
            ("country", "wikidata_country"),
            ("located_in_admin_entity", "wikidata_located_in"),
            ("coordinates", "wikidata_coordinates"),
        ]),
        ("organization", "wikidata_organization", []),
        ("identifiers", "wikidata_identifiers", []),
        ("web", "wikidata_web", [
            ("official_website", "wikidata_official_website"),
        ]),
        ("social", "wikidata_social_media", []),
        ("media", "wikidata_media", [
            ("image", "wikidata_image"),
            ("logo", "wikidata_logo"),
        ]),
        ("contact", "wikidata_contact", []),
        ("collection", "wikidata_collection", []),
        ("recognition", "wikidata_recognition", []),
        ("architecture", "wikidata_architecture", []),
    ]
    for attr, out_key, promotions in sections:
        category = getattr(enrichment, attr)
        if not category:
            continue  # skip empty categories entirely
        result[out_key] = category
        for sub_key, top_key in promotions:
            if sub_key in category:
                result[top_key] = category[sub_key]
    return result
def _wikidata_id_from_identifiers(identifiers: Any) -> Optional[str]:
    """Return the value of the first identifier dict whose scheme is 'wikidata'."""
    # `identifiers` may be None (explicit YAML null) — treat as empty.
    for ident in identifiers or []:
        if isinstance(ident, dict):
            # Scheme may itself be an explicit null; normalize before lower().
            scheme = ident.get("identifier_scheme") or ""
            if scheme.lower() == "wikidata":
                return ident.get("identifier_value")
    return None


def get_wikidata_entity_id(data: Dict) -> Optional[str]:
    """Extract the Wikidata entity ID (Q-number) from a custodian record.

    Checks, in order: an existing wikidata_enrichment block, the top-level
    identifiers list, and the original_entry identifiers list.

    ROBUSTNESS FIX: tolerates explicit YAML nulls for wikidata_enrichment,
    identifiers, original_entry, and identifier_scheme (the previous version
    raised TypeError/AttributeError on such records).

    Returns:
        The Q-id string, or None when no Wikidata identifier is present.
    """
    wd = data.get("wikidata_enrichment") or {}
    if wd.get("wikidata_entity_id"):
        return wd.get("wikidata_entity_id")
    entity_id = _wikidata_id_from_identifiers(data.get("identifiers"))
    if entity_id:
        return entity_id
    original = data.get("original_entry") or {}
    return _wikidata_id_from_identifiers(original.get("identifiers"))
def is_fully_enriched(data: Dict) -> bool:
    """Check if a custodian record already carries version "2.0_full" enrichment.

    ROBUSTNESS FIX: tolerates explicit YAML nulls for wikidata_enrichment
    and api_metadata (the previous version raised AttributeError on them).
    """
    wd = data.get("wikidata_enrichment") or {}
    api_meta = wd.get("api_metadata") or {}
    return api_meta.get("enrichment_version") == "2.0_full"
def load_progress() -> Dict:
    """Load the resume checkpoint, falling back to a fresh state.

    Returns:
        {"processed_files": [...], "stats": {...}} — either the saved
        checkpoint or an empty starting state.
    """
    if PROGRESS_FILE.exists():
        try:
            with open(PROGRESS_FILE, 'r') as f:
                return json.load(f)
        except Exception as e:
            # BUGFIX: the old bare `pass` silently discarded a corrupt or
            # unreadable checkpoint; log why we are starting over so the
            # operator can investigate.
            logger.warning(f"Could not read progress file {PROGRESS_FILE}: {e}")
    return {"processed_files": [], "stats": {}}
def save_progress(progress: Dict):
    """Persist the resume checkpoint to PROGRESS_FILE as indented JSON.

    Best-effort: any failure (serialization or I/O) is logged and swallowed
    so a checkpoint problem never aborts the enrichment run.
    """
    try:
        serialized = json.dumps(progress, indent=2)
        with open(PROGRESS_FILE, 'w') as fh:
            fh.write(serialized)
    except Exception as e:
        logger.error(f"Failed to save progress: {e}")
def main():
    """CLI entry point: scan custodian YAML files, fetch full Wikidata data
    for each file's entity, and write it back under ``wikidata_enrichment``.

    Two phases: (1) a scan pass that collects (path, entity_id) pairs for
    files needing enrichment; (2) an enrichment pass that re-reads each file
    just before writing, verifies no protected keys are lost, and
    checkpoints progress every 10 files.
    """
    parser = argparse.ArgumentParser(description="Full Wikidata enrichment for custodian files")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be enriched without modifying files")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N files (0 = no limit)")
    parser.add_argument("--country", type=str, help="Only process files for country code XX")
    parser.add_argument("--force", action="store_true", help="Re-enrich even if already has v2.0 enrichment")
    parser.add_argument("--resume", action="store_true", help="Resume from last checkpoint")
    args = parser.parse_args()

    progress = load_progress() if args.resume else {"processed_files": [], "stats": {}}
    processed_files = set(progress.get("processed_files", []))
    stats = {
        "total_scanned": 0,
        "needs_enrichment": 0,
        "already_enriched_v2": 0,
        "no_wikidata_id": 0,
        "enriched_successfully": 0,
        "errors": 0,
        "skipped_already_processed": 0,
        "properties_counts": {},
    }

    # Custodian files are named "<COUNTRY>-....yaml".
    pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
    yaml_files = sorted(CUSTODIAN_DIR.glob(pattern))
    logger.info(f"Found {len(yaml_files)} YAML files in {CUSTODIAN_DIR}")

    # --- Phase 1: scan ---
    files_to_process = []
    for yaml_file in yaml_files:
        stats["total_scanned"] += 1
        if args.resume and yaml_file.name in processed_files:
            stats["skipped_already_processed"] += 1
            continue
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                continue
            entity_id = get_wikidata_entity_id(data)
            if not entity_id:
                stats["no_wikidata_id"] += 1
                continue
            if not args.force and is_fully_enriched(data):
                stats["already_enriched_v2"] += 1
                continue
            stats["needs_enrichment"] += 1
            files_to_process.append((yaml_file, entity_id))  # Don't store data - re-read later
        except Exception as e:
            logger.error(f"Error reading {yaml_file}: {e}")
            stats["errors"] += 1

    logger.info(f"Files needing enrichment: {len(files_to_process)}")
    logger.info(f"Files already enriched (v2.0): {stats['already_enriched_v2']}")
    logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")
    if args.limit > 0:
        files_to_process = files_to_process[:args.limit]
        logger.info(f"Limited to first {args.limit} files")
    if args.dry_run:
        logger.info("DRY RUN - No files will be modified")
        for yaml_file, entity_id in files_to_process[:20]:
            logger.info(f" Would enrich: {yaml_file.name} ({entity_id})")
        if len(files_to_process) > 20:
            logger.info(f" ... and {len(files_to_process) - 20} more")
        return

    # CRITICAL KEYS that must NEVER be deleted during enrichment
    # See AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
    PROTECTED_KEYS = {'location', 'original_entry', 'ghcid', 'custodian_name', 'identifiers',
                      'provenance', 'ch_annotator', 'google_maps_enrichment', 'osm_enrichment',
                      'unesco_mow_enrichment', 'web_enrichment'}

    # --- Phase 2: enrich ---
    with httpx.Client(timeout=30.0) as client:
        # BUGFIX: entries are (path, entity_id) 2-tuples (see phase 1);
        # the previous 3-name unpacking `(yaml_file, _, entity_id)` raised
        # ValueError on the very first iteration.
        for i, (yaml_file, entity_id) in enumerate(files_to_process):
            try:
                logger.info(f"[{i+1}/{len(files_to_process)}] Enriching {yaml_file.name} ({entity_id})")
                # SAFETY FIX: Re-read the file immediately before modifying.
                # This prevents race conditions where another script modified
                # the file between the initial scan and enrichment.
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    data = yaml.safe_load(f)
                if not data:
                    logger.warning(f" File is empty or invalid: {yaml_file.name}")
                    stats["errors"] += 1
                    continue
                # Record which protected keys exist BEFORE modification.
                keys_before = set(data.keys())
                protected_keys_before = keys_before & PROTECTED_KEYS
                entity_data = fetch_entity_data(entity_id, client)
                if entity_data is None:
                    logger.warning(f" Could not fetch data for {entity_id}")
                    stats["errors"] += 1
                    continue
                enrichment = parse_entity_data_full(entity_id, entity_data, client)
                data["wikidata_enrichment"] = enrichment_to_dict(enrichment)
                # SAFETY CHECK: Verify no protected keys were lost.
                keys_after = set(data.keys())
                protected_keys_after = keys_after & PROTECTED_KEYS
                lost_keys = protected_keys_before - protected_keys_after
                if lost_keys:
                    logger.error(f" CRITICAL: Protected keys lost during enrichment: {lost_keys}")
                    logger.error(f" Skipping file to prevent data loss!")
                    stats["errors"] += 1
                    continue
                # Track property statistics.
                for prop in enrichment.properties_found:
                    stats["properties_counts"][prop] = stats["properties_counts"].get(prop, 0) + 1
                stats["enriched_successfully"] += 1
                # Log key findings.
                findings = []
                if enrichment.temporal.get("inception"):
                    findings.append(f"inception: {enrichment.temporal['inception']}")
                if enrichment.temporal.get("date_of_official_opening"):
                    findings.append(f"opened: {enrichment.temporal['date_of_official_opening']}")
                if enrichment.classification.get("field_of_work"):
                    fow = enrichment.classification["field_of_work"]
                    if isinstance(fow, list) and fow:
                        # Entries may be resolved label dicts or raw strings.
                        label = fow[0].get("label", fow[0]) if isinstance(fow[0], dict) else fow[0]
                        findings.append(f"field: {label}")
                if enrichment.identifiers:
                    findings.append(f"{len(enrichment.identifiers)} identifiers")
                if findings:
                    logger.info(f" Found: {', '.join(findings)}")
                with open(yaml_file, 'w', encoding='utf-8') as f:
                    yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
                processed_files.add(yaml_file.name)
                progress["processed_files"] = list(processed_files)
                progress["stats"] = stats
                # Checkpoint every 10 files so a crash loses little work.
                if (i + 1) % 10 == 0:
                    save_progress(progress)
                # Respect the Wikimedia rate limit between entity fetches.
                time.sleep(REQUEST_DELAY)
            except Exception as e:
                logger.error(f"Error processing {yaml_file.name}: {e}")
                stats["errors"] += 1

    save_progress(progress)

    # --- Summary ---
    logger.info("\n" + "=" * 60)
    logger.info("FULL ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total files scanned: {stats['total_scanned']}")
    logger.info(f"Files needing enrichment: {stats['needs_enrichment']}")
    logger.info(f"Already enriched (v2.0): {stats['already_enriched_v2']}")
    logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")
    logger.info(f"Successfully enriched: {stats['enriched_successfully']}")
    logger.info(f"Errors: {stats['errors']}")
    logger.info("")
    logger.info("Top properties found:")
    sorted_props = sorted(stats["properties_counts"].items(), key=lambda x: x[1], reverse=True)[:15]
    for prop, count in sorted_props:
        prop_name = PROPERTY_MAPPING.get(prop, {}).get("name", prop)
        logger.info(f" {prop} ({prop_name}): {count}")
    logger.info("=" * 60)
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()