#!/usr/bin/env python3
"""
Enrich custodian YAML files with FULL Wikidata data from detected entities.

This script captures ALL available Wikidata properties, not just a subset.
It resolves entity references (Q-numbers) to human-readable labels.

Enhanced from enrich_custodians_wikidata_inception.py to capture:
- All temporal data (P571 inception, P576 dissolution, P1619 official opening)
- All identifiers (ISIL, VIAF, GND, LCNAF, BnF, OSM, TripAdvisor, Google KG, etc.)
- All location data with resolved labels
- Field of work / subject areas
- Related entities (parent org, adjacent buildings, etc.)
- External URLs (blog, social media, etc.)

Usage:
    python scripts/enrich_custodians_wikidata_full.py [--dry-run] [--limit N] [--country XX]

Options:
    --dry-run     Show what would be enriched without modifying files
    --limit N     Process only first N files (for testing)
    --country XX  Only process files for country code XX (e.g., JP, CZ, NL)
    --force       Re-enrich even if already has wikidata_enrichment
    --resume      Resume from the last saved checkpoint

Environment Variables:
    WIKIDATA_API_TOKEN - Optional OAuth2 token for increased rate limits (5,000 req/hr)
"""
|
|
|
|
import argparse
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

import httpx
import yaml
|
|
|
# Set up logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration
|
|
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
|
|
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
|
|
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
|
PROGRESS_FILE = Path(__file__).parent.parent / "data" / "custodian" / ".full_enrichment_progress.json"
|
|
|
|
# Rate limiting
|
|
WIKIDATA_API_TOKEN = os.getenv("WIKIDATA_API_TOKEN", "")
|
|
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-data@example.com")
|
|
USER_AGENT = f"GLAMDataExtractor/1.1 ({WIKIMEDIA_CONTACT_EMAIL}) Python/httpx"
|
|
|
|
if WIKIDATA_API_TOKEN:
|
|
REQUEST_DELAY = 0.75 # ~4800 requests per hour
|
|
logger.info("Using authenticated mode: 5,000 req/hr limit")
|
|
else:
|
|
REQUEST_DELAY = 7.5 # ~480 requests per hour
|
|
logger.info("Using anonymous mode: 500 req/hr limit")
|
|
|
|
# HTTP Headers
|
|
HEADERS = {
|
|
"Accept": "application/json",
|
|
"User-Agent": USER_AGENT,
|
|
}
|
|
if WIKIDATA_API_TOKEN:
|
|
HEADERS["Authorization"] = f"Bearer {WIKIDATA_API_TOKEN}"
|
|
|
|
# COMPREHENSIVE Property mapping - capturing ALL useful properties for heritage institutions
|
|
# Organized by category for clarity
|
|
PROPERTY_MAPPING = {
|
|
# === TEMPORAL PROPERTIES ===
|
|
"P571": {"name": "inception", "type": "time", "category": "temporal"},
|
|
"P576": {"name": "dissolution", "type": "time", "category": "temporal"},
|
|
"P1619": {"name": "date_of_official_opening", "type": "time", "category": "temporal"},
|
|
"P580": {"name": "start_time", "type": "time", "category": "temporal"},
|
|
"P582": {"name": "end_time", "type": "time", "category": "temporal"},
|
|
|
|
# === ORGANIZATIONAL PROPERTIES ===
|
|
"P31": {"name": "instance_of", "type": "entity_list", "category": "classification"},
|
|
"P17": {"name": "country", "type": "entity", "category": "location"},
|
|
"P131": {"name": "located_in_admin_entity", "type": "entity", "category": "location"},
|
|
"P276": {"name": "location", "type": "entity", "category": "location"},
|
|
"P159": {"name": "headquarters_location", "type": "entity", "category": "location"},
|
|
"P625": {"name": "coordinates", "type": "coordinates", "category": "location"},
|
|
"P969": {"name": "located_at_street_address", "type": "string", "category": "location"},
|
|
"P281": {"name": "postal_code", "type": "string", "category": "location"},
|
|
"P749": {"name": "parent_organization", "type": "entity", "category": "organization"},
|
|
"P355": {"name": "subsidiary", "type": "entity_list", "category": "organization"},
|
|
"P361": {"name": "part_of", "type": "entity", "category": "organization"},
|
|
"P527": {"name": "has_parts", "type": "entity_list", "category": "organization"},
|
|
"P463": {"name": "member_of", "type": "entity_list", "category": "organization"},
|
|
"P101": {"name": "field_of_work", "type": "entity_list", "category": "classification"},
|
|
"P921": {"name": "main_subject", "type": "entity_list", "category": "classification"},
|
|
"P3032": {"name": "adjacent_building", "type": "entity", "category": "location"},
|
|
"P1435": {"name": "heritage_designation", "type": "entity_list", "category": "classification"},
|
|
"P112": {"name": "founded_by", "type": "entity_list", "category": "organization"},
|
|
"P169": {"name": "chief_executive_officer", "type": "entity", "category": "organization"},
|
|
"P488": {"name": "chairperson", "type": "entity", "category": "organization"},
|
|
|
|
# === IDENTIFIERS ===
|
|
"P791": {"name": "isil", "type": "string", "category": "identifier"},
|
|
"P214": {"name": "viaf", "type": "string", "category": "identifier"},
|
|
"P227": {"name": "gnd", "type": "string", "category": "identifier"},
|
|
"P244": {"name": "lcnaf", "type": "string", "category": "identifier"},
|
|
"P268": {"name": "bnf", "type": "string", "category": "identifier"},
|
|
"P269": {"name": "idref", "type": "string", "category": "identifier"},
|
|
"P213": {"name": "isni", "type": "string", "category": "identifier"},
|
|
"P1566": {"name": "geonames_id", "type": "string", "category": "identifier"},
|
|
"P349": {"name": "ndl_authority_id", "type": "string", "category": "identifier"},
|
|
"P271": {"name": "nacsis_cat_id", "type": "string", "category": "identifier"},
|
|
"P2671": {"name": "google_knowledge_graph_id", "type": "string", "category": "identifier"},
|
|
"P3134": {"name": "tripadvisor_id", "type": "string", "category": "identifier"},
|
|
"P11693": {"name": "openstreetmap_node_id", "type": "string", "category": "identifier"},
|
|
"P11496": {"name": "cinii_research_id", "type": "string", "category": "identifier"},
|
|
"P5587": {"name": "libris_uri", "type": "string", "category": "identifier"},
|
|
"P496": {"name": "orcid", "type": "string", "category": "identifier"},
|
|
"P1015": {"name": "noraf_id", "type": "string", "category": "identifier"},
|
|
"P1006": {"name": "nta_id", "type": "string", "category": "identifier"},
|
|
"P409": {"name": "nla_id", "type": "string", "category": "identifier"},
|
|
"P950": {"name": "bne_id", "type": "string", "category": "identifier"},
|
|
"P906": {"name": "selibr", "type": "string", "category": "identifier"},
|
|
"P1017": {"name": "bac_id", "type": "string", "category": "identifier"},
|
|
"P7859": {"name": "worldcat_identities_id", "type": "string", "category": "identifier"},
|
|
"P3500": {"name": "ringgold_id", "type": "string", "category": "identifier"},
|
|
"P2427": {"name": "grid_id", "type": "string", "category": "identifier"},
|
|
"P6782": {"name": "ror_id", "type": "string", "category": "identifier"},
|
|
"P3153": {"name": "crossref_funder_id", "type": "string", "category": "identifier"},
|
|
|
|
# === WEB PRESENCE ===
|
|
"P856": {"name": "official_website", "type": "url", "category": "web"},
|
|
"P1581": {"name": "official_blog_url", "type": "url", "category": "web"},
|
|
"P973": {"name": "described_at_url", "type": "url", "category": "web"},
|
|
"P2013": {"name": "facebook_id", "type": "string", "category": "social"},
|
|
"P2002": {"name": "twitter_username", "type": "string", "category": "social"},
|
|
"P2003": {"name": "instagram_username", "type": "string", "category": "social"},
|
|
"P2397": {"name": "youtube_channel_id", "type": "string", "category": "social"},
|
|
"P4264": {"name": "linkedin_company_id", "type": "string", "category": "social"},
|
|
"P4003": {"name": "facebook_page_id", "type": "string", "category": "social"},
|
|
"P8687": {"name": "social_media_followers", "type": "quantity", "category": "social"},
|
|
|
|
# === MEDIA ===
|
|
"P18": {"name": "image", "type": "commons_media", "category": "media"},
|
|
"P154": {"name": "logo", "type": "commons_media", "category": "media"},
|
|
"P41": {"name": "flag_image", "type": "commons_media", "category": "media"},
|
|
"P94": {"name": "coat_of_arms", "type": "commons_media", "category": "media"},
|
|
"P373": {"name": "commons_category", "type": "string", "category": "media"},
|
|
"P935": {"name": "commons_gallery", "type": "string", "category": "media"},
|
|
|
|
# === CONTACT ===
|
|
"P968": {"name": "email", "type": "string", "category": "contact"},
|
|
"P1329": {"name": "phone_number", "type": "string", "category": "contact"},
|
|
"P3740": {"name": "number_of_works", "type": "quantity", "category": "collection"},
|
|
"P1436": {"name": "collection_items_count", "type": "quantity", "category": "collection"},
|
|
|
|
# === AWARDS & RECOGNITION ===
|
|
"P166": {"name": "award_received", "type": "entity_list", "category": "recognition"},
|
|
|
|
# === ARCHITECTURE ===
|
|
"P149": {"name": "architectural_style", "type": "entity_list", "category": "architecture"},
|
|
"P84": {"name": "architect", "type": "entity_list", "category": "architecture"},
|
|
"P631": {"name": "structural_engineer", "type": "entity_list", "category": "architecture"},
|
|
}
|
|
|
|
|
|
@dataclass
|
|
class FullWikidataEnrichment:
|
|
"""Container for comprehensive Wikidata enrichment data."""
|
|
entity_id: str
|
|
labels: Dict[str, str] = field(default_factory=dict)
|
|
descriptions: Dict[str, str] = field(default_factory=dict)
|
|
aliases: Dict[str, List[str]] = field(default_factory=dict)
|
|
sitelinks: Dict[str, str] = field(default_factory=dict)
|
|
|
|
# All extracted properties organized by category
|
|
temporal: Dict[str, Any] = field(default_factory=dict)
|
|
classification: Dict[str, Any] = field(default_factory=dict)
|
|
location: Dict[str, Any] = field(default_factory=dict)
|
|
organization: Dict[str, Any] = field(default_factory=dict)
|
|
identifiers: Dict[str, str] = field(default_factory=dict)
|
|
web: Dict[str, str] = field(default_factory=dict)
|
|
social: Dict[str, str] = field(default_factory=dict)
|
|
media: Dict[str, str] = field(default_factory=dict)
|
|
contact: Dict[str, str] = field(default_factory=dict)
|
|
collection: Dict[str, Any] = field(default_factory=dict)
|
|
recognition: Dict[str, Any] = field(default_factory=dict)
|
|
architecture: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
# Metadata
|
|
fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
|
|
properties_found: List[str] = field(default_factory=list)
|
|
|
|
|
|
def extract_value_from_statement(statement: Dict) -> Any:
|
|
"""Extract the value from a Wikidata statement structure."""
|
|
try:
|
|
value_data = statement.get("value", {})
|
|
content = value_data.get("content")
|
|
|
|
if isinstance(content, dict):
|
|
if "entity-type" in content or "id" in content:
|
|
return content.get("id", content)
|
|
elif "time" in content:
|
|
time_val = content.get("time", "")
|
|
if time_val.startswith("+") or time_val.startswith("-"):
|
|
time_val = time_val[1:]
|
|
if "T" in time_val:
|
|
time_val = time_val.split("T")[0]
|
|
return time_val
|
|
elif "latitude" in content and "longitude" in content:
|
|
return {
|
|
"latitude": content.get("latitude"),
|
|
"longitude": content.get("longitude"),
|
|
"precision": content.get("precision")
|
|
}
|
|
elif "amount" in content:
|
|
return content.get("amount", "").lstrip("+")
|
|
else:
|
|
return content
|
|
else:
|
|
return content
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def fetch_entity_labels_batch(entity_ids: Set[str], client: httpx.Client) -> Dict[str, Dict[str, str]]:
|
|
"""Fetch labels for multiple entities in a batch using SPARQL."""
|
|
if not entity_ids:
|
|
return {}
|
|
|
|
# Limit batch size
|
|
entity_ids_list = list(entity_ids)[:50]
|
|
|
|
entity_values = " ".join([f"wd:{eid}" for eid in entity_ids_list])
|
|
query = f"""
|
|
SELECT ?entity ?entityLabel ?entityDescription WHERE {{
|
|
VALUES ?entity {{ {entity_values} }}
|
|
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,ja,nl,de,fr". }}
|
|
}}
|
|
"""
|
|
|
|
try:
|
|
response = client.get(
|
|
WIKIDATA_SPARQL_ENDPOINT,
|
|
params={"query": query, "format": "json"},
|
|
headers={"User-Agent": USER_AGENT, "Accept": "application/sparql-results+json"}
|
|
)
|
|
response.raise_for_status()
|
|
results = response.json()
|
|
|
|
labels = {}
|
|
for binding in results.get("results", {}).get("bindings", []):
|
|
entity_uri = binding.get("entity", {}).get("value", "")
|
|
entity_id = entity_uri.split("/")[-1] if entity_uri else None
|
|
if entity_id:
|
|
labels[entity_id] = {
|
|
"id": entity_id,
|
|
"label": binding.get("entityLabel", {}).get("value", entity_id),
|
|
"description": binding.get("entityDescription", {}).get("value", "")
|
|
}
|
|
return labels
|
|
except Exception as e:
|
|
logger.warning(f"SPARQL label fetch failed: {e}")
|
|
return {eid: {"id": eid, "label": eid} for eid in entity_ids_list}
|
|
|
|
|
|
def fetch_entity_data(entity_id: str, client: httpx.Client) -> Optional[Dict]:
|
|
"""Fetch full entity data from Wikibase REST API."""
|
|
url = f"{WIKIDATA_REST_API}/entities/items/{entity_id}"
|
|
|
|
try:
|
|
response = client.get(url, headers=HEADERS)
|
|
|
|
if response.status_code == 403:
|
|
headers_no_auth = {k: v for k, v in HEADERS.items() if k != "Authorization"}
|
|
response = client.get(url, headers=headers_no_auth)
|
|
|
|
response.raise_for_status()
|
|
return response.json()
|
|
|
|
except httpx.HTTPStatusError as e:
|
|
if e.response.status_code == 404:
|
|
logger.warning(f"Entity {entity_id} not found")
|
|
else:
|
|
logger.error(f"HTTP error fetching {entity_id}: {e}")
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Error fetching {entity_id}: {e}")
|
|
return None
|
|
|
|
|
|
def parse_entity_data_full(entity_id: str, data: Dict, client: httpx.Client) -> FullWikidataEnrichment:
|
|
"""Parse the full entity data with label resolution."""
|
|
enrichment = FullWikidataEnrichment(entity_id=entity_id)
|
|
|
|
# Extract labels
|
|
enrichment.labels = data.get("labels", {})
|
|
enrichment.descriptions = data.get("descriptions", {})
|
|
enrichment.aliases = data.get("aliases", {})
|
|
|
|
# Extract sitelinks
|
|
sitelinks = data.get("sitelinks", {})
|
|
enrichment.sitelinks = {k: v.get("title", "") for k, v in sitelinks.items() if isinstance(v, dict)}
|
|
|
|
# Collect entity IDs that need label resolution
|
|
entity_ids_to_resolve: Set[str] = set()
|
|
|
|
# Process all statements
|
|
statements = data.get("statements", {})
|
|
|
|
for prop_id, prop_statements in statements.items():
|
|
if not prop_statements:
|
|
continue
|
|
|
|
prop_config = PROPERTY_MAPPING.get(prop_id)
|
|
if not prop_config:
|
|
continue # Skip unknown properties
|
|
|
|
enrichment.properties_found.append(prop_id)
|
|
prop_name: str = prop_config["name"]
|
|
prop_type: str = prop_config["type"]
|
|
category: str = prop_config["category"]
|
|
|
|
values: List[Any] = []
|
|
for stmt in prop_statements:
|
|
value = extract_value_from_statement(stmt)
|
|
if value is not None:
|
|
values.append(value)
|
|
# Collect entity IDs for label resolution
|
|
if prop_type in ("entity", "entity_list") and isinstance(value, str) and value.startswith("Q"):
|
|
entity_ids_to_resolve.add(value)
|
|
|
|
if not values:
|
|
continue
|
|
|
|
# Store values in appropriate category
|
|
target_dict = getattr(enrichment, category, None)
|
|
if target_dict is None:
|
|
continue
|
|
|
|
if prop_type == "entity":
|
|
target_dict[prop_name] = values[0]
|
|
elif prop_type == "entity_list":
|
|
target_dict[prop_name] = values
|
|
elif prop_type in ("string", "url"):
|
|
target_dict[prop_name] = values[0] if len(values) == 1 else values
|
|
elif prop_type == "time":
|
|
target_dict[prop_name] = values[0]
|
|
elif prop_type == "coordinates":
|
|
target_dict[prop_name] = values[0]
|
|
elif prop_type == "commons_media":
|
|
target_dict[prop_name] = values[0]
|
|
elif prop_type == "quantity":
|
|
target_dict[prop_name] = values[0]
|
|
|
|
# Resolve entity labels
|
|
if entity_ids_to_resolve:
|
|
time.sleep(0.2) # Small delay before SPARQL query
|
|
labels_map = fetch_entity_labels_batch(entity_ids_to_resolve, client)
|
|
|
|
# Replace entity IDs with resolved labels
|
|
for category_name in ["classification", "location", "organization", "recognition", "architecture"]:
|
|
category_dict = getattr(enrichment, category_name, {})
|
|
for key, value in list(category_dict.items()):
|
|
if isinstance(value, str) and value in labels_map:
|
|
category_dict[key] = labels_map[value]
|
|
elif isinstance(value, list):
|
|
category_dict[key] = [
|
|
labels_map.get(v, {"id": v, "label": v}) if isinstance(v, str) and v.startswith("Q") else v
|
|
for v in value
|
|
]
|
|
|
|
return enrichment
|
|
|
|
|
|
def enrichment_to_dict(enrichment: FullWikidataEnrichment) -> Dict:
|
|
"""Convert FullWikidataEnrichment to a dictionary for YAML output."""
|
|
result = {
|
|
"wikidata_entity_id": enrichment.entity_id,
|
|
"api_metadata": {
|
|
"api_endpoint": WIKIDATA_REST_API,
|
|
"fetch_timestamp": enrichment.fetch_timestamp,
|
|
"user_agent": USER_AGENT,
|
|
"enrichment_version": "2.0_full",
|
|
"properties_found": enrichment.properties_found,
|
|
}
|
|
}
|
|
|
|
# Add labels
|
|
if enrichment.labels:
|
|
result["wikidata_labels"] = enrichment.labels
|
|
for lang in ["en", "nl", "ja", "de", "fr", "es"]:
|
|
if lang in enrichment.labels:
|
|
result[f"wikidata_label_{lang}"] = enrichment.labels[lang]
|
|
|
|
# Add descriptions
|
|
if enrichment.descriptions:
|
|
result["wikidata_descriptions"] = enrichment.descriptions
|
|
if "en" in enrichment.descriptions:
|
|
result["wikidata_description_en"] = enrichment.descriptions["en"]
|
|
|
|
# Add aliases
|
|
if enrichment.aliases:
|
|
result["wikidata_aliases"] = enrichment.aliases
|
|
|
|
# Add sitelinks (Wikipedia articles)
|
|
if enrichment.sitelinks:
|
|
result["wikidata_sitelinks"] = enrichment.sitelinks
|
|
|
|
# Add all category data with readable prefixes
|
|
if enrichment.temporal:
|
|
result["wikidata_temporal"] = enrichment.temporal
|
|
# Promote key dates to top level for easy access
|
|
if "inception" in enrichment.temporal:
|
|
result["wikidata_inception"] = enrichment.temporal["inception"]
|
|
if "dissolution" in enrichment.temporal:
|
|
result["wikidata_dissolution"] = enrichment.temporal["dissolution"]
|
|
if "date_of_official_opening" in enrichment.temporal:
|
|
result["wikidata_opening_date"] = enrichment.temporal["date_of_official_opening"]
|
|
|
|
if enrichment.classification:
|
|
result["wikidata_classification"] = enrichment.classification
|
|
if "instance_of" in enrichment.classification:
|
|
result["wikidata_instance_of"] = enrichment.classification["instance_of"]
|
|
if "field_of_work" in enrichment.classification:
|
|
result["wikidata_field_of_work"] = enrichment.classification["field_of_work"]
|
|
|
|
if enrichment.location:
|
|
result["wikidata_location"] = enrichment.location
|
|
if "country" in enrichment.location:
|
|
result["wikidata_country"] = enrichment.location["country"]
|
|
if "located_in_admin_entity" in enrichment.location:
|
|
result["wikidata_located_in"] = enrichment.location["located_in_admin_entity"]
|
|
if "coordinates" in enrichment.location:
|
|
result["wikidata_coordinates"] = enrichment.location["coordinates"]
|
|
|
|
if enrichment.organization:
|
|
result["wikidata_organization"] = enrichment.organization
|
|
|
|
if enrichment.identifiers:
|
|
result["wikidata_identifiers"] = enrichment.identifiers
|
|
|
|
if enrichment.web:
|
|
result["wikidata_web"] = enrichment.web
|
|
if "official_website" in enrichment.web:
|
|
result["wikidata_official_website"] = enrichment.web["official_website"]
|
|
|
|
if enrichment.social:
|
|
result["wikidata_social_media"] = enrichment.social
|
|
|
|
if enrichment.media:
|
|
result["wikidata_media"] = enrichment.media
|
|
if "image" in enrichment.media:
|
|
result["wikidata_image"] = enrichment.media["image"]
|
|
if "logo" in enrichment.media:
|
|
result["wikidata_logo"] = enrichment.media["logo"]
|
|
|
|
if enrichment.contact:
|
|
result["wikidata_contact"] = enrichment.contact
|
|
|
|
if enrichment.collection:
|
|
result["wikidata_collection"] = enrichment.collection
|
|
|
|
if enrichment.recognition:
|
|
result["wikidata_recognition"] = enrichment.recognition
|
|
|
|
if enrichment.architecture:
|
|
result["wikidata_architecture"] = enrichment.architecture
|
|
|
|
return result
|
|
|
|
|
|
def get_wikidata_entity_id(data: Dict) -> Optional[str]:
|
|
"""Extract Wikidata entity ID from a custodian YAML file."""
|
|
wd = data.get("wikidata_enrichment", {})
|
|
if wd and wd.get("wikidata_entity_id"):
|
|
return wd.get("wikidata_entity_id")
|
|
|
|
identifiers = data.get("identifiers", [])
|
|
for ident in identifiers:
|
|
if isinstance(ident, dict):
|
|
scheme = ident.get("identifier_scheme", "")
|
|
if scheme.lower() == "wikidata":
|
|
return ident.get("identifier_value")
|
|
|
|
original = data.get("original_entry", {})
|
|
for ident in original.get("identifiers", []):
|
|
if isinstance(ident, dict):
|
|
scheme = ident.get("identifier_scheme", "")
|
|
if scheme.lower() == "wikidata":
|
|
return ident.get("identifier_value")
|
|
|
|
return None
|
|
|
|
|
|
def is_fully_enriched(data: Dict) -> bool:
|
|
"""Check if file has been fully enriched with v2.0."""
|
|
wd = data.get("wikidata_enrichment", {})
|
|
api_meta = wd.get("api_metadata", {})
|
|
return api_meta.get("enrichment_version") == "2.0_full"
|
|
|
|
|
|
def load_progress() -> Dict:
|
|
"""Load progress from checkpoint file."""
|
|
if PROGRESS_FILE.exists():
|
|
try:
|
|
with open(PROGRESS_FILE, 'r') as f:
|
|
return json.load(f)
|
|
except Exception:
|
|
pass
|
|
return {"processed_files": [], "stats": {}}
|
|
|
|
|
|
def save_progress(progress: Dict):
|
|
"""Save progress to checkpoint file."""
|
|
try:
|
|
with open(PROGRESS_FILE, 'w') as f:
|
|
json.dump(progress, f, indent=2)
|
|
except Exception as e:
|
|
logger.error(f"Failed to save progress: {e}")
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Full Wikidata enrichment for custodian files")
|
|
parser.add_argument("--dry-run", action="store_true", help="Show what would be enriched without modifying files")
|
|
parser.add_argument("--limit", type=int, default=0, help="Process only first N files (0 = no limit)")
|
|
parser.add_argument("--country", type=str, help="Only process files for country code XX")
|
|
parser.add_argument("--force", action="store_true", help="Re-enrich even if already has v2.0 enrichment")
|
|
parser.add_argument("--resume", action="store_true", help="Resume from last checkpoint")
|
|
args = parser.parse_args()
|
|
|
|
progress = load_progress() if args.resume else {"processed_files": [], "stats": {}}
|
|
processed_files = set(progress.get("processed_files", []))
|
|
|
|
stats = {
|
|
"total_scanned": 0,
|
|
"needs_enrichment": 0,
|
|
"already_enriched_v2": 0,
|
|
"no_wikidata_id": 0,
|
|
"enriched_successfully": 0,
|
|
"errors": 0,
|
|
"skipped_already_processed": 0,
|
|
"properties_counts": {},
|
|
}
|
|
|
|
pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
|
|
yaml_files = sorted(CUSTODIAN_DIR.glob(pattern))
|
|
|
|
logger.info(f"Found {len(yaml_files)} YAML files in {CUSTODIAN_DIR}")
|
|
|
|
files_to_process = []
|
|
|
|
for yaml_file in yaml_files:
|
|
stats["total_scanned"] += 1
|
|
|
|
if args.resume and yaml_file.name in processed_files:
|
|
stats["skipped_already_processed"] += 1
|
|
continue
|
|
|
|
try:
|
|
with open(yaml_file, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
if not data:
|
|
continue
|
|
|
|
entity_id = get_wikidata_entity_id(data)
|
|
if not entity_id:
|
|
stats["no_wikidata_id"] += 1
|
|
continue
|
|
|
|
if not args.force and is_fully_enriched(data):
|
|
stats["already_enriched_v2"] += 1
|
|
continue
|
|
|
|
stats["needs_enrichment"] += 1
|
|
files_to_process.append((yaml_file, entity_id)) # Don't store data - re-read later
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error reading {yaml_file}: {e}")
|
|
stats["errors"] += 1
|
|
|
|
logger.info(f"Files needing enrichment: {len(files_to_process)}")
|
|
logger.info(f"Files already enriched (v2.0): {stats['already_enriched_v2']}")
|
|
logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")
|
|
|
|
if args.limit > 0:
|
|
files_to_process = files_to_process[:args.limit]
|
|
logger.info(f"Limited to first {args.limit} files")
|
|
|
|
if args.dry_run:
|
|
logger.info("DRY RUN - No files will be modified")
|
|
for yaml_file, entity_id in files_to_process[:20]:
|
|
logger.info(f" Would enrich: {yaml_file.name} ({entity_id})")
|
|
if len(files_to_process) > 20:
|
|
logger.info(f" ... and {len(files_to_process) - 20} more")
|
|
return
|
|
|
|
# CRITICAL KEYS that must NEVER be deleted during enrichment
|
|
# See AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
|
|
PROTECTED_KEYS = {'location', 'original_entry', 'ghcid', 'custodian_name', 'identifiers',
|
|
'provenance', 'ch_annotator', 'google_maps_enrichment', 'osm_enrichment',
|
|
'unesco_mow_enrichment', 'web_enrichment'}
|
|
|
|
with httpx.Client(timeout=30.0) as client:
|
|
for i, (yaml_file, _, entity_id) in enumerate(files_to_process):
|
|
try:
|
|
logger.info(f"[{i+1}/{len(files_to_process)}] Enriching {yaml_file.name} ({entity_id})")
|
|
|
|
# SAFETY FIX: Re-read the file immediately before modifying
|
|
# This prevents race conditions where another script modified the file
|
|
# between initial scan and enrichment
|
|
with open(yaml_file, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
if not data:
|
|
logger.warning(f" File is empty or invalid: {yaml_file.name}")
|
|
stats["errors"] += 1
|
|
continue
|
|
|
|
# Record which protected keys exist BEFORE modification
|
|
keys_before = set(data.keys())
|
|
protected_keys_before = keys_before & PROTECTED_KEYS
|
|
|
|
entity_data = fetch_entity_data(entity_id, client)
|
|
|
|
if entity_data is None:
|
|
logger.warning(f" Could not fetch data for {entity_id}")
|
|
stats["errors"] += 1
|
|
continue
|
|
|
|
enrichment = parse_entity_data_full(entity_id, entity_data, client)
|
|
enrichment_dict = enrichment_to_dict(enrichment)
|
|
|
|
data["wikidata_enrichment"] = enrichment_dict
|
|
|
|
# SAFETY CHECK: Verify no protected keys were lost
|
|
keys_after = set(data.keys())
|
|
protected_keys_after = keys_after & PROTECTED_KEYS
|
|
lost_keys = protected_keys_before - protected_keys_after
|
|
if lost_keys:
|
|
logger.error(f" CRITICAL: Protected keys lost during enrichment: {lost_keys}")
|
|
logger.error(f" Skipping file to prevent data loss!")
|
|
stats["errors"] += 1
|
|
continue
|
|
|
|
# Track property statistics
|
|
for prop in enrichment.properties_found:
|
|
stats["properties_counts"][prop] = stats["properties_counts"].get(prop, 0) + 1
|
|
|
|
stats["enriched_successfully"] += 1
|
|
|
|
# Log key findings
|
|
findings = []
|
|
if enrichment.temporal.get("inception"):
|
|
findings.append(f"inception: {enrichment.temporal['inception']}")
|
|
if enrichment.temporal.get("date_of_official_opening"):
|
|
findings.append(f"opened: {enrichment.temporal['date_of_official_opening']}")
|
|
if enrichment.classification.get("field_of_work"):
|
|
fow = enrichment.classification["field_of_work"]
|
|
if isinstance(fow, list) and fow:
|
|
label = fow[0].get("label", fow[0]) if isinstance(fow[0], dict) else fow[0]
|
|
findings.append(f"field: {label}")
|
|
if enrichment.identifiers:
|
|
findings.append(f"{len(enrichment.identifiers)} identifiers")
|
|
|
|
if findings:
|
|
logger.info(f" Found: {', '.join(findings)}")
|
|
|
|
with open(yaml_file, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
processed_files.add(yaml_file.name)
|
|
progress["processed_files"] = list(processed_files)
|
|
progress["stats"] = stats
|
|
|
|
if (i + 1) % 10 == 0:
|
|
save_progress(progress)
|
|
|
|
time.sleep(REQUEST_DELAY)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing {yaml_file.name}: {e}")
|
|
stats["errors"] += 1
|
|
|
|
save_progress(progress)
|
|
|
|
logger.info("\n" + "=" * 60)
|
|
logger.info("FULL ENRICHMENT COMPLETE")
|
|
logger.info("=" * 60)
|
|
logger.info(f"Total files scanned: {stats['total_scanned']}")
|
|
logger.info(f"Files needing enrichment: {stats['needs_enrichment']}")
|
|
logger.info(f"Already enriched (v2.0): {stats['already_enriched_v2']}")
|
|
logger.info(f"Files without Wikidata ID: {stats['no_wikidata_id']}")
|
|
logger.info(f"Successfully enriched: {stats['enriched_successfully']}")
|
|
logger.info(f"Errors: {stats['errors']}")
|
|
logger.info("")
|
|
logger.info("Top properties found:")
|
|
sorted_props = sorted(stats["properties_counts"].items(), key=lambda x: x[1], reverse=True)[:15]
|
|
for prop, count in sorted_props:
|
|
prop_name = PROPERTY_MAPPING.get(prop, {}).get("name", prop)
|
|
logger.info(f" {prop} ({prop_name}): {count}")
|
|
logger.info("=" * 60)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|