#!/usr/bin/env python3
"""
UNESCO Memory of the World (MoW) Enrichment Script

Enriches custodian YAML files with UNESCO Memory of the World inscription data.

Unlike ICH (country-based matching), MoW uses Wikidata ID matching to find
custodians that are directly listed as holding institutions for MoW inscriptions.

Data source: Wikidata SPARQL (UNESCO has no MoW API)
- P1435 = heritage designation
- Q16024238 = Memory of the World International Register
- P276 = location (holding institution)

This enables precise matching: if a custodian's Wikidata ID appears as P276
(location) on a MoW inscription, we have a direct link.

Usage:
    python scripts/enrich_unesco_mow.py [--dry-run] [--limit N] [--country CC]
    python scripts/enrich_unesco_mow.py --refresh-cache
    python scripts/enrich_unesco_mow.py --stats
"""

import argparse
import json
import logging
import sys
import urllib.request
import urllib.error
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List, Set

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    import yaml
except ImportError:
    print("ERROR: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
CACHE_DIR = Path(__file__).parent.parent / "data" / "cache"
CACHE_FILE = CACHE_DIR / "unesco_mow_wikidata.json"
CACHE_MAX_AGE_DAYS = 7
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
# SPARQL query to get all Memory of the World inscriptions with location (custodian) data
MOW_SPARQL_QUERY = """
SELECT DISTINCT
  ?item ?itemLabel
  ?countryLabel ?country
  ?location ?locationLabel
  ?inscriptionYear
  ?desc
  ?image
WHERE {
  ?item wdt:P1435 wd:Q16024238 .  # Memory of the World International Register
  OPTIONAL {
    ?item wdt:P17 ?country .
  }
  OPTIONAL {
    ?item wdt:P276 ?location .  # Critical: this links to custodian institutions!
  }
  OPTIONAL {
    ?item wdt:P571 ?inception .
    BIND(YEAR(?inception) AS ?inscriptionYear)
  }
  OPTIONAL {
    ?item schema:description ?desc .
    FILTER(LANG(?desc) = "en")
  }
  OPTIONAL {
    ?item wdt:P18 ?image .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en,nl,de,fr,es,pt" . }
}
ORDER BY ?itemLabel
"""

class UNESCOMoWEnricher:
    """Enriches custodian files with UNESCO Memory of the World data."""

    def __init__(self, dry_run: bool = False):
        self.dry_run = dry_run
        self.mow_data: List[Dict] = []
        self.inscriptions_by_custodian: Dict[str, List[Dict]] = {}  # Wikidata ID -> list of inscriptions
        self.inscriptions_by_country: Dict[str, List[Dict]] = {}  # Country code -> list of inscriptions
        self.stats = {
            "inscriptions_fetched": 0,
            "inscriptions_with_location": 0,
            "unique_custodians_in_mow": 0,
            "countries_covered": 0,
            "custodians_processed": 0,
            "custodians_with_wikidata": 0,
            "custodians_matched_by_wikidata": 0,
            "custodians_enriched": 0,
            "mow_references_added": 0,
            "errors": 0
        }
    def fetch_mow_data(self, force_refresh: bool = False) -> List[Dict]:
        """Fetch MoW data from Wikidata SPARQL or cache."""
        CACHE_DIR.mkdir(parents=True, exist_ok=True)

        # Check cache
        if CACHE_FILE.exists() and not force_refresh:
            cache_age = datetime.now() - datetime.fromtimestamp(CACHE_FILE.stat().st_mtime)
            if cache_age.days < CACHE_MAX_AGE_DAYS:
                logger.info(f"📁 Loading cached MoW data ({cache_age.days}d {cache_age.seconds // 3600}h old)")
                with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                    cache_data = json.load(f)
                self.mow_data = cache_data.get('inscriptions', [])
                logger.info(f"   Loaded {len(self.mow_data)} cached MoW inscriptions")
                self._build_indices()
                return self.mow_data

        # Fetch from Wikidata SPARQL
        logger.info("🌍 Fetching Memory of the World data from Wikidata SPARQL...")
        try:
            # Encode query for URL
            encoded_query = urllib.parse.urlencode({'query': MOW_SPARQL_QUERY})
            url = f"{WIKIDATA_SPARQL_ENDPOINT}?{encoded_query}"
            req = urllib.request.Request(
                url,
                headers={
                    'User-Agent': 'GLAM-Heritage-Enricher/1.0 (glam-data@example.com)',
                    'Accept': 'application/json'
                }
            )
            with urllib.request.urlopen(req, timeout=120) as response:
                result = json.loads(response.read().decode('utf-8'))
        except urllib.error.URLError as e:
            logger.error(f"Failed to fetch MoW data: {e}")
            raise

        # Parse SPARQL results
        bindings = result.get('results', {}).get('bindings', [])

        # Group by item (inscription) since one inscription may have multiple locations
        inscriptions_map: Dict[str, Dict] = {}
        for binding in bindings:
            item_uri = binding.get('item', {}).get('value', '')
            item_id = item_uri.split('/')[-1] if item_uri else ''
            if not item_id:
                continue
            if item_id not in inscriptions_map:
                inscriptions_map[item_id] = {
                    'wikidata_id': item_id,
                    'wikidata_url': item_uri,
                    'name': binding.get('itemLabel', {}).get('value', ''),
                    'description': binding.get('desc', {}).get('value', ''),
                    'country': binding.get('countryLabel', {}).get('value', ''),
                    'country_id': self._extract_id(binding.get('country', {}).get('value', '')),
                    'inscription_year': self._parse_year(binding.get('inscriptionYear', {}).get('value', '')),
                    'image_url': binding.get('image', {}).get('value', ''),
                    'locations': []  # Custodian institutions
                }
            # Add location if present (deduplicated by Wikidata ID)
            location_uri = binding.get('location', {}).get('value', '')
            location_id = self._extract_id(location_uri)
            location_label = binding.get('locationLabel', {}).get('value', '')
            if location_id and location_id not in [loc['wikidata_id'] for loc in inscriptions_map[item_id]['locations']]:
                inscriptions_map[item_id]['locations'].append({
                    'wikidata_id': location_id,
                    'wikidata_url': location_uri,
                    'name': location_label
                })

        self.mow_data = list(inscriptions_map.values())
        self.stats["inscriptions_fetched"] = len(self.mow_data)
        with_location = sum(1 for i in self.mow_data if i['locations'])
        self.stats["inscriptions_with_location"] = with_location
        logger.info(f"✅ Fetched {len(self.mow_data)} MoW inscriptions ({with_location} with custodian locations)")

        # Cache the data
        cache_data = {
            'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
            'inscriptions': self.mow_data
        }
        with open(CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=2)
        logger.info(f"   Cached to {CACHE_FILE}")

        self._build_indices()
        return self.mow_data
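
    # The cache file written above follows this JSON shape (a sketch based on
    # the dicts built in fetch_mow_data(); values are illustrative):
    #
    #   {
    #     "fetch_timestamp": "2025-01-01T00:00:00+00:00",
    #     "inscriptions": [
    #       {
    #         "wikidata_id": "Q123",
    #         "name": "Example archive collection",
    #         "country": "Netherlands",
    #         "inscription_year": 2005,
    #         "locations": [{"wikidata_id": "Q456", "name": "Example National Library"}]
    #       }
    #     ]
    #   }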

    def _extract_id(self, uri: str) -> str:
        """Extract Wikidata ID from URI."""
        if uri:
            return uri.split('/')[-1]
        return ''

    def _parse_year(self, year_str: str) -> Optional[int]:
        """Parse year from string."""
        if year_str:
            try:
                return int(year_str)
            except ValueError:
                pass
        return None

    def _build_indices(self):
        """Build indices for fast lookup by custodian Wikidata ID and country."""
        self.inscriptions_by_custodian = {}
        self.inscriptions_by_country = {}
        unique_custodians: Set[str] = set()
        for inscription in self.mow_data:
            # Index by custodian Wikidata ID
            for location in inscription.get('locations', []):
                custodian_id = location.get('wikidata_id', '')
                if custodian_id:
                    unique_custodians.add(custodian_id)
                    if custodian_id not in self.inscriptions_by_custodian:
                        self.inscriptions_by_custodian[custodian_id] = []
                    # Add inscription with specific location info
                    inscription_copy = inscription.copy()
                    inscription_copy['matched_location'] = location
                    self.inscriptions_by_custodian[custodian_id].append(inscription_copy)
            # Index by country (using ISO code mapping)
            country_code = self._country_to_code(inscription.get('country', ''))
            if country_code:
                if country_code not in self.inscriptions_by_country:
                    self.inscriptions_by_country[country_code] = []
                self.inscriptions_by_country[country_code].append(inscription)
        self.stats["unique_custodians_in_mow"] = len(unique_custodians)
        self.stats["countries_covered"] = len(self.inscriptions_by_country)
        logger.info(f"   Indexed {len(unique_custodians)} unique custodian institutions")
        logger.info(f"   Indexed {len(self.inscriptions_by_country)} countries with MoW inscriptions")

    def _country_to_code(self, country_name: str) -> Optional[str]:
        """Convert a country name to an ISO 3166-1 alpha-2 code."""
        # Simple mapping for common countries; unmapped names return None and
        # are left out of the country index.
        country_map = {
            "netherlands": "NL",
            "belgium": "BE",
            "germany": "DE",
            "france": "FR",
            "united kingdom": "GB",
            "united states of america": "US",
            "united states": "US",
            "japan": "JP",
            "china": "CN",
            "india": "IN",
            "brazil": "BR",
            "mexico": "MX",
            "spain": "ES",
            "italy": "IT",
            "portugal": "PT",
            "poland": "PL",
            "austria": "AT",
            "switzerland": "CH",
            "australia": "AU",
            "new zealand": "NZ",
            "canada": "CA",
            "south africa": "ZA",
            "egypt": "EG",
            "russia": "RU",
            "russian federation": "RU",
            "korea": "KR",
            "republic of korea": "KR",
            "south korea": "KR",
            "indonesia": "ID",
            "philippines": "PH",
            "thailand": "TH",
            "vietnam": "VN",
            "viet nam": "VN",
            "peru": "PE",
            "argentina": "AR",
            "colombia": "CO",
            "chile": "CL",
            "czech republic": "CZ",
            "czechia": "CZ",
            "hungary": "HU",
            "greece": "GR",
            "turkey": "TR",
            "iran": "IR",
            "iran (islamic republic of)": "IR",
            "israel": "IL",
            "morocco": "MA",
            "tunisia": "TN",
            "kenya": "KE",
            "nigeria": "NG",
            "ethiopia": "ET",
            "pakistan": "PK",
            "bangladesh": "BD",
            "malaysia": "MY",
            "singapore": "SG",
            "norway": "NO",
            "sweden": "SE",
            "denmark": "DK",
            "finland": "FI",
            "ireland": "IE",
            "luxembourg": "LU",
            "croatia": "HR",
            "slovenia": "SI",
            "slovakia": "SK",
            "romania": "RO",
            "bulgaria": "BG",
            "ukraine": "UA",
            "latvia": "LV",
            "lithuania": "LT",
            "estonia": "EE",
        }
        return country_map.get(country_name.lower().strip())

    def get_wikidata_id_from_custodian(self, data: dict) -> Optional[str]:
        """Extract Wikidata ID from custodian data."""
        # Try wikidata_enrichment first ("or {}" guards against null YAML sections)
        wikidata_id = (data.get('wikidata_enrichment') or {}).get('wikidata_entity_id')
        if wikidata_id:
            return wikidata_id
        # Try original_entry
        wikidata_id = (data.get('original_entry') or {}).get('wikidata_id')
        if wikidata_id:
            return wikidata_id
        # Try identifiers list
        for identifier in data.get('identifiers') or []:
            if identifier.get('identifier_scheme') == 'Wikidata':
                return identifier.get('identifier_value')
        return None
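
    # The three fallbacks above correspond to custodian YAML fragments like
    # these (field names taken from the lookups; values illustrative):
    #
    #   wikidata_enrichment:
    #     wikidata_entity_id: Q123
    #
    #   original_entry:
    #     wikidata_id: Q123
    #
    #   identifiers:
    #     - identifier_scheme: Wikidata
    #       identifier_value: Q123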

    def get_country_from_custodian(self, data: dict) -> Optional[str]:
        """Extract country code from custodian data."""
        # Try GHCID first (most reliable)
        ghcid = (data.get('ghcid') or {}).get('ghcid_current', '')
        if ghcid and len(ghcid) >= 2:
            return ghcid[:2].upper()
        # Try location_resolution
        loc_res = (data.get('ghcid') or {}).get('location_resolution', {})
        if isinstance(loc_res, dict):
            country = loc_res.get('country_code')
            if country:
                return country.upper()
        return None
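
    # Note: the GHCID fallback assumes identifiers start with the ISO country
    # code, e.g. a (hypothetical) ghcid_current of "NL-0001" yields "NL".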

    def enrich_custodian(self, file_path: Path) -> bool:
        """Enrich a single custodian file with MoW data."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                return False

            # Get Wikidata ID
            wikidata_id = self.get_wikidata_id_from_custodian(data)
            if wikidata_id:
                self.stats["custodians_with_wikidata"] += 1
            country_code = self.get_country_from_custodian(data)

            # Look up MoW inscriptions where this custodian is listed as location
            matched_inscriptions = []
            match_method = None
            if wikidata_id and wikidata_id in self.inscriptions_by_custodian:
                matched_inscriptions = self.inscriptions_by_custodian[wikidata_id]
                match_method = "wikidata_id_direct"
                self.stats["custodians_matched_by_wikidata"] += 1
            if not matched_inscriptions:
                return False

            # Get custodian name for logging ("or {}" guards against null YAML sections)
            custodian_name = (
                (data.get('custodian_name') or {}).get('claim_value', '') or
                (data.get('wikidata_enrichment') or {}).get('wikidata_label_en', '') or
                (data.get('original_entry') or {}).get('organisatie', '') or
                (data.get('google_maps_enrichment') or {}).get('name', '') or
                file_path.stem
            )

            # Build enrichment data
            mow_enrichment = {
                "match_method": match_method,
                "custodian_wikidata_id": wikidata_id,
                "total_inscriptions_held": len(matched_inscriptions),
                "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
                "inscriptions": []
            }

            # Add inscription details (sorted by year, most recent first)
            sorted_inscriptions = sorted(
                matched_inscriptions,
                key=lambda x: -(x.get('inscription_year') or 0)
            )
            for inscription in sorted_inscriptions[:20]:  # Limit to 20 inscriptions
                desc = inscription.get("description", "")
                if len(desc) > 400:
                    desc = desc[:400] + '...'
                mow_enrichment["inscriptions"].append({
                    "wikidata_id": inscription["wikidata_id"],
                    "name": inscription["name"],
                    "description": desc,
                    "country": inscription.get("country", ""),
                    "inscription_year": inscription.get("inscription_year"),
                    "wikidata_url": inscription.get("wikidata_url", ""),
                    # Generic URL, as UNESCO has no item-level MoW URLs
                    "unesco_mow_url": "https://www.unesco.org/en/memory-world",
                    "image_url": inscription.get("image_url", "")
                })

            # Update data
            data['unesco_mow_enrichment'] = mow_enrichment
            self.stats["mow_references_added"] += len(mow_enrichment["inscriptions"])

            # Log a short summary
            logger.info(f"📜 {custodian_name}: {len(mow_enrichment['inscriptions'])} MoW inscriptions (matched by {match_method})")
            for inscription in mow_enrichment["inscriptions"][:3]:
                year_str = f", {inscription['inscription_year']}" if inscription.get('inscription_year') else ""
                name = inscription['name']
                if len(name) > 60:
                    name = name[:60] + '...'
                logger.info(f"   📜 {name} ({inscription['country']}{year_str})")

            if self.dry_run:
                logger.info("   [DRY RUN - not saving]")
                return True

            # Save updated file
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
            self.stats["custodians_enriched"] += 1
            return True
        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["errors"] += 1
            return False
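
    # On success, the custodian YAML gains a block like this (a sketch
    # following the dict built above; values are illustrative):
    #
    #   unesco_mow_enrichment:
    #     match_method: wikidata_id_direct
    #     custodian_wikidata_id: Q456
    #     total_inscriptions_held: 1
    #     enrichment_timestamp: '2025-01-01T00:00:00+00:00'
    #     inscriptions:
    #       - wikidata_id: Q123
    #         name: Example archive collection
    #         country: Netherlands
    #         inscription_year: 2005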

    def enrich_all(self, limit: Optional[int] = None, country_filter: Optional[str] = None):
        """Enrich all custodian files."""
        # Fetch MoW data first
        self.fetch_mow_data()

        # Find all custodian files
        if not CUSTODIAN_DIR.exists():
            logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
            return
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if country_filter:
            files = [f for f in files if f.name.startswith(country_filter.upper())]
            logger.info(f"Filtering to country: {country_filter.upper()}")
        if limit:
            files = files[:limit]

        logger.info(f"\n📂 Processing {len(files)} custodian files...")
        for file_path in files:
            self.stats["custodians_processed"] += 1
            self.enrich_custodian(file_path)

        self.print_stats()

    def print_stats(self):
        """Print enrichment statistics."""
        print("\n" + "=" * 65)
        print("UNESCO MEMORY OF THE WORLD ENRICHMENT STATISTICS")
        print("=" * 65)
        print(f"MoW inscriptions fetched: {self.stats['inscriptions_fetched']}")
        print(f"Inscriptions with custodian locations: {self.stats['inscriptions_with_location']}")
        print(f"Unique custodian institutions in MoW: {self.stats['unique_custodians_in_mow']}")
        print(f"Countries with MoW inscriptions: {self.stats['countries_covered']}")
        print("-" * 65)
        print(f"Custodian files processed: {self.stats['custodians_processed']}")
        print(f"Custodians with Wikidata ID: {self.stats['custodians_with_wikidata']}")
        print(f"Custodians matched by Wikidata ID: {self.stats['custodians_matched_by_wikidata']}")
        print(f"Custodians enriched: {self.stats['custodians_enriched']}")
        print(f"Total MoW references added: {self.stats['mow_references_added']}")
        print(f"Errors: {self.stats['errors']}")
        print("=" * 65)

    def show_stats_only(self):
        """Show MoW statistics without enriching."""
        self.fetch_mow_data()
        print("\n" + "=" * 65)
        print("UNESCO MEMORY OF THE WORLD STATISTICS (via Wikidata)")
        print("=" * 65)
        print(f"\nTotal inscriptions: {len(self.mow_data)}")
        print(f"Inscriptions with custodian (P276): {self.stats['inscriptions_with_location']}")
        print(f"Unique custodian institutions: {self.stats['unique_custodians_in_mow']}")

        # Top countries
        print("\nTop 15 Countries by MoW Inscriptions:")
        sorted_countries = sorted(
            self.inscriptions_by_country.items(),
            key=lambda x: -len(x[1])
        )[:15]
        for country, inscriptions in sorted_countries:
            print(f"  {country}: {len(inscriptions)} inscriptions")

        # Inscriptions by year
        year_counts = {}
        for inscription in self.mow_data:
            year = inscription.get('inscription_year')
            if year:
                year_counts[year] = year_counts.get(year, 0) + 1
        print("\nRecent Inscriptions by Year:")
        for year in sorted(year_counts.keys(), reverse=True)[:10]:
            print(f"  {year}: {year_counts[year]} inscriptions")

        # Sample custodians (institutions holding MoW inscriptions)
        print("\nSample Custodian Institutions (top 10 by inscription count):")
        sorted_custodians = sorted(
            self.inscriptions_by_custodian.items(),
            key=lambda x: -len(x[1])
        )[:10]
        for custodian_id, inscriptions in sorted_custodians:
            name = inscriptions[0].get('matched_location', {}).get('name', custodian_id)
            print(f"  {name} ({custodian_id}): {len(inscriptions)} inscriptions")


def main():
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with UNESCO Memory of the World data"
    )
    parser.add_argument('--dry-run', action='store_true',
                        help="Don't save changes, just show what would be done")
    parser.add_argument('--limit', type=int,
                        help="Limit number of files to process")
    parser.add_argument('--country', type=str,
                        help="Filter to specific country code (e.g., NL, BE)")
    parser.add_argument('--refresh-cache', action='store_true',
                        help="Force refresh of cached MoW data")
    parser.add_argument('--stats', action='store_true',
                        help="Show MoW statistics only, don't enrich")
    args = parser.parse_args()

    enricher = UNESCOMoWEnricher(dry_run=args.dry_run)

    if args.refresh_cache:
        enricher.fetch_mow_data(force_refresh=True)
        print("Cache refreshed successfully.")
        return

    if args.stats:
        enricher.show_stats_only()
        return

    enricher.enrich_all(limit=args.limit, country_filter=args.country)


if __name__ == "__main__":
    main()