#!/usr/bin/env python3
"""
UNESCO Memory of the World (MoW) Enrichment Script

Enriches custodian YAML files with UNESCO Memory of the World inscription data.

Unlike ICH (country-based matching), MoW uses Wikidata ID matching to find
custodians that are directly listed as holding institutions for MoW inscriptions.

Data source: Wikidata SPARQL (UNESCO has no MoW API)
- P1435 = heritage designation
- Q16024238 = Memory of the World International Register
- P276 = location (holding institution)

This enables PRECISE matching: if a custodian's Wikidata ID appears as P276
(location) on a MoW inscription, we have a direct link.

Usage:
    python scripts/enrich_unesco_mow.py [--dry-run] [--limit N] [--country CC]
    python scripts/enrich_unesco_mow.py --refresh-cache
    python scripts/enrich_unesco_mow.py --stats
"""

import argparse
import json
import logging
import os
import sys
import urllib.request
import urllib.error
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List, Set

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    import yaml
except ImportError:
    print("ERROR: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
CACHE_DIR = Path(__file__).parent.parent / "data" / "cache"
CACHE_FILE = CACHE_DIR / "unesco_mow_wikidata.json"
CACHE_MAX_AGE_DAYS = 7
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"

# Country-name -> ISO 3166-1 alpha-2 mapping. Module-level constant so the
# dict is built once, not on every _country_to_code() call. Keys are
# lowercase English country labels as returned by the Wikidata label service.
COUNTRY_NAME_TO_CODE: Dict[str, str] = {
    "netherlands": "NL", "belgium": "BE", "germany": "DE", "france": "FR",
    "united kingdom": "GB", "united states of america": "US",
    "united states": "US", "japan": "JP", "china": "CN", "india": "IN",
    "brazil": "BR", "mexico": "MX", "spain": "ES", "italy": "IT",
    "portugal": "PT", "poland": "PL", "austria": "AT", "switzerland": "CH",
    "australia": "AU", "new zealand": "NZ", "canada": "CA",
    "south africa": "ZA", "egypt": "EG", "russia": "RU",
    "russian federation": "RU", "korea": "KR", "republic of korea": "KR",
    "south korea": "KR", "indonesia": "ID", "philippines": "PH",
    "thailand": "TH", "vietnam": "VN", "viet nam": "VN", "peru": "PE",
    "argentina": "AR", "colombia": "CO", "chile": "CL",
    "czech republic": "CZ", "czechia": "CZ", "hungary": "HU", "greece": "GR",
    "turkey": "TR", "iran": "IR", "iran (islamic republic of)": "IR",
    "israel": "IL", "morocco": "MA", "tunisia": "TN", "kenya": "KE",
    "nigeria": "NG", "ethiopia": "ET", "pakistan": "PK", "bangladesh": "BD",
    "malaysia": "MY", "singapore": "SG", "norway": "NO", "sweden": "SE",
    "denmark": "DK", "finland": "FI", "ireland": "IE", "luxembourg": "LU",
    "croatia": "HR", "slovenia": "SI", "slovakia": "SK", "romania": "RO",
    "bulgaria": "BG", "ukraine": "UA", "latvia": "LV", "lithuania": "LT",
    "estonia": "EE",
}

# SPARQL query to get all Memory of the World inscriptions with location (custodian) data
MOW_SPARQL_QUERY = """
SELECT DISTINCT ?item ?itemLabel ?countryLabel ?country ?location ?locationLabel ?inscriptionYear ?desc ?image
WHERE {
  ?item wdt:P1435 wd:Q16024238 .  # Memory of the World International Register
  OPTIONAL { ?item wdt:P17 ?country . }
  OPTIONAL {
    ?item wdt:P276 ?location .  # Critical: This links to custodian institutions!
  }
  OPTIONAL {
    ?item wdt:P571 ?inception .
    BIND(YEAR(?inception) AS ?inscriptionYear)
  }
  OPTIONAL {
    ?item schema:description ?desc .
    FILTER(LANG(?desc) = "en")
  }
  OPTIONAL { ?item wdt:P18 ?image . }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en,nl,de,fr,es,pt" . }
}
ORDER BY ?itemLabel
"""


class UNESCOMoWEnricher:
    """Enriches custodian files with UNESCO Memory of the World data."""

    def __init__(self, dry_run: bool = False):
        """
        Args:
            dry_run: If True, log what would be enriched but never write files.
        """
        self.dry_run = dry_run
        self.mow_data: List[Dict] = []
        # Wikidata ID -> list of inscriptions held by that institution
        self.inscriptions_by_custodian: Dict[str, List[Dict]] = {}
        # ISO country code -> list of inscriptions in that country
        self.inscriptions_by_country: Dict[str, List[Dict]] = {}
        self.stats = {
            "inscriptions_fetched": 0,
            "inscriptions_with_location": 0,
            "unique_custodians_in_mow": 0,
            "countries_covered": 0,
            "custodians_processed": 0,
            "custodians_with_wikidata": 0,
            "custodians_matched_by_wikidata": 0,
            "custodians_enriched": 0,
            "mow_references_added": 0,
            "errors": 0
        }

    def fetch_mow_data(self, force_refresh: bool = False) -> List[Dict]:
        """Fetch MoW data from Wikidata SPARQL or the local JSON cache.

        Args:
            force_refresh: Bypass the cache and always query Wikidata.

        Returns:
            List of inscription dicts (also stored on self.mow_data).

        Raises:
            urllib.error.URLError: If the SPARQL request fails.
        """
        CACHE_DIR.mkdir(parents=True, exist_ok=True)

        # Serve from cache when it exists and is fresh enough
        if CACHE_FILE.exists() and not force_refresh:
            cache_age = datetime.now() - datetime.fromtimestamp(CACHE_FILE.stat().st_mtime)
            if cache_age.days < CACHE_MAX_AGE_DAYS:
                logger.info(f"šŸ“ Loading cached MoW data ({cache_age.days}.{cache_age.seconds//3600} days old)")
                with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                    cache_data = json.load(f)
                self.mow_data = cache_data.get('inscriptions', [])
                logger.info(f"   Loaded {len(self.mow_data)} cached MoW inscriptions")
                # _build_indices also derives the dataset-level stats, so the
                # cached path reports correct numbers (previously left at 0).
                self._build_indices()
                return self.mow_data

        # Fetch from Wikidata SPARQL
        logger.info("šŸŒ Fetching Memory of the World data from Wikidata SPARQL...")
        try:
            # Encode query for URL
            encoded_query = urllib.parse.urlencode({'query': MOW_SPARQL_QUERY})
            url = f"{WIKIDATA_SPARQL_ENDPOINT}?{encoded_query}"
            req = urllib.request.Request(
                url,
                headers={
                    'User-Agent': 'GLAM-Heritage-Enricher/1.0 (glam-data@example.com)',
                    'Accept': 'application/json'
                }
            )
            with urllib.request.urlopen(req, timeout=120) as response:
                result = json.loads(response.read().decode('utf-8'))
        except urllib.error.URLError as e:
            logger.error(f"Failed to fetch MoW data: {e}")
            raise

        # Parse SPARQL results
        bindings = result.get('results', {}).get('bindings', [])

        # Group by item (inscription) since one inscription may have multiple
        # locations; each extra location arrives as an additional binding row.
        inscriptions_map: Dict[str, Dict] = {}
        seen_locations: Dict[str, Set[str]] = {}  # item_id -> location IDs already attached
        for binding in bindings:
            item_uri = binding.get('item', {}).get('value', '')
            item_id = item_uri.split('/')[-1] if item_uri else ''
            if not item_id:
                continue

            if item_id not in inscriptions_map:
                inscriptions_map[item_id] = {
                    'wikidata_id': item_id,
                    'wikidata_url': item_uri,
                    'name': binding.get('itemLabel', {}).get('value', ''),
                    'description': binding.get('desc', {}).get('value', ''),
                    'country': binding.get('countryLabel', {}).get('value', ''),
                    'country_id': self._extract_id(binding.get('country', {}).get('value', '')),
                    'inscription_year': self._parse_year(binding.get('inscriptionYear', {}).get('value', '')),
                    'image_url': binding.get('image', {}).get('value', ''),
                    'locations': []  # Custodian institutions
                }
                seen_locations[item_id] = set()

            # Add location if present (deduplicated via set, not a list scan)
            location_uri = binding.get('location', {}).get('value', '')
            location_id = self._extract_id(location_uri)
            location_label = binding.get('locationLabel', {}).get('value', '')
            if location_id and location_id not in seen_locations[item_id]:
                seen_locations[item_id].add(location_id)
                inscriptions_map[item_id]['locations'].append({
                    'wikidata_id': location_id,
                    'wikidata_url': location_uri,
                    'name': location_label
                })

        self.mow_data = list(inscriptions_map.values())
        with_location = sum(1 for i in self.mow_data if i['locations'])
        logger.info(f"āœ… Fetched {len(self.mow_data)} MoW inscriptions ({with_location} with custodian locations)")

        # Cache the data
        cache_data = {
            'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
            'inscriptions': self.mow_data
        }
        with open(CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=2)
        logger.info(f"   Cached to {CACHE_FILE}")

        self._build_indices()
        return self.mow_data

    def _extract_id(self, uri: str) -> str:
        """Extract Wikidata ID (last URI path segment) from URI."""
        if uri:
            return uri.split('/')[-1]
        return ''

    def _parse_year(self, year_str: str) -> Optional[int]:
        """Parse year from string; return None when empty or non-numeric."""
        if year_str:
            try:
                return int(year_str)
            except ValueError:
                pass
        return None

    def _build_indices(self):
        """Build lookup indices by custodian Wikidata ID and by country.

        Also derives the dataset-level stats (inscriptions_fetched,
        inscriptions_with_location, unique_custodians_in_mow,
        countries_covered) so they are correct for both freshly fetched
        and cache-loaded data.
        """
        self.inscriptions_by_custodian = {}
        self.inscriptions_by_country = {}
        unique_custodians: Set[str] = set()

        for inscription in self.mow_data:
            # Index by custodian Wikidata ID
            for location in inscription.get('locations', []):
                custodian_id = location.get('wikidata_id', '')
                if custodian_id:
                    unique_custodians.add(custodian_id)
                    if custodian_id not in self.inscriptions_by_custodian:
                        self.inscriptions_by_custodian[custodian_id] = []
                    # Add inscription with specific location info
                    inscription_copy = inscription.copy()
                    inscription_copy['matched_location'] = location
                    self.inscriptions_by_custodian[custodian_id].append(inscription_copy)

            # Index by country (using ISO code mapping)
            country_code = self._country_to_code(inscription.get('country', ''))
            if country_code:
                if country_code not in self.inscriptions_by_country:
                    self.inscriptions_by_country[country_code] = []
                self.inscriptions_by_country[country_code].append(inscription)

        self.stats["inscriptions_fetched"] = len(self.mow_data)
        self.stats["inscriptions_with_location"] = sum(
            1 for i in self.mow_data if i.get('locations')
        )
        self.stats["unique_custodians_in_mow"] = len(unique_custodians)
        self.stats["countries_covered"] = len(self.inscriptions_by_country)
        logger.info(f"   Indexed {len(unique_custodians)} unique custodian institutions")
        logger.info(f"   Indexed {len(self.inscriptions_by_country)} countries with MoW inscriptions")

    def _country_to_code(self, country_name: str) -> Optional[str]:
        """Convert an English country name to its ISO alpha-2 code, or None."""
        return COUNTRY_NAME_TO_CODE.get(country_name.lower().strip())

    def get_wikidata_id_from_custodian(self, data: dict) -> Optional[str]:
        """Extract Wikidata ID from custodian data.

        Checks, in order: wikidata_enrichment, original_entry, identifiers.
        Uses `or {}` / `or []` guards because YAML keys may exist with a
        null value, which plain dict.get(key, {}) does not protect against.
        """
        # Try wikidata_enrichment first
        wikidata_id = (data.get('wikidata_enrichment') or {}).get('wikidata_entity_id')
        if wikidata_id:
            return wikidata_id

        # Try original_entry
        wikidata_id = (data.get('original_entry') or {}).get('wikidata_id')
        if wikidata_id:
            return wikidata_id

        # Try identifiers list
        for identifier in (data.get('identifiers') or []):
            if identifier.get('identifier_scheme') == 'Wikidata':
                return identifier.get('identifier_value')

        return None

    def get_country_from_custodian(self, data: dict) -> Optional[str]:
        """Extract ISO country code from custodian data, or None."""
        # Try GHCID first (most reliable); the code is its first two chars
        ghcid = (data.get('ghcid') or {}).get('ghcid_current', '')
        if ghcid and len(ghcid) >= 2:
            return ghcid[:2].upper()

        # Try location_resolution
        loc_res = (data.get('ghcid') or {}).get('location_resolution', {})
        if isinstance(loc_res, dict):
            country = loc_res.get('country_code')
            if country:
                return country.upper()

        return None

    def enrich_custodian(self, file_path: Path) -> bool:
        """Enrich a single custodian YAML file with MoW data.

        Returns True when the custodian matched at least one inscription
        (even in dry-run mode), False otherwise or on error.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                return False

            # Get Wikidata ID
            wikidata_id = self.get_wikidata_id_from_custodian(data)
            if wikidata_id:
                self.stats["custodians_with_wikidata"] += 1

            country_code = self.get_country_from_custodian(data)

            # Look up MoW inscriptions where this custodian is listed as location
            matched_inscriptions = []
            match_method = None

            if wikidata_id and wikidata_id in self.inscriptions_by_custodian:
                matched_inscriptions = self.inscriptions_by_custodian[wikidata_id]
                match_method = "wikidata_id_direct"
                self.stats["custodians_matched_by_wikidata"] += 1

            if not matched_inscriptions:
                return False

            # Get custodian name for logging
            custodian_name = (
                (data.get('custodian_name') or {}).get('claim_value', '') or
                (data.get('wikidata_enrichment') or {}).get('wikidata_label_en', '') or
                (data.get('original_entry') or {}).get('organisatie', '') or
                (data.get('google_maps_enrichment') or {}).get('name', '') or
                file_path.stem
            )

            # Build enrichment data
            mow_enrichment = {
                "match_method": match_method,
                "custodian_wikidata_id": wikidata_id,
                "total_inscriptions_held": len(matched_inscriptions),
                "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
                "inscriptions": []
            }

            # Add inscription details (sorted by year, most recent first)
            sorted_inscriptions = sorted(
                matched_inscriptions,
                key=lambda x: -(x.get('inscription_year') or 0)
            )
            for inscription in sorted_inscriptions[:20]:  # Limit to 20 inscriptions
                desc = inscription.get("description", "")
                if len(desc) > 400:
                    desc = desc[:400] + '...'
                mow_enrichment["inscriptions"].append({
                    "wikidata_id": inscription["wikidata_id"],
                    "name": inscription["name"],
                    "description": desc,
                    "country": inscription.get("country", ""),
                    "inscription_year": inscription.get("inscription_year"),
                    "wikidata_url": inscription.get("wikidata_url", ""),
                    # Generic URL as UNESCO has no item-level URLs
                    "unesco_mow_url": "https://www.unesco.org/en/memory-world",
                    "image_url": inscription.get("image_url", "")
                })

            # Update data
            data['unesco_mow_enrichment'] = mow_enrichment
            self.stats["mow_references_added"] += len(mow_enrichment["inscriptions"])

            # Log
            logger.info(f"šŸ“œ {custodian_name}: {len(mow_enrichment['inscriptions'])} MoW inscriptions (matched by {match_method})")
            for inscription in mow_enrichment["inscriptions"][:3]:
                year_str = f", {inscription['inscription_year']}" if inscription.get('inscription_year') else ""
                name = inscription['name']
                # Only add an ellipsis when the name was actually truncated
                display_name = name[:60] + "..." if len(name) > 60 else name
                logger.info(f"   šŸ“œ {display_name} ({inscription['country']}{year_str})")

            if self.dry_run:
                logger.info("   [DRY RUN - not saving]")
                return True

            # Save updated file
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                          sort_keys=False, width=120)

            self.stats["custodians_enriched"] += 1
            return True

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["errors"] += 1
            return False

    def enrich_all(self, limit: Optional[int] = None, country_filter: Optional[str] = None):
        """Enrich all custodian files.

        Args:
            limit: Process at most this many files (None = all).
            country_filter: Only process files whose name starts with this
                country code (e.g. "NL").
        """
        # Fetch MoW data first
        self.fetch_mow_data()

        # Find all custodian files
        if not CUSTODIAN_DIR.exists():
            logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
            return

        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if country_filter:
            files = [f for f in files if f.name.startswith(country_filter.upper())]
            logger.info(f"Filtering to country: {country_filter.upper()}")
        if limit:
            files = files[:limit]

        logger.info(f"\nšŸ“‚ Processing {len(files)} custodian files...")
        for file_path in files:
            self.stats["custodians_processed"] += 1
            self.enrich_custodian(file_path)

        self.print_stats()

    def print_stats(self):
        """Print enrichment statistics."""
        print("\n" + "=" * 65)
        print("UNESCO MEMORY OF THE WORLD ENRICHMENT STATISTICS")
        print("=" * 65)
        print(f"MoW inscriptions fetched:              {self.stats['inscriptions_fetched']}")
        print(f"Inscriptions with custodian locations: {self.stats['inscriptions_with_location']}")
        print(f"Unique custodian institutions in MoW:  {self.stats['unique_custodians_in_mow']}")
        print(f"Countries with MoW inscriptions:       {self.stats['countries_covered']}")
        print("-" * 65)
        print(f"Custodian files processed:             {self.stats['custodians_processed']}")
        print(f"Custodians with Wikidata ID:           {self.stats['custodians_with_wikidata']}")
        print(f"Custodians matched by Wikidata ID:     {self.stats['custodians_matched_by_wikidata']}")
        print(f"Custodians enriched:                   {self.stats['custodians_enriched']}")
        print(f"Total MoW references added:            {self.stats['mow_references_added']}")
        print(f"Errors:                                {self.stats['errors']}")
        print("=" * 65)

    def show_stats_only(self):
        """Show MoW statistics without enriching any files."""
        self.fetch_mow_data()

        print("\n" + "=" * 65)
        print("UNESCO MEMORY OF THE WORLD STATISTICS (via Wikidata)")
        print("=" * 65)
        print(f"\nTotal inscriptions: {len(self.mow_data)}")
        print(f"Inscriptions with custodian (P276): {self.stats['inscriptions_with_location']}")
        print(f"Unique custodian institutions: {self.stats['unique_custodians_in_mow']}")

        # Top countries
        print(f"\nTop 15 Countries by MoW Inscriptions:")
        sorted_countries = sorted(
            self.inscriptions_by_country.items(),
            key=lambda x: -len(x[1])
        )[:15]
        for country, inscriptions in sorted_countries:
            print(f"  {country}: {len(inscriptions)} inscriptions")

        # Inscriptions by year
        year_counts = {}
        for inscription in self.mow_data:
            year = inscription.get('inscription_year')
            if year:
                year_counts[year] = year_counts.get(year, 0) + 1
        print("\nRecent Inscriptions by Year:")
        for year in sorted(year_counts.keys(), reverse=True)[:10]:
            print(f"  {year}: {year_counts[year]} inscriptions")

        # Sample custodians (institutions holding MoW inscriptions)
        print(f"\nSample Custodian Institutions (first 10 with most inscriptions):")
        sorted_custodians = sorted(
            self.inscriptions_by_custodian.items(),
            key=lambda x: -len(x[1])
        )[:10]
        for custodian_id, inscriptions in sorted_custodians:
            name = inscriptions[0].get('matched_location', {}).get('name', custodian_id)
            print(f"  {name} ({custodian_id}): {len(inscriptions)} inscriptions")


def main():
    """CLI entry point: parse arguments and dispatch to the enricher."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with UNESCO Memory of the World data"
    )
    parser.add_argument('--dry-run', action='store_true',
                        help="Don't save changes, just show what would be done")
    parser.add_argument('--limit', type=int,
                        help="Limit number of files to process")
    parser.add_argument('--country', type=str,
                        help="Filter to specific country code (e.g., NL, BE)")
    parser.add_argument('--refresh-cache', action='store_true',
                        help="Force refresh of cached MoW data")
    parser.add_argument('--stats', action='store_true',
                        help="Show MoW statistics only, don't enrich")
    args = parser.parse_args()

    enricher = UNESCOMoWEnricher(dry_run=args.dry_run)

    if args.refresh_cache:
        enricher.fetch_mow_data(force_refresh=True)
        print("Cache refreshed successfully.")
        return

    if args.stats:
        enricher.show_stats_only()
        return

    enricher.enrich_all(limit=args.limit, country_filter=args.country)


if __name__ == "__main__":
    main()