#!/usr/bin/env python3
"""
UNESCO Intangible Cultural Heritage (ICH) Enrichment Script

Enriches custodian YAML files with UNESCO Intangible Cultural Heritage data.
For each custodian, finds ICH elements inscribed in the same country.

Data source: https://ich.unesco.org/dive/data/graph_en.json

ICH Lists:
- RL: Representative List of the Intangible Cultural Heritage of Humanity
- USL: List of Intangible Cultural Heritage in Need of Urgent Safeguarding
- GSP: Register of Good Safeguarding Practices
- BSP: Best Safeguarding Practices

Usage:
    python scripts/enrich_unesco_ich.py [--dry-run] [--limit N] [--country CC]
    python scripts/enrich_unesco_ich.py --refresh-cache
    python scripts/enrich_unesco_ich.py --stats
"""

import argparse
import json
import logging
import os
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import urllib.request
import urllib.error

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    import yaml
except ImportError:
    # Do not exit at import time; _require_yaml() reports the problem (same
    # message, same exit code) as soon as YAML support is actually needed.
    yaml = None

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
UNESCO_ICH_API = "https://ich.unesco.org/dive/data/graph_en.json"
CACHE_DIR = Path(__file__).parent.parent / "data" / "cache"
CACHE_FILE = CACHE_DIR / "unesco_ich.json"
CACHE_MAX_AGE_DAYS = 7
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"

# ICH List codes
ICH_LISTS = {
    "RL": "Representative List of the Intangible Cultural Heritage of Humanity",
    "USL": "List of Intangible Cultural Heritage in Need of Urgent Safeguarding",
    "GSP": "Register of Good Safeguarding Practices",
    "BSP": "Best Safeguarding Practices"
}

# Country name to ISO code mapping (keys are lower-cased country labels)
COUNTRY_NAME_TO_CODE = {
    "netherlands": "NL", "belgium": "BE", "germany": "DE", "france": "FR",
    "united kingdom": "GB",
    "united kingdom of great britain and northern ireland": "GB",
    "united states of america": "US", "united states": "US",
    "japan": "JP", "china": "CN", "india": "IN", "brazil": "BR",
    "mexico": "MX", "spain": "ES", "italy": "IT", "portugal": "PT",
    "poland": "PL", "czech republic": "CZ", "czechia": "CZ", "austria": "AT",
    "switzerland": "CH", "sweden": "SE", "norway": "NO", "denmark": "DK",
    "finland": "FI", "ireland": "IE", "greece": "GR",
    "turkey": "TR", "türkiye": "TR",
    "russia": "RU", "russian federation": "RU",
    "ukraine": "UA", "romania": "RO", "hungary": "HU", "bulgaria": "BG",
    "croatia": "HR", "serbia": "RS", "slovenia": "SI", "slovakia": "SK",
    "lithuania": "LT", "latvia": "LV", "estonia": "EE", "luxembourg": "LU",
    "malta": "MT", "cyprus": "CY", "iceland": "IS", "albania": "AL",
    "north macedonia": "MK", "montenegro": "ME",
    "bosnia and herzegovina": "BA",
    "republic of korea": "KR", "south korea": "KR", "korea": "KR",
    "democratic people's republic of korea": "KP", "north korea": "KP",
    "viet nam": "VN", "vietnam": "VN",
    "thailand": "TH", "indonesia": "ID", "malaysia": "MY",
    "philippines": "PH", "singapore": "SG", "australia": "AU",
    "new zealand": "NZ", "canada": "CA", "argentina": "AR", "chile": "CL",
    "colombia": "CO", "peru": "PE",
    "venezuela": "VE", "venezuela (bolivarian republic of)": "VE",
    "ecuador": "EC",
    "bolivia": "BO", "bolivia (plurinational state of)": "BO",
    "uruguay": "UY", "paraguay": "PY", "egypt": "EG", "south africa": "ZA",
    "nigeria": "NG", "kenya": "KE", "morocco": "MA", "algeria": "DZ",
    "tunisia": "TN", "saudi arabia": "SA", "united arab emirates": "AE",
    "israel": "IL",
    "iran": "IR", "iran (islamic republic of)": "IR",
    "iraq": "IQ", "pakistan": "PK", "bangladesh": "BD", "afghanistan": "AF",
    "lebanon": "LB",
    "palestine": "PS", "state of palestine": "PS",
    "jordan": "JO",
    "syria": "SY", "syrian arab republic": "SY",
    "yemen": "YE", "oman": "OM", "kuwait": "KW", "qatar": "QA",
    "bahrain": "BH", "azerbaijan": "AZ", "armenia": "AM", "georgia": "GE",
    "uzbekistan": "UZ", "kazakhstan": "KZ", "kyrgyzstan": "KG",
    "tajikistan": "TJ", "turkmenistan": "TM", "mongolia": "MN",
    "cambodia": "KH",
    "lao people's democratic republic": "LA", "laos": "LA",
    "myanmar": "MM", "nepal": "NP", "sri lanka": "LK", "bhutan": "BT",
    "maldives": "MV", "mauritius": "MU", "madagascar": "MG", "senegal": "SN",
    "mali": "ML", "niger": "NE", "burkina faso": "BF", "benin": "BJ",
    "togo": "TG", "ghana": "GH",
    "côte d'ivoire": "CI", "ivory coast": "CI",
    "cameroon": "CM", "ethiopia": "ET", "uganda": "UG",
    "tanzania": "TZ", "united republic of tanzania": "TZ",
    "zambia": "ZM", "zimbabwe": "ZW", "mozambique": "MZ", "malawi": "MW",
    "botswana": "BW", "namibia": "NA", "angola": "AO",
    "democratic republic of the congo": "CD", "congo": "CG",
    "cuba": "CU", "jamaica": "JM", "haiti": "HT", "dominican republic": "DO",
    "puerto rico": "PR", "guatemala": "GT", "honduras": "HN",
    "el salvador": "SV", "nicaragua": "NI", "costa rica": "CR",
    "panama": "PA", "andorra": "AD", "monaco": "MC", "san marino": "SM",
    "vatican": "VA", "holy see": "VA",
    "netherlands (kingdom of the)": "NL",
    "republic of moldova": "MD", "timor-leste": "TL", "cabo verde": "CV",
    "eswatini": "SZ", "micronesia (federated states of)": "FM",
    "brunei darussalam": "BN", "curaçao": "CW",
    "saint kitts and nevis": "KN",
    "saint vincent and the grenadines": "VC",
    "sao tome and principe": "ST", "cook islands": "CK",
    "antigua and barbuda": "AG", "papua new guinea": "PG",
    "liechtenstein": "LI",
}


def _require_yaml() -> None:
    """Exit with the installation hint when PyYAML could not be imported."""
    if yaml is None:
        print("ERROR: PyYAML not installed. Run: pip install pyyaml")
        sys.exit(1)


def _as_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict.

    Guards chained ``.get()`` lookups against keys that are present but hold
    ``None`` or a scalar (``dict.get(key, {})`` only covers missing keys).
    """
    return value if isinstance(value, dict) else {}


def _year_as_int(value) -> int:
    """Coerce an inscription year to int; missing/unparseable values give 0."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


class UNESCOICHEnricher:
    """Enriches custodian files with UNESCO Intangible Cultural Heritage data."""

    def __init__(self, dry_run: bool = False):
        """Create an enricher; with ``dry_run`` no custodian file is written."""
        self.dry_run = dry_run
        self.ich_data: dict = {}
        self.elements_by_country: dict = {}  # country_code -> list of elements
        self.stats = {
            "elements_fetched": 0,
            "countries_covered": 0,
            "custodians_processed": 0,
            "custodians_with_country": 0,
            "custodians_enriched": 0,
            "ich_references_added": 0,
            "errors": 0
        }

    def _count_elements(self) -> int:
        """Return the number of nodes of type ``element`` in the loaded graph."""
        nodes = self.ich_data.get('nodes', {})
        return sum(1 for node in nodes.values() if node.get('type') == 'element')

    def _load_cache(self) -> Optional[dict]:
        """Return the cached graph, or None when it is stale or unreadable."""
        # Compare timezone-aware instants so a DST switch cannot skew the age.
        modified = datetime.fromtimestamp(CACHE_FILE.stat().st_mtime, tz=timezone.utc)
        cache_age = datetime.now(timezone.utc) - modified
        if cache_age.days >= CACHE_MAX_AGE_DAYS:
            return None

        age_days = cache_age.total_seconds() / 86400
        logger.info(f"📁 Loading cached ICH data ({age_days:.1f} days old)")
        try:
            with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError is a ValueError: a truncated or corrupt
            # cache must trigger a refetch rather than abort the run.
            logger.warning(f"Ignoring unreadable cache {CACHE_FILE}: {e}")
            return None
        if not isinstance(cached, dict):
            logger.warning(f"Ignoring cache {CACHE_FILE}: top-level JSON value is not an object")
            return None
        return cached

    def fetch_ich_data(self, force_refresh: bool = False) -> dict:
        """Fetch ICH data from UNESCO API or cache.

        Uses the cache file when it is younger than ``CACHE_MAX_AGE_DAYS`` and
        readable, unless ``force_refresh`` is set. Otherwise downloads the
        graph, rewrites the cache, and rebuilds the country index.

        Returns:
            The loaded graph (also stored in ``self.ich_data``).

        Raises:
            OSError: network failure (``urllib.error.URLError`` is a subclass)
                or failure to write the cache.
            ValueError: the response is not valid UTF-8 JSON or not an object.
        """
        CACHE_DIR.mkdir(parents=True, exist_ok=True)

        # Check cache
        if not force_refresh and CACHE_FILE.exists():
            cached = self._load_cache()
            if cached is not None:
                self.ich_data = cached
                logger.info(f" Loaded {self._count_elements()} cached ICH elements")
                self._build_country_index()
                return self.ich_data

        # Fetch from API
        logger.info("🌍 Fetching Intangible Cultural Heritage data from UNESCO...")
        try:
            req = urllib.request.Request(
                UNESCO_ICH_API,
                headers={'User-Agent': 'GLAM-Heritage-Enricher/1.0'}
            )
            with urllib.request.urlopen(req, timeout=60) as response:
                payload = json.loads(response.read().decode('utf-8'))
            if not isinstance(payload, dict):
                raise ValueError("unexpected response: top-level JSON value is not an object")
        except (OSError, ValueError) as e:
            # URLError and read timeouts are OSErrors; bad JSON/UTF-8 is a
            # ValueError. Log them all, then let the caller decide.
            logger.error(f"Failed to fetch ICH data: {e}")
            raise
        self.ich_data = payload

        logger.info(f"✅ Fetched {self._count_elements()} ICH elements")

        # Cache the data: write to a sibling temp file and rename it into
        # place, so an interrupted write never leaves a truncated cache.
        tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
        tmp_file.write_text(
            json.dumps(self.ich_data, ensure_ascii=False, indent=2),
            encoding='utf-8'
        )
        tmp_file.replace(CACHE_FILE)
        logger.info(f" Cached to {CACHE_FILE}")

        self._build_country_index()
        return self.ich_data

    def _build_country_index(self):
        """Build index of ICH elements by country code.

        Populates ``self.elements_by_country`` and the ``elements_fetched`` /
        ``countries_covered`` statistics from ``self.ich_data``.
        """
        nodes = self.ich_data.get('nodes', {})
        edges = self.ich_data.get('edges', [])

        # Get all country nodes: node_id -> country_code
        country_id_to_code = {}
        unmapped_labels = []
        for node_id, node in nodes.items():
            if node.get('type') != 'country':
                continue
            label = str(node.get('label') or '').lower().strip()
            code = COUNTRY_NAME_TO_CODE.get(label)
            if code:
                country_id_to_code[node_id] = code
            else:
                unmapped_labels.append(label or node_id)

        logger.info(f" Mapped {len(country_id_to_code)} country nodes to ISO codes")
        if unmapped_labels:
            # Elements of these countries cannot be indexed; make that visible.
            logger.warning(
                f" {len(unmapped_labels)} country nodes have no ISO mapping: "
                f"{', '.join(sorted(unmapped_labels))}"
            )

        # Build element -> countries mapping from edges
        # Edge format: {'subject': 'element_2', 'predicate': 'related',
        #               'object': 'country_26', 'weight': 1}
        element_countries = {}
        for edge in edges:
            subject = edge.get('subject') or ''
            obj = edge.get('object') or ''
            # Check if edge connects element to country
            if subject.startswith('element_') and obj.startswith('country_'):
                element_countries.setdefault(subject, []).append(obj)

        # Build country code -> elements index
        index = {}
        element_count = 0
        for node_id, node in nodes.items():
            if node.get('type') != 'element':
                continue
            element_count += 1

            element_data = self._extract_element_data(node_id, node)
            if not element_data:
                continue

            # Duplicate edges, or two country nodes mapping to the same ISO
            # code, must not list the element twice for one country.
            seen_codes = set()
            for country_id in element_countries.get(node_id, []):
                country_code = country_id_to_code.get(country_id)
                if not country_code or country_code in seen_codes:
                    continue
                seen_codes.add(country_code)
                index.setdefault(country_code, []).append(element_data)

        self.elements_by_country = index
        # Recorded here so the statistic is correct for cached data as well.
        self.stats["elements_fetched"] = element_count
        self.stats["countries_covered"] = len(index)
        total_refs = sum(len(v) for v in index.values())
        logger.info(f" Indexed {total_refs} element-country references for {len(index)} countries")

    def _extract_element_data(self, node_id: str, node: dict) -> Optional[dict]:
        """Extract relevant data from an ICH element node.

        Missing or ``None`` ``meta``/``label``/``description`` values are
        treated as empty. Descriptions longer than 500 characters are cut and
        suffixed with ``...``.
        """
        meta = _as_dict(node.get('meta'))

        # Get UNESCO ID from node_id (e.g., "element_123" -> "123")
        unesco_id = node_id.replace('element_', '') if node_id.startswith('element_') else node_id

        ich_list = meta.get('list') or ''
        description = meta.get('description') or ''

        # Truncate description if too long
        if len(description) > 500:
            description = description[:500] + '...'

        return {
            "unesco_ich_id": unesco_id,
            "name": node.get('label') or '',
            "description": description,
            "list_type": ich_list,
            "list_name": ICH_LISTS.get(ich_list, ich_list),
            "inscription_year": meta.get('year'),
            "multinational": meta.get('multinational', False),
            "url": meta.get('link', ''),
            "icon_url": _as_dict(meta.get('icon')).get('small', '')
        }

    def get_country_from_custodian(self, data: dict) -> Optional[str]:
        """Extract country code from custodian data.

        Tries, in order: the first two characters of ``ghcid.ghcid_current``,
        the first ``locations[].country``, ``location_resolution.country_code``
        and ``ghcid.location_resolution.country_code``. Returns the upper-cased
        code, or None when no source yields a non-empty string.
        """
        ghcid_block = _as_dict(data.get('ghcid'))

        # Try GHCID first (most reliable)
        ghcid = ghcid_block.get('ghcid_current')
        if isinstance(ghcid, str) and len(ghcid) >= 2:
            return ghcid[:2].upper()

        # Try location
        locations = data.get('locations')
        if isinstance(locations, list):
            for loc in locations:
                country = _as_dict(loc).get('country')
                if isinstance(country, str) and country:
                    return country.upper()

        # Try location_resolution, then ghcid.location_resolution
        for resolution in (data.get('location_resolution'), ghcid_block.get('location_resolution')):
            country = _as_dict(resolution).get('country_code')
            if isinstance(country, str) and country:
                return country.upper()

        return None

    def _select_top_elements(self, elements: list, limit: int = 10) -> list:
        """Return up to *limit* elements, trimmed for embedding in a custodian.

        Order: Representative List first, Urgent Safeguarding second, all other
        lists last; most recent inscription first within each group.
        Descriptions are cut to 300 characters plus ``...``.
        """
        list_rank = {"RL": 0, "USL": 1}
        ranked = sorted(
            elements,
            key=lambda x: (
                list_rank.get(x['list_type'], 2),
                -_year_as_int(x['inscription_year'])  # tolerate str/None years
            )
        )

        selected = []
        for elem in ranked[:limit]:
            desc = elem.get("description", "")
            if len(desc) > 300:
                desc = desc[:300] + '...'
            selected.append({
                "unesco_ich_id": elem["unesco_ich_id"],
                "name": elem["name"],
                "list_type": elem["list_type"],
                "list_name": elem["list_name"],
                "inscription_year": elem["inscription_year"],
                "multinational": elem["multinational"],
                "url": elem["url"],
                "description": desc
            })
        return selected

    def enrich_custodian(self, file_path: Path) -> bool:
        """Enrich a single custodian file with ICH data.

        Returns True when enrichment data was produced (and saved unless in
        dry-run mode); False when the file is empty, has no country, the
        country has no ICH elements, or processing failed (counted in
        ``stats["errors"]``).
        """
        _require_yaml()
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if not data:
                return False
            if not isinstance(data, dict):
                raise ValueError("top-level YAML value is not a mapping")

            # Get country code
            country_code = self.get_country_from_custodian(data)
            if not country_code:
                return False

            self.stats["custodians_with_country"] += 1

            # Get ICH elements for this country
            elements = self.elements_by_country.get(country_code, [])
            if not elements:
                return False

            # Display name for logging only; a malformed name field must not
            # prevent the enrichment itself.
            custodian_name = (
                _as_dict(data.get('custodian_name')).get('claim_value', '') or
                _as_dict(data.get('original_entry')).get('organisatie', '') or
                _as_dict(data.get('google_maps_enrichment')).get('name', '') or
                file_path.stem
            )

            # Build enrichment data with the top 10 ICH elements for this country
            ich_enrichment = {
                "country_code": country_code,
                "total_elements_in_country": len(elements),
                "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
                "elements": self._select_top_elements(elements, limit=10)
            }

            # Update data
            data['unesco_ich_enrichment'] = ich_enrichment
            self.stats["ich_references_added"] += len(ich_enrichment["elements"])

            # Log
            logger.info(f"🏛️ {custodian_name}: {len(ich_enrichment['elements'])} ICH elements for {country_code}")
            for elem in ich_enrichment["elements"][:3]:
                logger.info(f" 🎭 {elem['name'][:50]}... ({elem['list_type']}, {elem['inscription_year']})")

            if self.dry_run:
                logger.info(" [DRY RUN - not saving]")
                return True

            # Serialize completely before opening the file for writing, so a
            # serialization error cannot leave a truncated custodian file.
            serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False,
                                   sort_keys=False, width=120)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(serialized)

            self.stats["custodians_enriched"] += 1
            return True

        except Exception as e:
            # Per-file boundary: record the failure and continue with the rest.
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["errors"] += 1
            return False

    def enrich_all(self, limit: Optional[int] = None, country_filter: Optional[str] = None):
        """Enrich all custodian files.

        Args:
            limit: maximum number of files to process; None means no limit and
                0 processes no files.
            country_filter: only process files whose name starts with this
                code (compared upper-cased).
        """
        _require_yaml()

        # Fetch ICH data first
        self.fetch_ich_data()

        # Find all custodian files
        if not CUSTODIAN_DIR.exists():
            logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
            return

        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))

        if country_filter:
            prefix = country_filter.upper()
            files = [f for f in files if f.name.startswith(prefix)]
            logger.info(f"Filtering to country: {prefix}")

        if limit is not None:
            # `if limit:` would silently treat an explicit 0 as "no limit".
            files = files[:max(limit, 0)]

        logger.info(f"\n📂 Processing {len(files)} custodian files...")

        for file_path in files:
            self.stats["custodians_processed"] += 1
            self.enrich_custodian(file_path)

        self.print_stats()

    def print_stats(self):
        """Print enrichment statistics."""
        print("\n" + "=" * 60)
        print("UNESCO INTANGIBLE CULTURAL HERITAGE ENRICHMENT STATISTICS")
        print("=" * 60)
        print(f"ICH elements indexed: {self.stats['elements_fetched']}")
        print(f"Countries with ICH data: {self.stats['countries_covered']}")
        print(f"Custodian files processed: {self.stats['custodians_processed']}")
        print(f"Custodians with country code: {self.stats['custodians_with_country']}")
        print(f"Custodians enriched: {self.stats['custodians_enriched']}")
        print(f"Total ICH references added: {self.stats['ich_references_added']}")
        print(f"Errors: {self.stats['errors']}")
        print("=" * 60)

    def show_stats_only(self):
        """Show ICH statistics without enriching."""
        self.fetch_ich_data()

        print("\n" + "=" * 60)
        print("UNESCO INTANGIBLE CULTURAL HERITAGE STATISTICS")
        print("=" * 60)

        nodes = self.ich_data.get('nodes', {})
        element_metas = [
            _as_dict(node.get('meta'))
            for node in nodes.values()
            if node.get('type') == 'element'
        ]

        # Count by type
        type_counts = Counter(node.get('type', 'unknown') for node in nodes.values())
        print("\nNode Types:")
        for t, count in type_counts.most_common():
            print(f" {t}: {count}")

        # Count elements by list type
        list_counts = Counter(meta.get('list', 'unknown') for meta in element_metas)
        print("\nICH Elements by List:")
        for lt, count in list_counts.most_common():
            list_name = ICH_LISTS.get(lt, lt)
            print(f" {lt} ({list_name}): {count}")

        # Top countries
        print(f"\nCountries with ICH Elements: {len(self.elements_by_country)}")
        print("\nTop 15 Countries by ICH Elements:")
        by_size = sorted(self.elements_by_country.items(), key=lambda x: -len(x[1]))
        for country, elements in by_size[:15]:
            print(f" {country}: {len(elements)} elements")

        # Elements by year
        year_counts = Counter(meta.get('year') for meta in element_metas if meta.get('year'))
        print("\nRecent Inscriptions:")
        for year in sorted(year_counts, reverse=True)[:5]:
            print(f" {year}: {year_counts[year]} elements")


def main():
    """Command-line entry point: parse arguments and run the requested mode."""
    # Checked first, so a missing PyYAML stops the script before any other
    # work, as the import-time check did.
    _require_yaml()

    parser = argparse.ArgumentParser(
        description="Enrich custodian files with UNESCO Intangible Cultural Heritage data"
    )
    parser.add_argument('--dry-run', action='store_true',
                        help="Don't save changes, just show what would be done")
    parser.add_argument('--limit', type=int,
                        help="Limit number of files to process")
    parser.add_argument('--country', type=str,
                        help="Filter to specific country code (e.g., NL, BE)")
    parser.add_argument('--refresh-cache', action='store_true',
                        help="Force refresh of cached ICH data")
    parser.add_argument('--stats', action='store_true',
                        help="Show ICH statistics only, don't enrich")

    args = parser.parse_args()

    enricher = UNESCOICHEnricher(dry_run=args.dry_run)

    if args.refresh_cache:
        enricher.fetch_ich_data(force_refresh=True)
        print("Cache refreshed successfully.")
        return

    if args.stats:
        enricher.show_stats_only()
        return

    enricher.enrich_all(limit=args.limit, country_filter=args.country)


if __name__ == "__main__":
    main()