glam/scripts/enrich_unesco_ich.py
2025-12-07 00:26:01 +01:00

632 lines
22 KiB
Python
Executable file

#!/usr/bin/env python3
"""
UNESCO Intangible Cultural Heritage (ICH) Enrichment Script

Enriches custodian YAML files with UNESCO Intangible Cultural Heritage data.
For each custodian, finds ICH elements inscribed in the same country.

Data source: https://ich.unesco.org/dive/data/graph_en.json

ICH Lists:
- RL: Representative List of the Intangible Cultural Heritage of Humanity
- USL: List of Intangible Cultural Heritage in Need of Urgent Safeguarding
- GSP: Register of Good Safeguarding Practices
- BSP: Best Safeguarding Practices

Usage:
    python scripts/enrich_unesco_ich.py [--dry-run] [--limit N] [--country CC]
    python scripts/enrich_unesco_ich.py --refresh-cache
    python scripts/enrich_unesco_ich.py --stats
"""
import argparse
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import urllib.request
import urllib.error
# Add project root to path (the directory two levels above this file is put
# first on sys.path).
sys.path.insert(0, str(Path(__file__).parent.parent))

# PyYAML is required to read and write custodian files; if it is missing,
# print an install hint and exit with status 1 instead of a raw ImportError.
try:
    import yaml
except ImportError:
    print("ERROR: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# Configure logging: INFO level, "timestamp - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Constants
# URL the ICH graph is downloaded from (parsed as JSON by fetch_ich_data).
UNESCO_ICH_API = "https://ich.unesco.org/dive/data/graph_en.json"
# Local cache of the downloaded graph, under <two levels above this file>/data/cache.
CACHE_DIR = Path(__file__).parent.parent / "data" / "cache"
CACHE_FILE = CACHE_DIR / "unesco_ich.json"
# The cache is used only while its age in whole days is below this value.
CACHE_MAX_AGE_DAYS = 7
# Directory whose "*.yaml" files are enriched by enrich_all.
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
# ICH List codes
# Maps the short list code found in an element's metadata to the full list
# name; codes missing from this table are reported verbatim by the callers.
ICH_LISTS = {
    "RL": "Representative List of the Intangible Cultural Heritage of Humanity",
    "USL": "List of Intangible Cultural Heritage in Need of Urgent Safeguarding",
    "GSP": "Register of Good Safeguarding Practices",
    "BSP": "Best Safeguarding Practices"
}
# Country name to ISO code mapping
# Keys must be lowercase: country node labels are lower-cased and stripped
# before being looked up here. Several spellings of the same country
# (short name, formal name, older name) deliberately map to the same
# two-letter code. A country label that is absent from this table cannot be
# matched to any custodian.
COUNTRY_NAME_TO_CODE = {
    "netherlands": "NL",
    "belgium": "BE",
    "germany": "DE",
    "france": "FR",
    "united kingdom": "GB",
    "united kingdom of great britain and northern ireland": "GB",
    "united states of america": "US",
    "united states": "US",
    "japan": "JP",
    "china": "CN",
    "india": "IN",
    "brazil": "BR",
    "mexico": "MX",
    "spain": "ES",
    "italy": "IT",
    "portugal": "PT",
    "poland": "PL",
    "czech republic": "CZ",
    "czechia": "CZ",
    "austria": "AT",
    "switzerland": "CH",
    "sweden": "SE",
    "norway": "NO",
    "denmark": "DK",
    "finland": "FI",
    "ireland": "IE",
    "greece": "GR",
    "turkey": "TR",
    "türkiye": "TR",
    "russia": "RU",
    "russian federation": "RU",
    "ukraine": "UA",
    "romania": "RO",
    "hungary": "HU",
    "bulgaria": "BG",
    "croatia": "HR",
    "serbia": "RS",
    "slovenia": "SI",
    "slovakia": "SK",
    "lithuania": "LT",
    "latvia": "LV",
    "estonia": "EE",
    "luxembourg": "LU",
    "malta": "MT",
    "cyprus": "CY",
    "iceland": "IS",
    "albania": "AL",
    "north macedonia": "MK",
    "montenegro": "ME",
    "bosnia and herzegovina": "BA",
    "republic of korea": "KR",
    "south korea": "KR",
    "korea": "KR",
    "democratic people's republic of korea": "KP",
    "north korea": "KP",
    "viet nam": "VN",
    "vietnam": "VN",
    "thailand": "TH",
    "indonesia": "ID",
    "malaysia": "MY",
    "philippines": "PH",
    "singapore": "SG",
    "australia": "AU",
    "new zealand": "NZ",
    "canada": "CA",
    "argentina": "AR",
    "chile": "CL",
    "colombia": "CO",
    "peru": "PE",
    "venezuela": "VE",
    "venezuela (bolivarian republic of)": "VE",
    "ecuador": "EC",
    "bolivia": "BO",
    "bolivia (plurinational state of)": "BO",
    "uruguay": "UY",
    "paraguay": "PY",
    "egypt": "EG",
    "south africa": "ZA",
    "nigeria": "NG",
    "kenya": "KE",
    "morocco": "MA",
    "algeria": "DZ",
    "tunisia": "TN",
    "saudi arabia": "SA",
    "united arab emirates": "AE",
    "israel": "IL",
    "iran": "IR",
    "iran (islamic republic of)": "IR",
    "iraq": "IQ",
    "pakistan": "PK",
    "bangladesh": "BD",
    "afghanistan": "AF",
    "lebanon": "LB",
    "palestine": "PS",
    "state of palestine": "PS",
    "jordan": "JO",
    "syria": "SY",
    "syrian arab republic": "SY",
    "yemen": "YE",
    "oman": "OM",
    "kuwait": "KW",
    "qatar": "QA",
    "bahrain": "BH",
    "azerbaijan": "AZ",
    "armenia": "AM",
    "georgia": "GE",
    "uzbekistan": "UZ",
    "kazakhstan": "KZ",
    "kyrgyzstan": "KG",
    "tajikistan": "TJ",
    "turkmenistan": "TM",
    "mongolia": "MN",
    "cambodia": "KH",
    "lao people's democratic republic": "LA",
    "laos": "LA",
    "myanmar": "MM",
    "nepal": "NP",
    "sri lanka": "LK",
    "bhutan": "BT",
    "maldives": "MV",
    "mauritius": "MU",
    "madagascar": "MG",
    "senegal": "SN",
    "mali": "ML",
    "niger": "NE",
    "burkina faso": "BF",
    "benin": "BJ",
    "togo": "TG",
    "ghana": "GH",
    "côte d'ivoire": "CI",
    "ivory coast": "CI",
    "cameroon": "CM",
    "ethiopia": "ET",
    "uganda": "UG",
    "tanzania": "TZ",
    "united republic of tanzania": "TZ",
    "zambia": "ZM",
    "zimbabwe": "ZW",
    "mozambique": "MZ",
    "malawi": "MW",
    "botswana": "BW",
    "namibia": "NA",
    "angola": "AO",
    "democratic republic of the congo": "CD",
    "congo": "CG",
    "cuba": "CU",
    "jamaica": "JM",
    "haiti": "HT",
    "dominican republic": "DO",
    "puerto rico": "PR",
    "guatemala": "GT",
    "honduras": "HN",
    "el salvador": "SV",
    "nicaragua": "NI",
    "costa rica": "CR",
    "panama": "PA",
    "andorra": "AD",
    "monaco": "MC",
    "san marino": "SM",
    "vatican": "VA",
    "holy see": "VA",
    "netherlands (kingdom of the)": "NL",
    "republic of moldova": "MD",
    "timor-leste": "TL",
    "cabo verde": "CV",
    "eswatini": "SZ",
    "micronesia (federated states of)": "FM",
    "brunei darussalam": "BN",
    "curaçao": "CW",
    "saint kitts and nevis": "KN",
    "saint vincent and the grenadines": "VC",
    "sao tome and principe": "ST",
    "cook islands": "CK",
    "antigua and barbuda": "AG",
    "papua new guinea": "PG",
    "liechtenstein": "LI",
}
class UNESCOICHEnricher:
    """Enriches custodian files with UNESCO Intangible Cultural Heritage data.

    The enricher loads the UNESCO ICH graph (from the local cache or the
    remote endpoint), indexes its ``element`` nodes by ISO country code, and
    writes the highest-priority elements of a custodian's country into the
    custodian YAML file under the ``unesco_ich_enrichment`` key.

    Attributes:
        dry_run: When true, enrichment is computed and logged but custodian
            files are not rewritten.
        ich_data: The raw graph as loaded from the cache or the API.
        elements_by_country: Country code -> list of extracted element dicts.
        stats: Running counters printed by :meth:`print_stats`.
    """

    # Number of ICH elements written into each custodian file.
    MAX_ELEMENTS_PER_CUSTODIAN = 10
    # Description length limits: in the country index / in the custodian file.
    INDEX_DESCRIPTION_LIMIT = 500
    FILE_DESCRIPTION_LIMIT = 300
    # How many elements (and how much of each name) are echoed to the log.
    LOG_PREVIEW_COUNT = 3
    LOG_NAME_LIMIT = 50

    def __init__(self, dry_run: bool = False):
        """Create an enricher with empty data and zeroed statistics.

        Args:
            dry_run: If true, :meth:`enrich_custodian` never writes files.
        """
        self.dry_run = dry_run
        self.ich_data: dict = {}
        self.elements_by_country: dict = {}  # country_code -> list of elements
        self.stats = {
            "elements_fetched": 0,
            "countries_covered": 0,
            "custodians_processed": 0,
            "custodians_with_country": 0,
            "custodians_enriched": 0,
            "ich_references_added": 0,
            "errors": 0
        }

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _as_dict(value) -> dict:
        """Return *value* if it is a dict, otherwise an empty dict.

        YAML/JSON keys that are present but null load as ``None``; routing
        nested lookups through this helper keeps them from raising.
        """
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _clean_code(value) -> Optional[str]:
        """Return *value* stripped and upper-cased, or None if unusable."""
        if not isinstance(value, str):
            return None
        cleaned = value.strip()
        return cleaned.upper() if cleaned else None

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Cut *text* to *limit* characters, appending '...' only if cut."""
        return text if len(text) <= limit else text[:limit] + '...'

    @staticmethod
    def _year_sort_value(year) -> int:
        """Return *year* as an int for ordering; 0 if missing or non-numeric."""
        try:
            return int(year)
        except (TypeError, ValueError):
            return 0

    def _priority_key(self, element: dict) -> tuple:
        """Sort key: RL first, then USL, then the rest; newest year first."""
        list_type = element['list_type']
        return (
            0 if list_type == 'RL' else 1,
            0 if list_type == 'USL' else 1,
            -self._year_sort_value(element['inscription_year']),
        )

    def _count_elements(self) -> int:
        """Count the nodes of type ``element`` in the loaded graph."""
        nodes = self.ich_data.get('nodes', {})
        return sum(1 for node in nodes.values() if node.get('type') == 'element')

    # ------------------------------------------------------------------
    # Loading and indexing
    # ------------------------------------------------------------------
    def _load_cache(self) -> bool:
        """Populate ``ich_data`` from the cache file when it is usable.

        The cache is used when the file exists, its age in whole days is
        below ``CACHE_MAX_AGE_DAYS`` and it parses as JSON.

        Returns:
            True if the data (and the country index) were loaded from the
            cache, False if the caller has to fetch from the API.
        """
        if not CACHE_FILE.exists():
            return False

        # Compare timezone-aware instants so the age does not depend on the
        # local UTC offset in effect at either moment.
        modified = datetime.fromtimestamp(CACHE_FILE.stat().st_mtime, tz=timezone.utc)
        cache_age = datetime.now(timezone.utc) - modified
        if cache_age.days >= CACHE_MAX_AGE_DAYS:
            return False

        age_in_days = cache_age.total_seconds() / 86400
        logger.info(f"📁 Loading cached ICH data ({age_in_days:.1f} days old)")
        try:
            with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # A damaged cache is not fatal: fall back to a fresh download.
            logger.warning(f"Ignoring unreadable ICH cache {CACHE_FILE}: {e}")
            return False

        self.ich_data = cached
        element_count = self._count_elements()
        # Record the count here as well, otherwise the final statistics
        # report 0 indexed elements whenever the cache was used.
        self.stats["elements_fetched"] = element_count
        logger.info(f" Loaded {element_count} cached ICH elements")
        self._build_country_index()
        return True

    def fetch_ich_data(self, force_refresh: bool = False) -> dict:
        """Fetch ICH data from UNESCO API or cache.

        Args:
            force_refresh: If true, skip the cache and always download.

        Returns:
            The loaded graph (also stored in ``self.ich_data``).

        Raises:
            urllib.error.URLError: If the download fails; it is logged and
                re-raised.
        """
        CACHE_DIR.mkdir(parents=True, exist_ok=True)

        # Check cache
        if not force_refresh and self._load_cache():
            return self.ich_data

        # Fetch from API
        logger.info("🌍 Fetching Intangible Cultural Heritage data from UNESCO...")
        try:
            req = urllib.request.Request(
                UNESCO_ICH_API,
                headers={'User-Agent': 'GLAM-Heritage-Enricher/1.0'}
            )
            with urllib.request.urlopen(req, timeout=60) as response:
                self.ich_data = json.loads(response.read().decode('utf-8'))
        except urllib.error.URLError as e:
            logger.error(f"Failed to fetch ICH data: {e}")
            raise

        # Count elements
        element_count = self._count_elements()
        self.stats["elements_fetched"] = element_count
        logger.info(f"✅ Fetched {element_count} ICH elements")

        # Cache the data. Write to a sibling file and rename it into place so
        # an interrupted write cannot leave a half-written cache behind.
        tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(self.ich_data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, CACHE_FILE)
        logger.info(f" Cached to {CACHE_FILE}")

        self._build_country_index()
        return self.ich_data

    def _build_country_index(self):
        """Build index of ICH elements by country code.

        Country nodes are translated to ISO codes through
        ``COUNTRY_NAME_TO_CODE``; ``element_* -> country_*`` edges then assign
        each element to its countries. The result replaces
        ``self.elements_by_country`` and updates ``stats["countries_covered"]``.
        """
        nodes = self.ich_data.get('nodes', {})
        edges = self.ich_data.get('edges', [])

        # Get all country nodes: node_id -> country_code
        country_id_to_code = {}
        unmapped_labels = set()
        for node_id, node in nodes.items():
            if node.get('type') != 'country':
                continue
            label = (node.get('label') or '').lower().strip()
            code = COUNTRY_NAME_TO_CODE.get(label)
            if code:
                country_id_to_code[node_id] = code
            elif label:
                unmapped_labels.add(label)
        logger.info(f" Mapped {len(country_id_to_code)} country nodes to ISO codes")
        if unmapped_labels:
            # Elements of these countries cannot be attached to any custodian;
            # make that visible instead of dropping them silently.
            logger.warning(
                f" {len(unmapped_labels)} country nodes have no ISO mapping and are skipped"
            )
            logger.debug("Unmapped country labels: %s", ", ".join(sorted(unmapped_labels)))

        # Build element -> countries mapping from edges
        # Edge format: {'subject': 'element_2', 'predicate': 'related', 'object': 'country_26', 'weight': 1}
        element_countries = {}
        for edge in edges:
            subject = edge.get('subject') or ''
            obj = edge.get('object') or ''
            # Check if edge connects element to country
            if subject.startswith('element_') and obj.startswith('country_'):
                element_countries.setdefault(subject, []).append(obj)

        # Build country code -> elements index
        self.elements_by_country = {}
        for node_id, node in nodes.items():
            if node.get('type') != 'element':
                continue
            element_data = self._extract_element_data(node_id, node)
            if not element_data:
                continue
            # A repeated edge, or two country nodes that are aliases of the
            # same ISO code, must not list the element twice for a country.
            seen_codes = set()
            for country_id in element_countries.get(node_id, []):
                country_code = country_id_to_code.get(country_id)
                if not country_code or country_code in seen_codes:
                    continue
                seen_codes.add(country_code)
                self.elements_by_country.setdefault(country_code, []).append(element_data)

        self.stats["countries_covered"] = len(self.elements_by_country)
        total_refs = sum(len(v) for v in self.elements_by_country.values())
        logger.info(f" Indexed {total_refs} element-country references for {len(self.elements_by_country)} countries")

    def _extract_element_data(self, node_id: str, node: dict) -> Optional[dict]:
        """Extract relevant data from an ICH element node.

        Args:
            node_id: Graph id of the node, e.g. ``"element_123"``.
            node: The node dict; its ``label`` and ``meta`` entries are read.

        Returns:
            A flat dict describing the element. Null ``label``, ``meta`` and
            ``description`` values are treated as empty.
        """
        meta = self._as_dict(node.get('meta'))

        # Get UNESCO ID from node_id (e.g., "element_123" -> "123"); only the
        # leading prefix is removed.
        prefix = 'element_'
        unesco_id = node_id[len(prefix):] if node_id.startswith(prefix) else node_id

        ich_list = meta.get('list', '')
        description = self._truncate(
            meta.get('description') or '', self.INDEX_DESCRIPTION_LIMIT
        )

        return {
            "unesco_ich_id": unesco_id,
            "name": node.get('label') or '',
            "description": description,
            "list_type": ich_list,
            "list_name": ICH_LISTS.get(ich_list, ich_list),
            "inscription_year": meta.get('year'),
            "multinational": meta.get('multinational', False),
            "url": meta.get('link', ''),
            "icon_url": self._as_dict(meta.get('icon')).get('small', '')
        }

    # ------------------------------------------------------------------
    # Custodian handling
    # ------------------------------------------------------------------
    def get_country_from_custodian(self, data: dict) -> Optional[str]:
        """Extract country code from custodian data.

        Sources are tried in this order and the first usable one wins:
        the first two characters of ``ghcid.ghcid_current``, the ``country``
        of the first location that has one, ``location_resolution.country_code``
        and finally ``ghcid.location_resolution.country_code``. Sections that
        are null or of an unexpected type are skipped rather than raising.

        Args:
            data: Parsed custodian document.

        Returns:
            The upper-cased country value, or None if no source provides one.
        """
        if not isinstance(data, dict):
            return None
        ghcid_section = self._as_dict(data.get('ghcid'))

        # Try GHCID first (most reliable)
        ghcid = self._clean_code(ghcid_section.get('ghcid_current'))
        if ghcid and len(ghcid) >= 2:
            return ghcid[:2]

        # Try location
        locations = data.get('locations')
        if isinstance(locations, list):
            for loc in locations:
                country = self._clean_code(self._as_dict(loc).get('country'))
                if country:
                    return country

        # Try location_resolution, then ghcid.location_resolution
        for resolution in (data.get('location_resolution'),
                           ghcid_section.get('location_resolution')):
            country = self._clean_code(self._as_dict(resolution).get('country_code'))
            if country:
                return country

        return None

    def enrich_custodian(self, file_path: Path) -> bool:
        """Enrich a single custodian file with ICH data.

        Args:
            file_path: Path of the custodian YAML file.

        Returns:
            True if enrichment data was produced (and, unless ``dry_run`` is
            set, written back); False if the file is empty, has no country,
            its country has no indexed elements, or processing failed.
            Failures are logged and counted in ``stats["errors"]``.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                return False

            # Get country code
            country_code = self.get_country_from_custodian(data)
            if not country_code:
                return False
            self.stats["custodians_with_country"] += 1

            # Get ICH elements for this country
            elements = self.elements_by_country.get(country_code, [])
            if not elements:
                return False

            # Display name for the log; null sections are skipped.
            custodian_name = (
                self._as_dict(data.get('custodian_name')).get('claim_value')
                or self._as_dict(data.get('original_entry')).get('organisatie')
                or self._as_dict(data.get('google_maps_enrichment')).get('name')
                or file_path.stem
            )

            # Add the top ICH elements for this country (RL, then USL, then
            # the remaining lists; most recent inscription first within each).
            ranked = sorted(elements, key=self._priority_key)
            selected = [
                {
                    "unesco_ich_id": elem["unesco_ich_id"],
                    "name": elem["name"],
                    "list_type": elem["list_type"],
                    "list_name": elem["list_name"],
                    "inscription_year": elem["inscription_year"],
                    "multinational": elem["multinational"],
                    "url": elem["url"],
                    "description": self._truncate(
                        elem.get("description") or "", self.FILE_DESCRIPTION_LIMIT
                    )
                }
                for elem in ranked[:self.MAX_ELEMENTS_PER_CUSTODIAN]
            ]
            ich_enrichment = {
                "country_code": country_code,
                "total_elements_in_country": len(elements),
                "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
                "elements": selected
            }

            # Update data
            data['unesco_ich_enrichment'] = ich_enrichment
            self.stats["ich_references_added"] += len(selected)

            # Log
            logger.info(f"🏛️ {custodian_name}: {len(selected)} ICH elements for {country_code}")
            for elem in selected[:self.LOG_PREVIEW_COUNT]:
                shown_name = self._truncate(elem['name'], self.LOG_NAME_LIMIT)
                logger.info(f" 🎭 {shown_name} ({elem['list_type']}, {elem['inscription_year']})")

            if self.dry_run:
                logger.info(" [DRY RUN - not saving]")
                return True

            # Serialise before opening the file for writing: if dumping fails,
            # the existing file must not already have been truncated.
            serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False,
                                   sort_keys=False, width=120)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(serialized)

            self.stats["custodians_enriched"] += 1
            return True
        except Exception as e:
            # Per-file boundary: one bad file must not abort the whole run.
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["errors"] += 1
            return False

    def enrich_all(self, limit: Optional[int] = None, country_filter: Optional[str] = None):
        """Enrich all custodian files.

        Args:
            limit: Maximum number of files to process; None means no limit.
                Zero or a negative value processes no files.
            country_filter: If given, only files whose name starts with this
                value (upper-cased) are processed.
        """
        # Fetch ICH data first
        self.fetch_ich_data()

        # Find all custodian files
        if not CUSTODIAN_DIR.exists():
            logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
            return

        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if country_filter:
            prefix = country_filter.upper()
            files = [f for f in files if f.name.startswith(prefix)]
            logger.info(f"Filtering to country: {prefix}")
        if limit is not None:
            # "is not None" so that 0 really means zero files; the clamp keeps
            # a negative value from slicing relative to the end of the list.
            files = files[:max(limit, 0)]

        logger.info(f"\n📂 Processing {len(files)} custodian files...")
        for file_path in files:
            self.stats["custodians_processed"] += 1
            self.enrich_custodian(file_path)

        self.print_stats()

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def print_stats(self):
        """Print enrichment statistics."""
        print("\n" + "=" * 60)
        print("UNESCO INTANGIBLE CULTURAL HERITAGE ENRICHMENT STATISTICS")
        print("=" * 60)
        print(f"ICH elements indexed: {self.stats['elements_fetched']}")
        print(f"Countries with ICH data: {self.stats['countries_covered']}")
        print(f"Custodian files processed: {self.stats['custodians_processed']}")
        print(f"Custodians with country code: {self.stats['custodians_with_country']}")
        print(f"Custodians enriched: {self.stats['custodians_enriched']}")
        print(f"Total ICH references added: {self.stats['ich_references_added']}")
        print(f"Errors: {self.stats['errors']}")
        print("=" * 60)

    def show_stats_only(self):
        """Show ICH statistics without enriching.

        Loads the graph, then prints node counts by type, element counts by
        list, the 15 countries with the most indexed elements and the five
        most recent inscription years.
        """
        self.fetch_ich_data()

        print("\n" + "=" * 60)
        print("UNESCO INTANGIBLE CULTURAL HERITAGE STATISTICS")
        print("=" * 60)

        nodes = self.ich_data.get('nodes', {})

        # One pass collects all three tallies; dicts keep first-seen order,
        # which is what breaks ties in the sorted listings below.
        type_counts = {}
        list_counts = {}
        year_counts = {}
        for node in nodes.values():
            node_type = node.get('type', 'unknown')
            type_counts[node_type] = type_counts.get(node_type, 0) + 1
            if node_type != 'element':
                continue
            meta = self._as_dict(node.get('meta'))
            list_type = meta.get('list', 'unknown')
            list_counts[list_type] = list_counts.get(list_type, 0) + 1
            year = meta.get('year')
            if year:
                year_counts[year] = year_counts.get(year, 0) + 1

        # Count by type
        print("\nNode Types:")
        for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
            print(f" {t}: {count}")

        # Count elements by list type
        print("\nICH Elements by List:")
        for lt, count in sorted(list_counts.items(), key=lambda x: -x[1]):
            list_name = ICH_LISTS.get(lt, lt)
            print(f" {lt} ({list_name}): {count}")

        # Top countries
        print(f"\nCountries with ICH Elements: {len(self.elements_by_country)}")
        print("\nTop 15 Countries by ICH Elements:")
        for country, elements in sorted(self.elements_by_country.items(),
                                        key=lambda x: -len(x[1]))[:15]:
            print(f" {country}: {len(elements)} elements")

        # Elements by year; ordering by numeric value keeps the sort from
        # failing if the data mixes string and integer years.
        print("\nRecent Inscriptions:")
        for year in sorted(year_counts, key=self._year_sort_value, reverse=True)[:5]:
            print(f" {year}: {year_counts[year]} elements")
def main():
    """Command-line entry point.

    Parses the options, builds an enricher and runs exactly one mode:
    cache refresh, statistics only, or full enrichment.
    """
    cli = argparse.ArgumentParser(
        description="Enrich custodian files with UNESCO Intangible Cultural Heritage data"
    )
    cli.add_argument(
        '--dry-run',
        action='store_true',
        help="Don't save changes, just show what would be done",
    )
    cli.add_argument(
        '--limit',
        type=int,
        help="Limit number of files to process",
    )
    cli.add_argument(
        '--country',
        type=str,
        help="Filter to specific country code (e.g., NL, BE)",
    )
    cli.add_argument(
        '--refresh-cache',
        action='store_true',
        help="Force refresh of cached ICH data",
    )
    cli.add_argument(
        '--stats',
        action='store_true',
        help="Show ICH statistics only, don't enrich",
    )
    options = cli.parse_args()

    enricher = UNESCOICHEnricher(dry_run=options.dry_run)

    # --refresh-cache takes precedence over --stats; enrichment is the default.
    if options.refresh_cache:
        enricher.fetch_ich_data(force_refresh=True)
        print("Cache refreshed successfully.")
    elif options.stats:
        enricher.show_stats_only()
    else:
        enricher.enrich_all(limit=options.limit, country_filter=options.country)


if __name__ == "__main__":
    main()