#!/usr/bin/env python3
"""
Enrich UNESCO MoW custodian files with proper CustodianName data.

This script:
1. Loads multilingual labels from Wikidata cache
2. Determines the appropriate emic (local language) name for each custodian
3. Updates custodian YAML files with:
   - custodian_name.emic_name (local language name)
   - custodian_name.name_language (ISO 639-1 code)
   - custodian_name.standardized_name (same as emic_name for now)
4. Regenerates abbreviations from local language names if different
5. Updates GHCIDs and maintains history for changed abbreviations

Per AGENTS.md: The abbreviation and optional snake_case name suffix should be
derived from the emic name in the institution's official local/national
language.
"""

import json
import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# NOTE: PyYAML is imported lazily inside load_custodian_file() /
# save_custodian_file() so the pure helper functions in this module can be
# imported and tested without PyYAML installed.

# Country to primary official language(s) mapping.
# Format: country_name -> (primary_lang, fallback_langs)
# For multilingual countries, we'll check if the institution has a label in
# any official language.
COUNTRY_LANGUAGE_MAP: Dict[str, Tuple[str, List[str]]] = {
    # Europe - Western
    'Germany': ('de', []),
    'Austria': ('de', []),
    'France': ('fr', []),
    'Netherlands': ('nl', []),
    'Belgium': ('nl', ['fr', 'de']),  # Check all three official languages
    'Luxembourg': ('lb', ['fr', 'de']),
    'Switzerland': ('de', ['fr', 'it', 'rm']),  # Check all four national languages
    'United Kingdom': ('en', []),
    'Ireland': ('en', ['ga']),
    # Europe - Northern
    'Norway': ('nb', ['nn', 'no']),  # Bokmål preferred, Nynorsk fallback
    'Sweden': ('sv', []),
    'Denmark': ('da', []),
    'Finland': ('fi', ['sv']),  # Swedish is also official
    'Iceland': ('is', []),
    # Europe - Southern
    'Spain': ('es', ['ca', 'eu', 'gl']),  # Regional languages
    'Portugal': ('pt', []),
    'Italy': ('it', []),
    'Greece': ('el', []),
    'Malta': ('mt', ['en']),
    'Cyprus': ('el', ['tr']),
    # Europe - Central/Eastern
    'Poland': ('pl', []),
    'Czech Republic': ('cs', []),
    'Czechia': ('cs', []),
    'Slovakia': ('sk', []),
    'Hungary': ('hu', []),
    'Slovenia': ('sl', []),
    'Croatia': ('hr', []),
    'Serbia': ('sr', []),
    'Bosnia and Herzegovina': ('bs', ['hr', 'sr']),
    'North Macedonia': ('mk', []),
    'Albania': ('sq', []),
    'Bulgaria': ('bg', []),
    'Romania': ('ro', []),
    'Moldova': ('ro', []),
    'Ukraine': ('uk', []),
    'Belarus': ('be', ['ru']),
    'Russia': ('ru', []),
    'Estonia': ('et', []),
    'Latvia': ('lv', []),
    'Lithuania': ('lt', []),
    # Americas
    'United States': ('en', []),
    'Canada': ('en', ['fr']),
    'Mexico': ('es', []),
    'Brazil': ('pt', []),
    'Argentina': ('es', []),
    'Chile': ('es', []),
    'Colombia': ('es', []),
    'Peru': ('es', []),
    'Venezuela': ('es', []),
    'Ecuador': ('es', []),
    'Bolivia': ('es', []),
    'Paraguay': ('es', ['gn']),
    'Uruguay': ('es', []),
    'Cuba': ('es', []),
    'Dominican Republic': ('es', []),
    'Puerto Rico': ('es', ['en']),
    'Costa Rica': ('es', []),
    'Panama': ('es', []),
    'Guatemala': ('es', []),
    'Honduras': ('es', []),
    'El Salvador': ('es', []),
    'Nicaragua': ('es', []),
    'Jamaica': ('en', []),
    'Trinidad and Tobago': ('en', []),
    'Barbados': ('en', []),
    'Suriname': ('nl', []),
    'Guyana': ('en', []),
    # Asia - East
    'Japan': ('ja', []),
    "People's Republic of China": ('zh', []),
    'China': ('zh', []),
    'Taiwan': ('zh', []),
    'South Korea': ('ko', []),
    'North Korea': ('ko', []),
    'Mongolia': ('mn', []),
    # Asia - Southeast
    'Vietnam': ('vi', []),
    'Thailand': ('th', []),
    'Cambodia': ('km', []),
    'Laos': ('lo', []),
    'Myanmar': ('my', []),
    'Malaysia': ('ms', []),
    'Singapore': ('en', ['zh', 'ms', 'ta']),
    'Indonesia': ('id', []),
    'Philippines': ('tl', ['en']),
    'Brunei': ('ms', []),
    'East Timor': ('pt', ['tet']),
    'Timor-Leste': ('pt', ['tet']),
    # Asia - South
    'India': ('hi', ['en', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'or']),
    'Pakistan': ('ur', ['en']),
    'Bangladesh': ('bn', []),
    'Sri Lanka': ('si', ['ta']),
    'Nepal': ('ne', []),
    'Bhutan': ('dz', []),
    'Maldives': ('dv', []),
    # Asia - Central
    'Kazakhstan': ('kk', ['ru']),
    'Uzbekistan': ('uz', []),
    'Turkmenistan': ('tk', []),
    'Kyrgyzstan': ('ky', ['ru']),
    'Tajikistan': ('tg', []),
    'Afghanistan': ('ps', ['fa']),
    # Asia - West / Middle East
    'Turkey': ('tr', []),
    'Iran': ('fa', []),
    'Iraq': ('ar', ['ku']),
    'Syria': ('ar', []),
    'Lebanon': ('ar', []),
    'Jordan': ('ar', []),
    'Israel': ('he', ['ar']),
    'Palestine': ('ar', []),
    'Saudi Arabia': ('ar', []),
    'United Arab Emirates': ('ar', []),
    'Kuwait': ('ar', []),
    'Qatar': ('ar', []),
    'Bahrain': ('ar', []),
    'Oman': ('ar', []),
    'Yemen': ('ar', []),
    'Georgia': ('ka', []),
    'Armenia': ('hy', []),
    'Azerbaijan': ('az', []),
    # Africa - North
    'Egypt': ('ar', []),
    'Libya': ('ar', []),
    'Tunisia': ('ar', ['fr']),
    'Algeria': ('ar', ['fr']),
    'Morocco': ('ar', ['fr']),
    # Africa - West
    'Nigeria': ('en', []),
    'Ghana': ('en', []),
    'Senegal': ('fr', []),
    'Ivory Coast': ('fr', []),
    "Côte d'Ivoire": ('fr', []),
    'Mali': ('fr', []),
    'Burkina Faso': ('fr', []),
    'Niger': ('fr', []),
    'Benin': ('fr', []),
    'Togo': ('fr', []),
    'Guinea': ('fr', []),
    'Sierra Leone': ('en', []),
    'Liberia': ('en', []),
    'Mauritania': ('ar', ['fr']),
    'Cape Verde': ('pt', []),
    'Gambia': ('en', []),
    # Africa - East
    'Kenya': ('sw', ['en']),
    'Tanzania': ('sw', ['en']),
    'Uganda': ('en', ['sw']),
    'Rwanda': ('rw', ['fr', 'en']),
    'Burundi': ('rn', ['fr']),
    'Ethiopia': ('am', []),
    'Eritrea': ('ti', ['ar']),
    'Somalia': ('so', ['ar']),
    'Djibouti': ('fr', ['ar']),
    'Madagascar': ('mg', ['fr']),
    'Mauritius': ('en', ['fr']),
    'Seychelles': ('en', ['fr']),
    # Africa - Central
    'Democratic Republic of the Congo': ('fr', []),
    'Republic of the Congo': ('fr', []),
    'Central African Republic': ('fr', []),
    'Chad': ('fr', ['ar']),
    'Cameroon': ('fr', ['en']),
    'Gabon': ('fr', []),
    'Equatorial Guinea': ('es', ['fr', 'pt']),
    # Africa - Southern
    'South Africa': ('en', ['af', 'zu', 'xh']),
    'Namibia': ('en', ['de', 'af']),
    'Botswana': ('en', ['tn']),
    'Zimbabwe': ('en', ['sn', 'nd']),
    'Zambia': ('en', []),
    'Malawi': ('en', []),
    'Mozambique': ('pt', []),
    'Angola': ('pt', []),
    'Lesotho': ('en', ['st']),
    'Eswatini': ('en', ['ss']),
    # Oceania
    'Australia': ('en', []),
    'New Zealand': ('en', ['mi']),
    'Papua New Guinea': ('en', ['tpi', 'ho']),
    'Fiji': ('en', ['fj', 'hi']),
    'Vanuatu': ('en', ['fr', 'bi']),
    'Samoa': ('sm', ['en']),
    'Tonga': ('to', ['en']),
    'Solomon Islands': ('en', []),
    'Kiribati': ('en', ['gil']),
    'Micronesia': ('en', []),
    'Palau': ('en', ['pau']),
    # Caribbean
    'Haiti': ('ht', ['fr']),
    'Bahamas': ('en', []),
    'Curaçao': ('nl', ['pap']),
    'Aruba': ('nl', ['pap']),
    # Default fallback
    'Unknown': ('en', []),
}

# Articles/prepositions/conjunctions to ignore when abbreviating, by language.
# Hoisted to module level so the set is built once, not on every call.
_SKIP_WORDS = frozenset({
    # Dutch
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    "'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
    'door', 'en', 'of',
    # English
    'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by',
    'as', 'under', 'and', 'or', 'but',
    # French
    'le', 'la', 'les', 'un', 'une', 'des', 'du', 'à', 'au', 'aux', 'dans',
    'sur', 'sous', 'pour', 'par', 'avec', "l'", "d'", 'et', 'ou',
    # German
    'der', 'die', 'das', 'dem', 'den', 'ein', 'eine', 'einer', 'einem',
    'einen', 'von', 'zu', 'für', 'bei', 'nach', 'aus', 'vor', 'über',
    'unter', 'durch', 'und', 'oder',
    # Spanish
    'el', 'los', 'las', 'unos', 'unas', 'del', 'al', 'con', 'por', 'para',
    'sobre', 'bajo', 'y', 'o', 'e', 'u',
    # Portuguese
    'o', 'os', 'as', 'um', 'uma', 'uns', 'umas', 'do', 'da', 'dos', 'das',
    'em', 'no', 'na', 'nos', 'nas', 'com', 'sob',
    # Italian
    'il', 'lo', 'gli', 'uno', 'di', 'dello', 'della', 'dei', 'degli',
    'delle', 'allo', 'alla', 'ai', 'agli', 'alle', 'dal', 'dallo', 'dalla',
    'dai', 'dagli', 'dalle', 'nel', 'nello', 'nella', 'nei', 'negli',
    'nelle', 'sul', 'sullo', 'sulla', 'sui', 'sugli', 'sulle', 'per', 'tra',
    'fra', 'ed', 'od',
    # Russian (transliterated)
    'i', 'v', 'na', 'pri',
})

# Pre-compiled punctuation strippers (hot path: called once per word).
# Both straight and curly apostrophes are stripped — Wikidata labels often
# contain the typographic forms.
_WORD_PUNCT_RE = re.compile(r"['\u2018\u2019`\",.:;!?()[\]{}]")
_ABBREV_PUNCT_RE = re.compile(r"['\u2018\u2019`\",.:;!?()[\]{}&/\\+@#$%*|=<>~^_-]")
_NUMBER_RANGE_RE = re.compile(r'^\d+-\d+$')
_NON_ASCII_LETTER_RE = re.compile(r'[^A-Za-z]')


def normalize_diacritics(text: str) -> str:
    """Return *text* with combining marks removed (e.g. 'é' -> 'e').

    Uses NFD decomposition, then drops every character in Unicode
    category 'Mn' (nonspacing mark).
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


def get_significant_words(text: str) -> List[str]:
    """Extract significant words from a name, skipping articles/prepositions.

    Words are lower-cased and punctuation-stripped only for the skip test;
    the returned words keep their original form. Pure numbers and numeric
    ranges like '1939-1945' are also skipped.
    """
    significant: List[str] = []
    for word in text.split():
        clean_word = _WORD_PUNCT_RE.sub('', word.lower())
        if not clean_word or clean_word in _SKIP_WORDS:
            continue
        if clean_word.isdigit() or _NUMBER_RANGE_RE.match(clean_word):
            continue
        significant.append(word)
    return significant


def generate_abbreviation(name: str, max_length: int = 10) -> str:
    """Generate an uppercase ASCII abbreviation from the emic *name*.

    Takes the first letter of each significant word (diacritics folded to
    ASCII), capped at *max_length* characters. Falls back to the first
    letters of the whole name when fewer than two usable initials are found.
    """
    significant_words = get_significant_words(name)
    if not significant_words:
        # No significant words at all — fall back to the first three words.
        significant_words = name.split()[:3]

    abbrev = ''
    for word in significant_words:
        clean = _ABBREV_PUNCT_RE.sub('', word)
        if clean:
            first_letter = normalize_diacritics(clean[0]).upper()
            if first_letter.isalpha():
                abbrev += first_letter

    if len(abbrev) < 2:
        # Too short (name was all digits/punctuation/non-Latin): use the
        # first ASCII letters of the diacritic-folded name instead.
        clean_name = _NON_ASCII_LETTER_RE.sub('', normalize_diacritics(name))
        abbrev = clean_name[:3].upper()

    return abbrev[:max_length]


def get_emic_name(custodian: Dict, country: str, labels: Dict[str, str]) -> Tuple[str, str]:
    """Determine the appropriate emic (local language) name for a custodian.

    Tries the country's primary language, then its official fallbacks, then
    English, then any available label, and finally the custodian's original
    ``name_en``.

    Returns:
        (emic_name, language_code)
    """
    if country in COUNTRY_LANGUAGE_MAP:
        primary_lang, fallback_langs = COUNTRY_LANGUAGE_MAP[country]
        all_langs = [primary_lang] + fallback_langs
    else:
        # Unknown country — default to English
        all_langs = ['en']

    # Try each official language in priority order
    for lang in all_langs:
        if lang in labels:
            return labels[lang], lang

    # Fallback to English if available
    if 'en' in labels:
        return labels['en'], 'en'

    # Ultimate fallback: first available label
    if labels:
        first_lang = next(iter(labels))
        return labels[first_lang], first_lang

    # No labels at all — use original name
    return custodian.get('name_en', 'Unknown'), 'en'


def load_custodian_file(filepath: Path) -> Optional[Dict]:
    """Load a custodian YAML file; return None (and log) on any error."""
    import yaml  # deferred so pure helpers don't require PyYAML

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None


def save_custodian_file(filepath: Path, data: Dict):
    """Save a custodian YAML file (UTF-8, key order preserved)."""
    import yaml  # deferred so pure helpers don't require PyYAML

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)


def _build_wikidata_index(custodian_dir: Path) -> Dict[str, Path]:
    """Map Wikidata QID -> custodian file path in one pass over the directory.

    Replaces the per-custodian directory rescan (O(files x custodians) YAML
    parses) with a single O(files) pass. First file wins on duplicate QIDs,
    matching the original first-match scan order.
    """
    index: Dict[str, Path] = {}
    for filepath in custodian_dir.glob('*.yaml'):
        data = load_custodian_file(filepath)
        if not data:
            continue
        # `or {}` guards against keys present with a null YAML value.
        qid = (data.get('original_entry') or {}).get('wikidata_id') or \
              (data.get('wikidata_enrichment') or {}).get('wikidata_entity_id')
        if qid and qid not in index:
            index[qid] = filepath
    return index


def main():
    """Enrich every UNESCO MoW custodian YAML file with emic-name data."""
    # Paths
    project_root = Path(__file__).parent.parent
    custodian_dir = project_root / 'data' / 'custodian'
    cache_dir = project_root / 'data' / 'cache'

    # Load data
    print("Loading multilingual labels...")
    with open(cache_dir / 'unesco_mow_multilingual_labels.json', 'r',
              encoding='utf-8') as f:
        labels_data = json.load(f)
    wikidata_labels = labels_data['custodians']

    print("Loading original custodian info...")
    with open(cache_dir / 'unesco_mow_custodians.json', 'r',
              encoding='utf-8') as f:
        custodians_data = json.load(f)
    original_custodians = {c['wikidata_id']: c
                           for c in custodians_data['custodians']}

    print(f"Processing {len(original_custodians)} UNESCO MoW custodians...\n")

    # Statistics
    stats = {
        'total': 0,
        'updated': 0,
        'abbreviation_changed': 0,
        'ghcid_changed': 0,  # reserved: GHCID rewrites are not applied yet
        'not_found': 0,
        'errors': 0,
        'already_enriched': 0,
    }

    # Track changes for reporting
    changes = []
    timestamp = datetime.now(timezone.utc).isoformat()

    # Index the directory once instead of re-scanning it per custodian.
    file_index = _build_wikidata_index(custodian_dir)

    for qid, custodian in original_custodians.items():
        stats['total'] += 1

        custodian_file = file_index.get(qid)
        if not custodian_file:
            stats['not_found'] += 1
            continue

        # Load full custodian data
        data = load_custodian_file(custodian_file)
        if not data:
            stats['errors'] += 1
            continue

        # Get labels for this custodian
        labels_info = wikidata_labels.get(qid, {})
        labels = labels_info.get('labels', {})
        if not labels:
            print(f"  No labels found for {qid}")
            continue

        # Determine emic name
        country = custodian.get('country', 'Unknown')
        emic_name, lang_code = get_emic_name(custodian, country, labels)

        # Current values
        current_name = data.get('custodian_name', {}).get('claim_value', '')
        current_emic = data.get('custodian_name', {}).get('emic_name', '')

        # Check if already enriched with emic_name
        if current_emic and current_emic == emic_name:
            stats['already_enriched'] += 1
            continue

        # Generate abbreviation from emic name
        new_abbrev = generate_abbreviation(emic_name)

        # Get current abbreviation from GHCID (last '-'-separated segment)
        current_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
        current_abbrev = current_ghcid.split('-')[-1] if current_ghcid else ''

        # Update custodian_name
        if 'custodian_name' not in data:
            data['custodian_name'] = {}
        data['custodian_name']['emic_name'] = emic_name
        data['custodian_name']['name_language'] = lang_code
        data['custodian_name']['standardized_name'] = emic_name

        # Keep original English name if different
        if current_name and current_name != emic_name:
            if 'alternative_names' not in data['custodian_name']:
                data['custodian_name']['alternative_names'] = []
            existing_names = [n.get('name') if isinstance(n, dict) else n
                              for n in data['custodian_name']['alternative_names']]
            if current_name not in existing_names:
                data['custodian_name']['alternative_names'].append({
                    'name': current_name,
                    'language': 'en',
                    'source': 'wikidata'
                })

        # Track change
        change_info = {
            'wikidata_id': qid,
            'file': custodian_file.name,
            'country': country,
            'old_name': current_name,
            'new_emic_name': emic_name,
            'language': lang_code,
            'old_abbrev': current_abbrev,
            'new_abbrev': new_abbrev,
        }

        # Check if abbreviation changed
        if new_abbrev != current_abbrev and current_abbrev:
            stats['abbreviation_changed'] += 1
            change_info['abbrev_changed'] = True
            # TODO: For now, we don't update GHCID - that requires more
            # careful handling with collision detection. Just log the change.
            print(f"  ABBREV CHANGE: {custodian_file.name}")
            print(f"    {country}: {current_name}")
            print(f"    Emic ({lang_code}): {emic_name}")
            print(f"    Abbrev: {current_abbrev} → {new_abbrev}")

        changes.append(change_info)

        # Save updated file
        save_custodian_file(custodian_file, data)
        stats['updated'] += 1

    # Print summary
    print("\n" + "=" * 60)
    print("ENRICHMENT SUMMARY")
    print("=" * 60)
    print(f"Total custodians processed: {stats['total']}")
    print(f"Files updated: {stats['updated']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Abbreviation changes detected: {stats['abbreviation_changed']}")
    print(f"Files not found: {stats['not_found']}")
    print(f"Errors: {stats['errors']}")

    # Save changes log
    changes_log = {
        'timestamp': timestamp,
        'stats': stats,
        'changes': changes
    }
    log_file = cache_dir / 'emic_name_enrichment_log.json'
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(changes_log, f, indent=2, ensure_ascii=False)
    print(f"\nChanges log saved to: {log_file}")

    # Show sample of abbreviation changes
    abbrev_changes = [c for c in changes if c.get('abbrev_changed')]
    if abbrev_changes:
        print("\n" + "-" * 60)
        print("Sample abbreviation changes (not yet applied to GHCID):")
        print("-" * 60)
        for c in abbrev_changes[:10]:
            print(f"  {c['country']}: {c['old_abbrev']} → {c['new_abbrev']}")
            print(f"    EN: {c['old_name']}")
            print(f"    {c['language'].upper()}: {c['new_emic_name']}")
            print()


if __name__ == '__main__':
    main()