glam/scripts/enrich_custodian_emic_names.py
kempersc 6a6557bbe8 feat(enrichment): add emic name enrichment and update CustodianName schema
- Add emic_name, name_language, standardized_name to CustodianName
- Add scripts for enriching custodian emic names from Wikidata
- Add YouTube and Google Maps enrichment scripts
- Update DuckLake loader for new schema fields
2025-12-08 14:58:50 +01:00

557 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Enrich UNESCO MoW custodian files with proper CustodianName data.
This script:
1. Loads multilingual labels from Wikidata cache
2. Determines the appropriate emic (local language) name for each custodian
3. Updates custodian YAML files with:
- custodian_name.emic_name (local language name)
- custodian_name.name_language (ISO 639-1 code)
- custodian_name.standardized_name (same as emic_name for now)
4. Regenerates abbreviations from local language names if different
5. Updates GHCIDs and maintains history for changed abbreviations
Per AGENTS.md: The abbreviation and optional snake_case name suffix should be
derived from the emic name in the institution's official local/national language.
"""
import json
import yaml
import unicodedata
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, List, Tuple
# Country to primary official language(s) mapping
# Format: country_name -> (primary_lang, fallback_langs)
# For multilingual countries, we'll check if the institution has a label in any official language
# NOTE: codes are ISO 639-1 where one exists; a few entries use longer
# ISO 639-2/3 codes ('tet', 'tpi', 'gil', 'pau', 'pap') because no 639-1
# code exists for those languages — get_emic_name just treats them as
# Wikidata label keys, so the mixed lengths are intentional.
# Keys must match the 'country' field of the cached custodian records exactly
# (hence duplicate spellings such as 'Czech Republic'/'Czechia',
# 'East Timor'/'Timor-Leste', "Ivory Coast"/"Côte d'Ivoire").
COUNTRY_LANGUAGE_MAP = {
    # Europe - Western
    'Germany': ('de', []),
    'Austria': ('de', []),
    'France': ('fr', []),
    'Netherlands': ('nl', []),
    'Belgium': ('nl', ['fr', 'de']),  # Check all three official languages
    'Luxembourg': ('lb', ['fr', 'de']),
    'Switzerland': ('de', ['fr', 'it', 'rm']),  # Check all four national languages
    'United Kingdom': ('en', []),
    'Ireland': ('en', ['ga']),
    # Europe - Northern
    'Norway': ('nb', ['nn', 'no']),  # Bokmål preferred, Nynorsk fallback
    'Sweden': ('sv', []),
    'Denmark': ('da', []),
    'Finland': ('fi', ['sv']),  # Swedish is also official
    'Iceland': ('is', []),
    # Europe - Southern
    'Spain': ('es', ['ca', 'eu', 'gl']),  # Regional languages
    'Portugal': ('pt', []),
    'Italy': ('it', []),
    'Greece': ('el', []),
    'Malta': ('mt', ['en']),
    'Cyprus': ('el', ['tr']),
    # Europe - Central/Eastern
    'Poland': ('pl', []),
    'Czech Republic': ('cs', []),
    'Czechia': ('cs', []),
    'Slovakia': ('sk', []),
    'Hungary': ('hu', []),
    'Slovenia': ('sl', []),
    'Croatia': ('hr', []),
    'Serbia': ('sr', []),
    'Bosnia and Herzegovina': ('bs', ['hr', 'sr']),
    'North Macedonia': ('mk', []),
    'Albania': ('sq', []),
    'Bulgaria': ('bg', []),
    'Romania': ('ro', []),
    'Moldova': ('ro', []),
    'Ukraine': ('uk', []),
    'Belarus': ('be', ['ru']),
    'Russia': ('ru', []),
    'Estonia': ('et', []),
    'Latvia': ('lv', []),
    'Lithuania': ('lt', []),
    # Americas
    'United States': ('en', []),
    'Canada': ('en', ['fr']),
    'Mexico': ('es', []),
    'Brazil': ('pt', []),
    'Argentina': ('es', []),
    'Chile': ('es', []),
    'Colombia': ('es', []),
    'Peru': ('es', []),
    'Venezuela': ('es', []),
    'Ecuador': ('es', []),
    'Bolivia': ('es', []),
    'Paraguay': ('es', ['gn']),
    'Uruguay': ('es', []),
    'Cuba': ('es', []),
    'Dominican Republic': ('es', []),
    'Puerto Rico': ('es', ['en']),
    'Costa Rica': ('es', []),
    'Panama': ('es', []),
    'Guatemala': ('es', []),
    'Honduras': ('es', []),
    'El Salvador': ('es', []),
    'Nicaragua': ('es', []),
    'Jamaica': ('en', []),
    'Trinidad and Tobago': ('en', []),
    'Barbados': ('en', []),
    'Suriname': ('nl', []),
    'Guyana': ('en', []),
    # Asia - East
    'Japan': ('ja', []),
    "People's Republic of China": ('zh', []),
    'China': ('zh', []),
    'Taiwan': ('zh', []),
    'South Korea': ('ko', []),
    'North Korea': ('ko', []),
    'Mongolia': ('mn', []),
    # Asia - Southeast
    'Vietnam': ('vi', []),
    'Thailand': ('th', []),
    'Cambodia': ('km', []),
    'Laos': ('lo', []),
    'Myanmar': ('my', []),
    'Malaysia': ('ms', []),
    'Singapore': ('en', ['zh', 'ms', 'ta']),
    'Indonesia': ('id', []),
    'Philippines': ('tl', ['en']),
    'Brunei': ('ms', []),
    'East Timor': ('pt', ['tet']),
    'Timor-Leste': ('pt', ['tet']),
    # Asia - South
    'India': ('hi', ['en', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'or']),
    'Pakistan': ('ur', ['en']),
    'Bangladesh': ('bn', []),
    'Sri Lanka': ('si', ['ta']),
    'Nepal': ('ne', []),
    'Bhutan': ('dz', []),
    'Maldives': ('dv', []),
    # Asia - Central
    'Kazakhstan': ('kk', ['ru']),
    'Uzbekistan': ('uz', []),
    'Turkmenistan': ('tk', []),
    'Kyrgyzstan': ('ky', ['ru']),
    'Tajikistan': ('tg', []),
    'Afghanistan': ('ps', ['fa']),
    # Asia - West / Middle East
    'Turkey': ('tr', []),
    'Iran': ('fa', []),
    'Iraq': ('ar', ['ku']),
    'Syria': ('ar', []),
    'Lebanon': ('ar', []),
    'Jordan': ('ar', []),
    'Israel': ('he', ['ar']),
    'Palestine': ('ar', []),
    'Saudi Arabia': ('ar', []),
    'United Arab Emirates': ('ar', []),
    'Kuwait': ('ar', []),
    'Qatar': ('ar', []),
    'Bahrain': ('ar', []),
    'Oman': ('ar', []),
    'Yemen': ('ar', []),
    'Georgia': ('ka', []),
    'Armenia': ('hy', []),
    'Azerbaijan': ('az', []),
    # Africa - North
    'Egypt': ('ar', []),
    'Libya': ('ar', []),
    'Tunisia': ('ar', ['fr']),
    'Algeria': ('ar', ['fr']),
    'Morocco': ('ar', ['fr']),
    # Africa - West
    'Nigeria': ('en', []),
    'Ghana': ('en', []),
    'Senegal': ('fr', []),
    'Ivory Coast': ('fr', []),
    "Côte d'Ivoire": ('fr', []),
    'Mali': ('fr', []),
    'Burkina Faso': ('fr', []),
    'Niger': ('fr', []),
    'Benin': ('fr', []),
    'Togo': ('fr', []),
    'Guinea': ('fr', []),
    'Sierra Leone': ('en', []),
    'Liberia': ('en', []),
    'Mauritania': ('ar', ['fr']),
    'Cape Verde': ('pt', []),
    'Gambia': ('en', []),
    # Africa - East
    'Kenya': ('sw', ['en']),
    'Tanzania': ('sw', ['en']),
    'Uganda': ('en', ['sw']),
    'Rwanda': ('rw', ['fr', 'en']),
    'Burundi': ('rn', ['fr']),
    'Ethiopia': ('am', []),
    'Eritrea': ('ti', ['ar']),
    'Somalia': ('so', ['ar']),
    'Djibouti': ('fr', ['ar']),
    'Madagascar': ('mg', ['fr']),
    'Mauritius': ('en', ['fr']),
    'Seychelles': ('en', ['fr']),
    # Africa - Central
    'Democratic Republic of the Congo': ('fr', []),
    'Republic of the Congo': ('fr', []),
    'Central African Republic': ('fr', []),
    'Chad': ('fr', ['ar']),
    'Cameroon': ('fr', ['en']),
    'Gabon': ('fr', []),
    'Equatorial Guinea': ('es', ['fr', 'pt']),
    # Africa - Southern
    'South Africa': ('en', ['af', 'zu', 'xh']),
    'Namibia': ('en', ['de', 'af']),
    'Botswana': ('en', ['tn']),
    'Zimbabwe': ('en', ['sn', 'nd']),
    'Zambia': ('en', []),
    'Malawi': ('en', []),
    'Mozambique': ('pt', []),
    'Angola': ('pt', []),
    'Lesotho': ('en', ['st']),
    'Eswatini': ('en', ['ss']),
    # Oceania
    'Australia': ('en', []),
    'New Zealand': ('en', ['mi']),
    'Papua New Guinea': ('en', ['tpi', 'ho']),
    'Fiji': ('en', ['fj', 'hi']),
    'Vanuatu': ('en', ['fr', 'bi']),
    'Samoa': ('sm', ['en']),
    'Tonga': ('to', ['en']),
    'Solomon Islands': ('en', []),
    'Kiribati': ('en', ['gil']),
    'Micronesia': ('en', []),
    'Palau': ('en', ['pau']),
    # Caribbean
    'Haiti': ('ht', ['fr']),
    'Bahamas': ('en', []),
    'Curaçao': ('nl', ['pap']),
    'Aruba': ('nl', ['pap']),
    # Default fallback
    'Unknown': ('en', []),
}
def normalize_diacritics(text: str) -> str:
    """Strip combining diacritical marks from *text*.

    Decomposes the string with NFD so accented characters split into a base
    character plus combining marks (Unicode category 'Mn'), then drops the
    marks. Characters with no decomposition (e.g. 'ø') pass through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
# Articles, prepositions and conjunctions to ignore when abbreviating names.
# Hoisted to module level (frozenset) so it is built once, not on every call.
_SKIP_WORDS = frozenset({
    # Dutch
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', "'s",
    'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
    # English
    'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as', 'under',
    'and', 'or', 'but',
    # French
    'le', 'la', 'les', 'un', 'une', 'des', 'du', 'à', 'au', 'aux', 'dans', 'sur', 'sous',
    'pour', 'par', 'avec', "l'", "d'", 'et', 'ou',
    # German
    'der', 'die', 'das', 'dem', 'den', 'ein', 'eine', 'einer', 'einem', 'einen',
    'von', 'zu', 'für', 'bei', 'nach', 'aus', 'vor', 'über', 'unter', 'durch', 'und', 'oder',
    # Spanish
    'el', 'los', 'las', 'unos', 'unas', 'del', 'al', 'con', 'por', 'para', 'sobre', 'bajo',
    'y', 'o', 'e', 'u',
    # Portuguese
    'o', 'os', 'as', 'um', 'uma', 'uns', 'umas', 'do', 'da', 'dos', 'das', 'em', 'no', 'na',
    'nos', 'nas', 'com', 'sob',
    # Italian
    'il', 'lo', 'gli', 'uno', 'di', 'dello', 'della', 'dei', 'degli', 'delle',
    'allo', 'alla', 'ai', 'agli', 'alle', 'dal', 'dallo', 'dalla', 'dai', 'dagli', 'dalle',
    'nel', 'nello', 'nella', 'nei', 'negli', 'nelle', 'sul', 'sullo', 'sulla', 'sui', 'sugli',
    'sulle', 'per', 'tra', 'fra', 'ed', 'od',
    # Russian (transliterated)
    'i', 'v', 'na', 'pri',
})

# Punctuation stripped from a word before the skip-word / number checks.
# Compiled once at import time instead of on every word.
_WORD_PUNCT_RE = re.compile(r"[''`\",.:;!?()[\]{}]")
# Matches pure numeric ranges such as "1914-1918", which are skipped.
_NUMBER_RANGE_RE = re.compile(r'^\d+-\d+$')


def get_significant_words(text: str) -> List[str]:
    """Extract significant words from a name, skipping articles/prepositions.

    A word is significant when its lowercased, punctuation-stripped form is
    not in the multilingual skip list and is not a number or number range.
    The ORIGINAL word (with its casing and punctuation) is returned, since
    callers take initials from it.
    """
    significant = []
    for word in text.split():
        clean_word = _WORD_PUNCT_RE.sub('', word.lower())
        if clean_word and clean_word not in _SKIP_WORDS:
            # Skip pure numbers and numeric ranges (e.g. years like "1914-1918")
            if not clean_word.isdigit() and not _NUMBER_RANGE_RE.match(clean_word):
                significant.append(word)
    return significant
def generate_abbreviation(name: str, max_length: int = 10) -> str:
    """Build an abbreviation for *name* from the initials of its significant words.

    Diacritics are normalized away so the result is plain uppercase ASCII
    letters. If fewer than two initials survive, falls back to the first
    letters of the ASCII-normalized name. The result is capped at
    *max_length* characters.
    """
    words = get_significant_words(name)
    if not words:
        # No significant words at all: fall back to the first few raw words.
        words = name.split()[:3]

    initials = []
    for word in words:
        # Drop punctuation/symbols so the first character is meaningful.
        stripped = re.sub(r"[''`\",.:;!?()[\]{}&/\\+@#$%*|=<>~^_-]", '', word)
        if not stripped:
            continue
        letter = normalize_diacritics(stripped[0]).upper()
        if letter.isalpha():
            initials.append(letter)
    abbrev = ''.join(initials)

    # Too short to be a usable abbreviation: take leading letters of the name.
    if len(abbrev) < 2:
        letters_only = re.sub(r'[^A-Za-z]', '', normalize_diacritics(name))
        abbrev = letters_only[:3].upper()

    return abbrev[:max_length]
def get_emic_name(custodian: Dict, country: str, labels: Dict[str, str]) -> Tuple[str, str]:
    """
    Determine the appropriate emic (local language) name for a custodian.

    Tries the country's primary official language first, then its fallback
    languages, then English, then any available label; as a last resort the
    custodian's cached English name is used.
    Returns: (emic_name, language_code)
    """
    entry = COUNTRY_LANGUAGE_MAP.get(country)
    if entry is None:
        # Country not mapped - only English is considered.
        candidate_langs = ['en']
    else:
        primary, fallbacks = entry
        candidate_langs = [primary, *fallbacks]

    # First label available in an official language wins.
    for lang in candidate_langs:
        if lang in labels:
            return labels[lang], lang

    # English label as a general fallback.
    if 'en' in labels:
        return labels['en'], 'en'

    # Any label at all, in insertion order.
    if labels:
        lang = next(iter(labels))
        return labels[lang], lang

    # No labels whatsoever - fall back to the cached English name.
    return custodian.get('name_en', 'Unknown'), 'en'
def load_custodian_file(filepath: Path) -> Optional[Dict]:
    """Parse a custodian YAML file; return None (after printing) on any failure."""
    try:
        with filepath.open('r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        # Best-effort loader: report and let the caller decide how to proceed.
        print(f"Error loading {filepath}: {e}")
        return None
def save_custodian_file(filepath: Path, data: Dict):
    """Write *data* back to a custodian YAML file (unicode kept, key order preserved)."""
    with filepath.open('w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def _index_custodians_by_wikidata_id(custodian_dir: Path) -> Dict[str, Path]:
    """Scan every custodian YAML file once and map Wikidata QID -> file path.

    The QID is read from original_entry.wikidata_id, falling back to
    wikidata_enrichment.wikidata_entity_id. The first file seen for a QID
    wins, mirroring the previous first-match-in-glob-order behaviour.
    """
    index: Dict[str, Path] = {}
    for filepath in custodian_dir.glob('*.yaml'):
        data = load_custodian_file(filepath)
        if not data:
            continue
        wikidata_id = data.get('original_entry', {}).get('wikidata_id') or \
                      data.get('wikidata_enrichment', {}).get('wikidata_entity_id')
        if wikidata_id and wikidata_id not in index:
            index[wikidata_id] = filepath
    return index


def main():
    """Enrich custodian YAML files with emic names from cached Wikidata labels.

    Reads the multilingual-label and custodian caches, writes emic_name /
    name_language / standardized_name into each matching custodian file,
    logs any abbreviation changes (without touching GHCIDs), and saves a
    JSON report of all changes.
    """
    # Paths
    project_root = Path(__file__).parent.parent
    custodian_dir = project_root / 'data' / 'custodian'
    cache_dir = project_root / 'data' / 'cache'

    # Load cached Wikidata labels keyed by QID.
    print("Loading multilingual labels...")
    with open(cache_dir / 'unesco_mow_multilingual_labels.json', 'r') as f:
        labels_data = json.load(f)
    wikidata_labels = labels_data['custodians']

    # Load the original custodian export (provides country + English name).
    print("Loading original custodian info...")
    with open(cache_dir / 'unesco_mow_custodians.json', 'r') as f:
        custodians_data = json.load(f)
    original_custodians = {c['wikidata_id']: c for c in custodians_data['custodians']}

    print(f"Processing {len(original_custodians)} UNESCO MoW custodians...\n")

    # Statistics. NOTE: 'ghcid_changed' is reserved for when GHCID updates are
    # actually applied; it is written to the log but never incremented here.
    stats = {
        'total': 0,
        'updated': 0,
        'abbreviation_changed': 0,
        'ghcid_changed': 0,
        'not_found': 0,
        'errors': 0,
        'already_enriched': 0,
    }

    # Track changes for reporting
    changes = []
    timestamp = datetime.now(timezone.utc).isoformat()

    # Build the QID -> file index ONCE. Previously each custodian re-globbed
    # the directory and re-parsed YAML files until a match was found, i.e.
    # O(custodians * files) YAML loads; this makes the lookup O(1) per QID.
    qid_to_file = _index_custodians_by_wikidata_id(custodian_dir)

    for qid, custodian in original_custodians.items():
        stats['total'] += 1

        custodian_file = qid_to_file.get(qid)
        if not custodian_file:
            stats['not_found'] += 1
            continue

        # Load full custodian data (fresh parse before mutating/saving).
        data = load_custodian_file(custodian_file)
        if not data:
            stats['errors'] += 1
            continue

        # Get labels for this custodian
        labels_info = wikidata_labels.get(qid, {})
        labels = labels_info.get('labels', {})
        if not labels:
            print(f" No labels found for {qid}")
            continue

        # Determine emic name
        country = custodian.get('country', 'Unknown')
        emic_name, lang_code = get_emic_name(custodian, country, labels)

        # Current values
        current_name = data.get('custodian_name', {}).get('claim_value', '')
        current_emic = data.get('custodian_name', {}).get('emic_name', '')

        # Check if already enriched with this exact emic_name
        if current_emic and current_emic == emic_name:
            stats['already_enriched'] += 1
            continue

        # Generate abbreviation from emic name
        new_abbrev = generate_abbreviation(emic_name)

        # Current abbreviation is the last dash-separated component of the GHCID.
        current_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
        current_abbrev = current_ghcid.split('-')[-1] if current_ghcid else ''

        # Update custodian_name
        if 'custodian_name' not in data:
            data['custodian_name'] = {}
        data['custodian_name']['emic_name'] = emic_name
        data['custodian_name']['name_language'] = lang_code
        data['custodian_name']['standardized_name'] = emic_name

        # Keep original English name as an alternative name if it differs.
        if current_name and current_name != emic_name:
            if 'alternative_names' not in data['custodian_name']:
                data['custodian_name']['alternative_names'] = []
            existing_names = [n.get('name') if isinstance(n, dict) else n
                              for n in data['custodian_name']['alternative_names']]
            if current_name not in existing_names:
                data['custodian_name']['alternative_names'].append({
                    'name': current_name,
                    'language': 'en',
                    'source': 'wikidata'
                })

        # Track change
        change_info = {
            'wikidata_id': qid,
            'file': custodian_file.name,
            'country': country,
            'old_name': current_name,
            'new_emic_name': emic_name,
            'language': lang_code,
            'old_abbrev': current_abbrev,
            'new_abbrev': new_abbrev,
        }

        # Check if abbreviation changed
        if new_abbrev != current_abbrev and current_abbrev:
            stats['abbreviation_changed'] += 1
            change_info['abbrev_changed'] = True
            # TODO: For now, we don't update GHCID - that requires more careful handling
            # with collision detection. Just log the change.
            print(f" ABBREV CHANGE: {custodian_file.name}")
            print(f" {country}: {current_name}")
            print(f" Emic ({lang_code}): {emic_name}")
            print(f" Abbrev: {current_abbrev} → {new_abbrev}")

        changes.append(change_info)

        # Save updated file
        save_custodian_file(custodian_file, data)
        stats['updated'] += 1

    # Print summary
    print("\n" + "=" * 60)
    print("ENRICHMENT SUMMARY")
    print("=" * 60)
    print(f"Total custodians processed: {stats['total']}")
    print(f"Files updated: {stats['updated']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Abbreviation changes detected: {stats['abbreviation_changed']}")
    print(f"Files not found: {stats['not_found']}")
    print(f"Errors: {stats['errors']}")

    # Save changes log
    changes_log = {
        'timestamp': timestamp,
        'stats': stats,
        'changes': changes
    }
    log_file = cache_dir / 'emic_name_enrichment_log.json'
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(changes_log, f, indent=2, ensure_ascii=False)
    print(f"\nChanges log saved to: {log_file}")

    # Show sample of abbreviation changes
    abbrev_changes = [c for c in changes if c.get('abbrev_changed')]
    if abbrev_changes:  # was `if abbrev_changes[:10]:` — equivalent, clearer
        print("\n" + "-" * 60)
        print("Sample abbreviation changes (not yet applied to GHCID):")
        print("-" * 60)
        for c in abbrev_changes[:10]:
            print(f" {c['country']}: {c['old_abbrev']}{c['new_abbrev']}")
            print(f" EN: {c['old_name']}")
            print(f" {c['language'].upper()}: {c['new_emic_name']}")
            print()


if __name__ == '__main__':
    main()