# Changelog:
# - Add emic_name, name_language, standardized_name to CustodianName
# - Add scripts for enriching custodian emic names from Wikidata
# - Add YouTube and Google Maps enrichment scripts
# - Update DuckLake loader for new schema fields
#!/usr/bin/env python3
|
|
"""
|
|
Enrich UNESCO MoW custodian files with proper CustodianName data.
|
|
|
|
This script:
|
|
1. Loads multilingual labels from Wikidata cache
|
|
2. Determines the appropriate emic (local language) name for each custodian
|
|
3. Updates custodian YAML files with:
|
|
- custodian_name.emic_name (local language name)
|
|
- custodian_name.name_language (ISO 639-1 code)
|
|
- custodian_name.standardized_name (same as emic_name for now)
|
|
4. Regenerates abbreviations from local language names if different
|
|
5. Updates GHCIDs and maintains history for changed abbreviations
|
|
|
|
Per AGENTS.md: The abbreviation and optional snake_case name suffix should be
|
|
derived from the emic name in the institution's official local/national language.
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
import unicodedata
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Dict, List, Tuple
|
|
|
|
# Country to primary official language(s) mapping
|
|
# Format: country_name -> (primary_lang, fallback_langs)
|
|
# For multilingual countries, we'll check if the institution has a label in any official language
|
|
# Maps a country name (as it appears in the custodian records) to
# (primary_language, [fallback_languages]); codes are ISO 639-1 unless the
# language has no two-letter code (e.g. 'tpi', 'pau', 'gil', 'pap', 'tet').
# get_emic_name() tries the primary language first, then the fallbacks in
# order, when choosing which Wikidata label to use as the emic name.
COUNTRY_LANGUAGE_MAP: Dict[str, Tuple[str, List[str]]] = {
    # Europe - Western
    'Germany': ('de', []),
    'Austria': ('de', []),
    'France': ('fr', []),
    'Netherlands': ('nl', []),
    'Belgium': ('nl', ['fr', 'de']),  # Check all three official languages
    'Luxembourg': ('lb', ['fr', 'de']),
    'Switzerland': ('de', ['fr', 'it', 'rm']),  # Check all four national languages
    'United Kingdom': ('en', []),
    'Ireland': ('en', ['ga']),

    # Europe - Northern
    'Norway': ('nb', ['nn', 'no']),  # Bokmål preferred, Nynorsk fallback
    'Sweden': ('sv', []),
    'Denmark': ('da', []),
    'Finland': ('fi', ['sv']),  # Swedish is also official
    'Iceland': ('is', []),

    # Europe - Southern
    'Spain': ('es', ['ca', 'eu', 'gl']),  # Regional languages
    'Portugal': ('pt', []),
    'Italy': ('it', []),
    'Greece': ('el', []),
    'Malta': ('mt', ['en']),
    'Cyprus': ('el', ['tr']),

    # Europe - Central/Eastern
    'Poland': ('pl', []),
    'Czech Republic': ('cs', []),
    'Czechia': ('cs', []),
    'Slovakia': ('sk', []),
    'Hungary': ('hu', []),
    'Slovenia': ('sl', []),
    'Croatia': ('hr', []),
    'Serbia': ('sr', []),
    'Bosnia and Herzegovina': ('bs', ['hr', 'sr']),
    'North Macedonia': ('mk', []),
    'Albania': ('sq', []),
    'Bulgaria': ('bg', []),
    'Romania': ('ro', []),
    'Moldova': ('ro', []),
    'Ukraine': ('uk', []),
    'Belarus': ('be', ['ru']),
    'Russia': ('ru', []),
    'Estonia': ('et', []),
    'Latvia': ('lv', []),
    'Lithuania': ('lt', []),

    # Americas
    'United States': ('en', []),
    'Canada': ('en', ['fr']),
    'Mexico': ('es', []),
    'Brazil': ('pt', []),
    'Argentina': ('es', []),
    'Chile': ('es', []),
    'Colombia': ('es', []),
    'Peru': ('es', []),
    'Venezuela': ('es', []),
    'Ecuador': ('es', []),
    'Bolivia': ('es', []),
    'Paraguay': ('es', ['gn']),
    'Uruguay': ('es', []),
    'Cuba': ('es', []),
    'Dominican Republic': ('es', []),
    'Puerto Rico': ('es', ['en']),
    'Costa Rica': ('es', []),
    'Panama': ('es', []),
    'Guatemala': ('es', []),
    'Honduras': ('es', []),
    'El Salvador': ('es', []),
    'Nicaragua': ('es', []),
    'Jamaica': ('en', []),
    'Trinidad and Tobago': ('en', []),
    'Barbados': ('en', []),
    'Suriname': ('nl', []),
    'Guyana': ('en', []),

    # Asia - East
    'Japan': ('ja', []),
    "People's Republic of China": ('zh', []),
    'China': ('zh', []),
    'Taiwan': ('zh', []),
    'South Korea': ('ko', []),
    'North Korea': ('ko', []),
    'Mongolia': ('mn', []),

    # Asia - Southeast
    'Vietnam': ('vi', []),
    'Thailand': ('th', []),
    'Cambodia': ('km', []),
    'Laos': ('lo', []),
    'Myanmar': ('my', []),
    'Malaysia': ('ms', []),
    'Singapore': ('en', ['zh', 'ms', 'ta']),
    'Indonesia': ('id', []),
    'Philippines': ('tl', ['en']),
    'Brunei': ('ms', []),
    'East Timor': ('pt', ['tet']),
    'Timor-Leste': ('pt', ['tet']),

    # Asia - South
    'India': ('hi', ['en', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'or']),
    'Pakistan': ('ur', ['en']),
    'Bangladesh': ('bn', []),
    'Sri Lanka': ('si', ['ta']),
    'Nepal': ('ne', []),
    'Bhutan': ('dz', []),
    'Maldives': ('dv', []),

    # Asia - Central
    'Kazakhstan': ('kk', ['ru']),
    'Uzbekistan': ('uz', []),
    'Turkmenistan': ('tk', []),
    'Kyrgyzstan': ('ky', ['ru']),
    'Tajikistan': ('tg', []),
    'Afghanistan': ('ps', ['fa']),

    # Asia - West / Middle East
    'Turkey': ('tr', []),
    'Iran': ('fa', []),
    'Iraq': ('ar', ['ku']),
    'Syria': ('ar', []),
    'Lebanon': ('ar', []),
    'Jordan': ('ar', []),
    'Israel': ('he', ['ar']),
    'Palestine': ('ar', []),
    'Saudi Arabia': ('ar', []),
    'United Arab Emirates': ('ar', []),
    'Kuwait': ('ar', []),
    'Qatar': ('ar', []),
    'Bahrain': ('ar', []),
    'Oman': ('ar', []),
    'Yemen': ('ar', []),
    'Georgia': ('ka', []),
    'Armenia': ('hy', []),
    'Azerbaijan': ('az', []),

    # Africa - North
    'Egypt': ('ar', []),
    'Libya': ('ar', []),
    'Tunisia': ('ar', ['fr']),
    'Algeria': ('ar', ['fr']),
    'Morocco': ('ar', ['fr']),

    # Africa - West
    'Nigeria': ('en', []),
    'Ghana': ('en', []),
    'Senegal': ('fr', []),
    'Ivory Coast': ('fr', []),
    "Côte d'Ivoire": ('fr', []),
    'Mali': ('fr', []),
    'Burkina Faso': ('fr', []),
    'Niger': ('fr', []),
    'Benin': ('fr', []),
    'Togo': ('fr', []),
    'Guinea': ('fr', []),
    'Sierra Leone': ('en', []),
    'Liberia': ('en', []),
    'Mauritania': ('ar', ['fr']),
    'Cape Verde': ('pt', []),
    'Gambia': ('en', []),

    # Africa - East
    'Kenya': ('sw', ['en']),
    'Tanzania': ('sw', ['en']),
    'Uganda': ('en', ['sw']),
    'Rwanda': ('rw', ['fr', 'en']),
    'Burundi': ('rn', ['fr']),
    'Ethiopia': ('am', []),
    'Eritrea': ('ti', ['ar']),
    'Somalia': ('so', ['ar']),
    'Djibouti': ('fr', ['ar']),
    'Madagascar': ('mg', ['fr']),
    'Mauritius': ('en', ['fr']),
    'Seychelles': ('en', ['fr']),

    # Africa - Central
    'Democratic Republic of the Congo': ('fr', []),
    'Republic of the Congo': ('fr', []),
    'Central African Republic': ('fr', []),
    'Chad': ('fr', ['ar']),
    'Cameroon': ('fr', ['en']),
    'Gabon': ('fr', []),
    'Equatorial Guinea': ('es', ['fr', 'pt']),

    # Africa - Southern
    'South Africa': ('en', ['af', 'zu', 'xh']),
    'Namibia': ('en', ['de', 'af']),
    'Botswana': ('en', ['tn']),
    'Zimbabwe': ('en', ['sn', 'nd']),
    'Zambia': ('en', []),
    'Malawi': ('en', []),
    'Mozambique': ('pt', []),
    'Angola': ('pt', []),
    'Lesotho': ('en', ['st']),
    'Eswatini': ('en', ['ss']),

    # Oceania
    'Australia': ('en', []),
    'New Zealand': ('en', ['mi']),
    'Papua New Guinea': ('en', ['tpi', 'ho']),
    'Fiji': ('en', ['fj', 'hi']),
    'Vanuatu': ('en', ['fr', 'bi']),
    'Samoa': ('sm', ['en']),
    'Tonga': ('to', ['en']),
    'Solomon Islands': ('en', []),
    'Kiribati': ('en', ['gil']),
    'Micronesia': ('en', []),
    'Palau': ('en', ['pau']),

    # Caribbean
    'Haiti': ('ht', ['fr']),
    'Bahamas': ('en', []),
    'Curaçao': ('nl', ['pap']),
    'Aruba': ('nl', ['pap']),

    # Default fallback
    'Unknown': ('en', []),
}
|
|
|
|
|
|
def normalize_diacritics(text: str) -> str:
    """Fold combining diacritics to their ASCII base characters.

    Decomposes the string (NFD) so accented characters split into a base
    character plus combining marks, then drops the marks (Unicode category
    'Mn').  E.g. 'é' -> 'e'.  Characters with no decomposition (CJK, etc.)
    pass through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
|
|
|
|
|
|
# Articles, prepositions, and conjunctions to skip when abbreviating, grouped
# by language.  Hoisted to a module-level frozenset: the original rebuilt this
# large set literal on every call.
_SKIP_WORDS = frozenset({
    # Dutch
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', "'s",
    'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
    # English
    'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as', 'under',
    'and', 'or', 'but',
    # French
    'le', 'la', 'les', 'un', 'une', 'des', 'du', 'à', 'au', 'aux', 'dans', 'sur', 'sous',
    'pour', 'par', 'avec', "l'", "d'", 'et', 'ou',
    # German
    'der', 'die', 'das', 'dem', 'den', 'ein', 'eine', 'einer', 'einem', 'einen',
    'von', 'zu', 'für', 'bei', 'nach', 'aus', 'vor', 'über', 'unter', 'durch', 'und', 'oder',
    # Spanish
    'el', 'los', 'las', 'unos', 'unas', 'del', 'al', 'con', 'por', 'para', 'sobre', 'bajo',
    'y', 'o', 'e', 'u',
    # Portuguese
    'o', 'os', 'as', 'um', 'uma', 'uns', 'umas', 'do', 'da', 'dos', 'das', 'em', 'no', 'na',
    'nos', 'nas', 'com', 'sob',
    # Italian
    'il', 'lo', 'gli', 'uno', 'di', 'dello', 'della', 'dei', 'degli', 'delle',
    'allo', 'alla', 'ai', 'agli', 'alle', 'dal', 'dallo', 'dalla', 'dai', 'dagli', 'dalle',
    'nel', 'nello', 'nella', 'nei', 'negli', 'nelle', 'sul', 'sullo', 'sulla', 'sui', 'sugli',
    'sulle', 'per', 'tra', 'fra', 'ed', 'od',
    # Russian (transliterated)
    'i', 'v', 'na', 'pri',
})

# Punctuation stripped before the stop-word comparison; the returned words
# keep their original casing and punctuation.  Compiled once instead of on
# every word.
_WORD_CLEAN_RE = re.compile(r"[''`\",.:;!?()[\]{}]")
# Numeric ranges like "1939-1945" are treated like pure numbers.
_NUMBER_RANGE_RE = re.compile(r'^\d+-\d+$')


def get_significant_words(text: str) -> List[str]:
    """Extract significant words from a name, skipping articles/prepositions.

    Each whitespace-separated word is lowercased and stripped of punctuation
    only for the stop-word/number test; the word appended to the result is
    the original token.  Pure numbers and numeric ranges are dropped.

    Args:
        text: A name or title in any of the supported languages.

    Returns:
        The original words deemed significant, in their original order.
    """
    significant = []
    for word in text.split():
        clean_word = _WORD_CLEAN_RE.sub('', word.lower())
        if not clean_word or clean_word in _SKIP_WORDS:
            continue
        if clean_word.isdigit() or _NUMBER_RANGE_RE.match(clean_word):
            continue
        significant.append(word)
    return significant
|
|
|
|
|
|
def generate_abbreviation(name: str, max_length: int = 10) -> str:
    """Generate an abbreviation from an emic name.

    Takes the first letter of each significant word, with diacritics folded
    to ASCII.  If fewer than two letters result (e.g. one-word names), falls
    back to the first few ASCII letters of the whole name.

    Bug fix: the fallback strips the name to [A-Za-z], so for names written
    entirely in non-Latin scripts (CJK, Arabic, etc.) it used to *replace* a
    valid single-character abbreviation with the empty string.  The fallback
    is now applied only when it actually produces letters.

    Args:
        name: The institution's local-language name.
        max_length: Maximum abbreviation length (default 10).

    Returns:
        An uppercase-initial abbreviation of at most ``max_length`` chars;
        shorter than 2 chars only when the name has no Latin letters at all.
    """
    significant_words = get_significant_words(name)
    if not significant_words:
        # Fallback: no significant words, use the first three words as-is
        significant_words = name.split()[:3]

    abbrev = ''
    for word in significant_words:
        # Strip punctuation/symbols before looking at the first character
        clean = re.sub(r"[''`\",.:;!?()[\]{}&/\\+@#$%*|=<>~^_-]", '', word)
        if clean:
            # Fold diacritics, keep only alphabetic first letters
            first_letter = normalize_diacritics(clean[0]).upper()
            if first_letter.isalpha():
                abbrev += first_letter

    # Ensure at least 2 characters, but never clobber a non-empty
    # abbreviation with an empty fallback (non-Latin-script names).
    if len(abbrev) < 2:
        ascii_letters = re.sub(r'[^A-Za-z]', '', normalize_diacritics(name))
        if ascii_letters:
            abbrev = ascii_letters[:3].upper()

    return abbrev[:max_length]
|
|
|
|
|
|
def get_emic_name(custodian: Dict, country: str, labels: Dict[str, str]) -> Tuple[str, str]:
    """Pick the local-language (emic) name for a custodian.

    Tries the country's official language(s) in priority order, then
    English, then any available label, and finally the custodian's original
    English name.

    Returns:
        (emic_name, language_code)
    """
    # Unknown countries just try English
    primary_lang, fallback_langs = COUNTRY_LANGUAGE_MAP.get(country, ('en', []))
    candidate_langs = [primary_lang, *fallback_langs]

    # First label found in priority order wins
    for lang in candidate_langs:
        if lang in labels:
            return labels[lang], lang

    # Fallback to English if available
    if 'en' in labels:
        return labels['en'], 'en'

    # Ultimate fallback: whichever label comes first
    if labels:
        first_lang = next(iter(labels))
        return labels[first_lang], first_lang

    # No labels at all - use the original name from the custodian record
    return custodian.get('name_en', 'Unknown'), 'en'
|
|
|
|
|
|
def load_custodian_file(filepath: Path) -> Optional[Dict]:
    """Parse a custodian YAML file; log and return None on any failure."""
    try:
        with filepath.open('r', encoding='utf-8') as handle:
            return yaml.safe_load(handle)
    except Exception as e:
        # Best-effort loader: a bad file is reported, not fatal
        print(f"Error loading {filepath}: {e}")
        return None
|
|
|
|
|
|
def save_custodian_file(filepath: Path, data: Dict):
    """Write custodian data back as UTF-8 YAML, preserving key order."""
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
|
|
|
|
|
|
def _build_wikidata_index(custodian_dir: Path) -> Dict[str, Path]:
    """Map Wikidata QID -> custodian YAML file path in one directory scan.

    The previous implementation re-globbed the directory and re-parsed every
    YAML file for *each* custodian (O(custodians x files) file I/O), then
    parsed the matched file a second time.  Scanning once up front makes the
    per-custodian lookup O(1).  If two files claim the same QID, the first
    one seen wins (matching the old first-match behaviour).
    """
    index: Dict[str, Path] = {}
    for filepath in custodian_dir.glob('*.yaml'):
        data = load_custodian_file(filepath)
        if not data:
            continue
        # A file may record its QID in either of two places
        wikidata_id = data.get('original_entry', {}).get('wikidata_id') or \
                      data.get('wikidata_enrichment', {}).get('wikidata_entity_id')
        if wikidata_id:
            index.setdefault(wikidata_id, filepath)
    return index


def main():
    """Enrich all UNESCO MoW custodian files with emic-name data."""
    # Paths
    project_root = Path(__file__).parent.parent
    custodian_dir = project_root / 'data' / 'custodian'
    cache_dir = project_root / 'data' / 'cache'

    # Cached Wikidata multilingual labels, keyed by QID
    print("Loading multilingual labels...")
    with open(cache_dir / 'unesco_mow_multilingual_labels.json', 'r') as f:
        labels_data = json.load(f)
    wikidata_labels = labels_data['custodians']

    # Original custodian records (source of the 'country' field)
    print("Loading original custodian info...")
    with open(cache_dir / 'unesco_mow_custodians.json', 'r') as f:
        custodians_data = json.load(f)
    original_custodians = {c['wikidata_id']: c for c in custodians_data['custodians']}

    print(f"Processing {len(original_custodians)} UNESCO MoW custodians...\n")

    # Statistics
    stats = {
        'total': 0,
        'updated': 0,
        'abbreviation_changed': 0,
        'ghcid_changed': 0,
        'not_found': 0,
        'errors': 0,
        'already_enriched': 0,
    }

    # Track changes for reporting
    changes = []

    timestamp = datetime.now(timezone.utc).isoformat()

    # Build the QID -> file index once instead of re-scanning per custodian
    qid_to_file = _build_wikidata_index(custodian_dir)

    for qid, custodian in original_custodians.items():
        stats['total'] += 1

        # Find the custodian file by Wikidata ID
        custodian_file = qid_to_file.get(qid)
        if not custodian_file:
            stats['not_found'] += 1
            continue

        # Load full custodian data
        data = load_custodian_file(custodian_file)
        if not data:
            stats['errors'] += 1
            continue

        # Get labels for this custodian
        labels = wikidata_labels.get(qid, {}).get('labels', {})
        if not labels:
            print(f" No labels found for {qid}")
            continue

        # Determine emic name from the country's official language(s)
        country = custodian.get('country', 'Unknown')
        emic_name, lang_code = get_emic_name(custodian, country, labels)

        # Current values
        current_name = data.get('custodian_name', {}).get('claim_value', '')
        current_emic = data.get('custodian_name', {}).get('emic_name', '')

        # Skip files already carrying the same emic_name
        if current_emic and current_emic == emic_name:
            stats['already_enriched'] += 1
            continue

        # Generate abbreviation from the emic name
        new_abbrev = generate_abbreviation(emic_name)

        # Current abbreviation is the last '-'-separated segment of the GHCID
        current_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
        current_abbrev = current_ghcid.split('-')[-1] if current_ghcid else ''

        # Update custodian_name
        data.setdefault('custodian_name', {})
        data['custodian_name']['emic_name'] = emic_name
        data['custodian_name']['name_language'] = lang_code
        data['custodian_name']['standardized_name'] = emic_name

        # Keep original English name as an alternative if different
        if current_name and current_name != emic_name:
            alternatives = data['custodian_name'].setdefault('alternative_names', [])
            existing_names = [n.get('name') if isinstance(n, dict) else n
                              for n in alternatives]
            if current_name not in existing_names:
                alternatives.append({
                    'name': current_name,
                    'language': 'en',
                    'source': 'wikidata'
                })

        # Track change
        change_info = {
            'wikidata_id': qid,
            'file': custodian_file.name,
            'country': country,
            'old_name': current_name,
            'new_emic_name': emic_name,
            'language': lang_code,
            'old_abbrev': current_abbrev,
            'new_abbrev': new_abbrev,
        }

        # Check if abbreviation changed
        if new_abbrev != current_abbrev and current_abbrev:
            stats['abbreviation_changed'] += 1
            change_info['abbrev_changed'] = True

            # TODO: For now, we don't update GHCID - that requires more careful
            # handling with collision detection. Just log the change.
            print(f" ABBREV CHANGE: {custodian_file.name}")
            print(f" {country}: {current_name}")
            print(f" Emic ({lang_code}): {emic_name}")
            print(f" Abbrev: {current_abbrev} → {new_abbrev}")

        changes.append(change_info)

        # Save updated file
        save_custodian_file(custodian_file, data)
        stats['updated'] += 1

    # Print summary
    print("\n" + "=" * 60)
    print("ENRICHMENT SUMMARY")
    print("=" * 60)
    print(f"Total custodians processed: {stats['total']}")
    print(f"Files updated: {stats['updated']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Abbreviation changes detected: {stats['abbreviation_changed']}")
    print(f"Files not found: {stats['not_found']}")
    print(f"Errors: {stats['errors']}")

    # Save changes log
    changes_log = {
        'timestamp': timestamp,
        'stats': stats,
        'changes': changes
    }

    log_file = cache_dir / 'emic_name_enrichment_log.json'
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(changes_log, f, indent=2, ensure_ascii=False)
    print(f"\nChanges log saved to: {log_file}")

    # Show a sample of abbreviation changes
    abbrev_changes = [c for c in changes if c.get('abbrev_changed')]
    if abbrev_changes:
        print("\n" + "-" * 60)
        print("Sample abbreviation changes (not yet applied to GHCID):")
        print("-" * 60)
        for c in abbrev_changes[:10]:
            print(f" {c['country']}: {c['old_abbrev']} → {c['new_abbrev']}")
            print(f" EN: {c['old_name']}")
            print(f" {c['language'].upper()}: {c['new_emic_name']}")
            print()
|
|
|
|
|
|
# Script entry point: run the enrichment when executed directly.
if __name__ == '__main__':
    main()
|