glam/scripts/create_mow_custodians.py
2025-12-07 00:26:01 +01:00

1162 lines
40 KiB
Python

#!/usr/bin/env python3
"""
Create custodian YAML files from UNESCO Memory of the World (MoW) derived institutions.

This script creates new custodian entries for heritage institutions that hold
MoW-inscribed documentary heritage but don't yet exist in our custodian database.
The MoW data was sourced from Wikidata (UNESCO has no public MoW API).

Usage:
    python scripts/create_mow_custodians.py [--dry-run] [--limit N]

Data Sources:
    - /tmp/mow_custodians_enriched.json - Wikidata-enriched MoW custodian data
    - data/reference/geonames.db (relative to the project root) - GeoNames
      database for location resolution
"""
import json
import os
import re
import sqlite3
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List
import argparse
import yaml
# Filesystem locations used by this script.
# The project root is the parent of the directory that contains this file.
PROJECT_ROOT = Path(__file__).parent.parent
# Directory that receives one YAML file per custodian.
CUSTODIAN_DIR = PROJECT_ROOT.joinpath("data", "custodian")
# SQLite database queried for city lookups.
GEONAMES_DB = PROJECT_ROOT.joinpath("data", "reference", "geonames.db")
# Input JSON with the Wikidata-enriched MoW custodians (absolute path).
MOW_ENRICHED_FILE = Path("/tmp", "mow_custodians_enriched.json")
# Country label -> ISO 3166-1 alpha-2 code.
# Keys are matched exactly (case- and whitespace-sensitive) by get_country_code();
# any label missing here resolves to the placeholder "XX".
# Several spellings of the same country may map to one code.
COUNTRY_CODE_MAP = {
    # Sovereign states and territories, in no particular order
    "Germany": "DE", "France": "FR", "South Korea": "KR", "United Kingdom": "GB",
    "People's Republic of China": "CN", "China": "CN", "Switzerland": "CH",
    "Spain": "ES", "United States of America": "US", "United States": "US",
    "Brazil": "BR", "India": "IN", "Iran": "IR", "Poland": "PL", "Turkey": "TR",
    "Sweden": "SE", "Italy": "IT", "Japan": "JP", "Canada": "CA", "Russia": "RU",
    "Czech Republic": "CZ", "Czechia": "CZ", "Mexico": "MX", "Austria": "AT",
    "Portugal": "PT", "Netherlands": "NL", "Belgium": "BE", "Denmark": "DK",
    "Norway": "NO", "Finland": "FI", "Australia": "AU", "New Zealand": "NZ",
    "Argentina": "AR", "Chile": "CL", "Colombia": "CO", "Peru": "PE",
    "Venezuela": "VE", "Egypt": "EG", "South Africa": "ZA", "Israel": "IL",
    "Indonesia": "ID", "Malaysia": "MY", "Thailand": "TH", "Vietnam": "VN",
    "Philippines": "PH", "Singapore": "SG", "Taiwan": "TW", "Hong Kong": "HK",
    "Pakistan": "PK", "Bangladesh": "BD", "Sri Lanka": "LK", "Nepal": "NP",
    "Greece": "GR", "Hungary": "HU", "Romania": "RO", "Bulgaria": "BG",
    "Slovakia": "SK", "Slovenia": "SI", "Croatia": "HR", "Serbia": "RS",
    "Ukraine": "UA", "Belarus": "BY", "Lithuania": "LT", "Latvia": "LV",
    # Scotland and Wales are folded into the United Kingdom's code (GB)
    "Estonia": "EE", "Ireland": "IE", "Scotland": "GB", "Wales": "GB",
    "Morocco": "MA", "Tunisia": "TN", "Algeria": "DZ", "Nigeria": "NG",
    "Kenya": "KE", "Ghana": "GH", "Ethiopia": "ET", "Tanzania": "TZ",
    "Saudi Arabia": "SA", "United Arab Emirates": "AE", "Qatar": "QA",
    "Kuwait": "KW", "Bahrain": "BH", "Oman": "OM", "Jordan": "JO",
    "Lebanon": "LB", "Syria": "SY", "Iraq": "IQ", "Yemen": "YE",
    "Afghanistan": "AF", "Kazakhstan": "KZ", "Uzbekistan": "UZ",
    "Turkmenistan": "TM", "Tajikistan": "TJ", "Kyrgyzstan": "KG",
    "Azerbaijan": "AZ", "Armenia": "AM", "Georgia": "GE", "Mongolia": "MN",
    "North Korea": "KP", "Cuba": "CU", "Jamaica": "JM", "Haiti": "HT",
    "Dominican Republic": "DO", "Puerto Rico": "PR", "Trinidad and Tobago": "TT",
    "Barbados": "BB", "Guatemala": "GT", "Honduras": "HN",
    "El Salvador": "SV", "Nicaragua": "NI", "Costa Rica": "CR", "Panama": "PA",
    "Ecuador": "EC", "Bolivia": "BO", "Paraguay": "PY", "Uruguay": "UY",
    "Luxembourg": "LU", "Liechtenstein": "LI", "Monaco": "MC", "Andorra": "AD",
    "San Marino": "SM", "Vatican City": "VA", "Malta": "MT", "Cyprus": "CY",
    "Iceland": "IS", "Greenland": "GL", "Faroe Islands": "FO",
    "North Macedonia": "MK", "Bosnia and Herzegovina": "BA", "Albania": "AL",
    # NOTE(review): "XK" for Kosovo is presumably the user-assigned code - confirm
    "Montenegro": "ME", "Kosovo": "XK", "Moldova": "MD",
    "Democratic Republic of the Congo": "CD", "Republic of the Congo": "CG",
    "Cameroon": "CM", "Senegal": "SN", "Mali": "ML", "Burkina Faso": "BF",
    "Niger": "NE", "Chad": "TD", "Sudan": "SD", "South Sudan": "SS",
    "Uganda": "UG", "Rwanda": "RW", "Burundi": "BI", "Malawi": "MW",
    "Zambia": "ZM", "Zimbabwe": "ZW", "Botswana": "BW", "Namibia": "NA",
    "Mozambique": "MZ", "Madagascar": "MG", "Mauritius": "MU",
    "Seychelles": "SC", "Comoros": "KM", "Réunion": "RE",
    "Cambodia": "KH", "Laos": "LA", "Myanmar": "MM", "Brunei": "BN",
    "East Timor": "TL", "Timor-Leste": "TL", "Papua New Guinea": "PG",
    "Fiji": "FJ", "Samoa": "WS", "Tonga": "TO", "Vanuatu": "VU",
    "New Caledonia": "NC", "French Polynesia": "PF", "Guam": "GU",
    # Later additions: mostly Caribbean territories and small states, but this
    # group also holds Angola, Benin and Cape Verde (Africa)
    "Angola": "AO",
    "Anguilla": "AI",
    "Antigua and Barbuda": "AG",
    "Aruba": "AW",
    "Bahamas": "BS", "The Bahamas": "BS",
    "Belize": "BZ",
    "Benin": "BJ",
    "Cape Verde": "CV", "Cabo Verde": "CV",
    "Curaçao": "CW", "Curacao": "CW",
    "Guyana": "GY",
    "Montserrat": "MS",
    "Saint Lucia": "LC", "St. Lucia": "LC", "St Lucia": "LC",
    "Sint Maarten": "SX",
    "Suriname": "SR",
    # Historical/dissolved entities (mapped to a successor state's code)
    # Netherlands Antilles dissolved in 2010 - CW (Curaçao) is used as the primary successor
    "Netherlands Antilles": "CW",
    # Additional Caribbean (Bermuda is listed here as well)
    "Dominica": "DM",
    "Grenada": "GD",
    "Saint Kitts and Nevis": "KN", "St. Kitts and Nevis": "KN",
    "Saint Vincent and the Grenadines": "VC", "St. Vincent and the Grenadines": "VC",
    "British Virgin Islands": "VG",
    "U.S. Virgin Islands": "VI", "US Virgin Islands": "VI",
    "Cayman Islands": "KY",
    "Turks and Caicos Islands": "TC",
    "Bermuda": "BM",
    # Additional Africa
    "Ivory Coast": "CI", "Côte d'Ivoire": "CI",
    "Gabon": "GA",
    "Equatorial Guinea": "GQ",
    "Guinea": "GN",
    "Guinea-Bissau": "GW",
    "Liberia": "LR",
    "Sierra Leone": "SL",
    "Togo": "TG",
    "Central African Republic": "CF",
    "Eritrea": "ER",
    "Djibouti": "DJ",
    "Somalia": "SO",
    "Lesotho": "LS",
    "Eswatini": "SZ", "Swaziland": "SZ",
    "São Tomé and Príncipe": "ST",
    "Gambia": "GM", "The Gambia": "GM",
    "Mauritania": "MR",
    # Additional Asia/Pacific
    "Bhutan": "BT",
    "Maldives": "MV",
    "Solomon Islands": "SB",
    "Kiribati": "KI",
    "Marshall Islands": "MH",
    "Micronesia": "FM", "Federated States of Micronesia": "FM",
    "Nauru": "NR",
    "Palau": "PW",
    "Tuvalu": "TV",
    "Cook Islands": "CK",
    "Niue": "NU",
    "Tokelau": "TK",
    "American Samoa": "AS",
    "Northern Mariana Islands": "MP",
    "Macau": "MO", "Macao": "MO",
}
# Keyword -> single-letter institution type code, used by detect_institution_type().
# Each Wikidata type label is lower-cased and tested for containing a keyword;
# the first keyword that matches (in the insertion order below) wins, so the
# order of entries is significant.
# NOTE(review): the meaning of each letter is not defined in this file; the
# group comments below only describe the keywords - confirm against the GHCID spec.
INSTANCE_TYPE_MAP = {
    # Archives -> "A"
    "archive": "A", "archives": "A", "national archive": "A", "national archives": "A",
    "state archive": "A", "state archives": "A", "city archive": "A", "municipal archive": "A",
    # Libraries -> "L"
    "library": "L", "national library": "L", "public library": "L", "university library": "L",
    "research library": "L", "academic library": "L", "special library": "L",
    # Museums and generic cultural/heritage institutions -> "M"
    "museum": "M", "art museum": "M", "history museum": "M", "national museum": "M",
    "cultural institution": "M", "heritage institution": "M",
    # Galleries -> "G"
    "gallery": "G", "art gallery": "G",
    # Universities/colleges -> "E"; research bodies -> "R"
    "university": "E", "college": "E", "research institute": "R", "research center": "R",
    # Foundations and non-profits -> "N"
    # NOTE(review): labels are lower-cased before matching, so keys are expected
    # to be lower-case; "NGO" is upper-case - confirm the matcher normalises key case.
    "foundation": "N", "non-profit organization": "N", "NGO": "N",
    # Government bodies -> "O"
    "government agency": "O", "government organization": "O", "public body": "O",
    # Religious organisations and buildings -> "H"
    "religious organization": "H", "church": "H", "monastery": "H", "abbey": "H",
    "cathedral": "H", "temple": "H", "mosque": "H", "synagogue": "H",
    # Botanical gardens and zoos -> "B"
    "botanical garden": "B", "zoo": "B", "zoological garden": "B",
    # Broadcasters and press -> "C"
    "television station": "C", "broadcasting company": "C", "newspaper": "C",
}
# Function words (articles, prepositions, conjunctions) that are ignored when an
# abbreviation is built from an institution name. The words are listed per
# language; a word shared by several languages simply collapses into one set
# member. Entries are lower-case because callers lower-case words before the
# membership test.
SKIP_WORDS = {
    word
    for group in (
        # Dutch
        "de het een van voor in op te den der des 's aan bij met naar om tot "
        "uit over onder door en of",
        # English
        "a an the of in at on to for with from by as under and or but",
        # French
        "le la les un une des du à au aux dans sur sous pour par avec l' et ou",
        # German
        "der die das den dem ein eine einer einem einen von zu für mit bei "
        "nach aus vor über unter durch und oder",
        # Spanish
        "el los las unos unas del al con por para sobre bajo y o e u",
        # Portuguese
        "o os as um uma uns umas do da dos das em no na nos nas com sob",
        # Italian
        "il lo i gli dello della dei degli delle allo alla ai agli alle dal "
        "dallo dalla dai dagli dalle nel nello nella nei negli nelle sul sullo "
        "sulla sui sugli sulle per tra fra ed od",
    )
    for word in group.split()
}
def get_country_code(country_label: str) -> str:
    """Return the ISO 3166-1 alpha-2 code for a country label.

    The label is looked up verbatim in ``COUNTRY_CODE_MAP``; a label that is
    not listed there yields the placeholder code ``"XX"``.
    """
    if country_label in COUNTRY_CODE_MAP:
        return COUNTRY_CODE_MAP[country_label]
    return "XX"
def detect_institution_type(
    instance_of: str,
    all_types: Optional[List[str]] = None,
    type_map: Optional[Dict[str, str]] = None,
) -> str:
    """Detect the institution type code from Wikidata instance_of labels.

    Args:
        instance_of: Primary type label; checked first.
        all_types: Further type labels, checked in order after ``instance_of``.
        type_map: Keyword -> type code mapping. Defaults to
            ``INSTANCE_TYPE_MAP``. Keywords are tried in the mapping's
            insertion order, and the first match wins.

    Returns:
        The code of the first matching keyword, or ``"A"`` when nothing
        matches (MoW custodians hold documentary heritage, so archive is the
        default).

    Matching is case-insensitive. Lower-case keywords match as substrings of
    the label (so "museum" also matches "ecomuseum"). Keywords that contain
    upper-case letters, such as the acronym "NGO", are matched as whole words
    only, which keeps them from firing inside unrelated words like "Congo".
    Empty and non-string labels are ignored.
    """
    keyword_codes = INSTANCE_TYPE_MAP if type_map is None else type_map

    for type_label in [instance_of, *(all_types or [])]:
        if not type_label or not isinstance(type_label, str):
            continue
        type_lower = type_label.lower()
        for keyword, code in keyword_codes.items():
            lowered = keyword.lower()
            if lowered == keyword:
                matched = keyword in type_lower
            else:
                # The label has been lower-cased, so an upper-case keyword
                # could never match verbatim; compare it case-insensitively.
                matched = re.search(rf"\b{re.escape(lowered)}\b", type_lower) is not None
            if matched:
                return code

    # Default to Archive for MoW custodians (they hold documentary heritage)
    return "A"
def generate_abbreviation(name: str, skip_words: Optional[set] = None) -> str:
    """Generate an institution abbreviation from its name.

    The name is split on whitespace, hyphens and slashes. Words found in the
    skip list (compared lower-cased) and purely numeric words are dropped, and
    the upper-cased first letter or digit of each of the first ten remaining
    words forms the abbreviation.

    Args:
        name: Institution name.
        skip_words: Words to ignore. Defaults to ``SKIP_WORDS``.

    Returns:
        The abbreviation, or ``"UNK"`` when the name is empty or contains no
        letters or digits. Only letters and digits are emitted, because the
        result becomes part of an identifier and a file name.
    """
    if not name:
        return "UNK"

    ignored = SKIP_WORDS if skip_words is None else skip_words

    # Split into words
    words = re.split(r'[\s\-/]+', name)

    # Filter out skip words, empty strings and bare numbers
    significant_words = [
        w for w in words
        if w and w.lower() not in ignored and not w.isdigit()
    ]
    if not significant_words:
        # Nothing significant: fall back to the leading words as they are.
        significant_words = words[:3]

    # First alphanumeric character of each word, so quotes, brackets or a
    # bare "&" never end up in the abbreviation.
    initials = []
    for word in significant_words[:10]:
        first = next((ch for ch in word if ch.isalnum()), None)
        if first is not None:
            initials.append(first.upper())
    abbrev = ''.join(initials)

    # Too short: use the leading letters of the significant words instead.
    if len(abbrev) < 2:
        letters = [ch for ch in ''.join(significant_words) if ch.isalnum()]
        abbrev = ''.join(letters[:3]).upper()

    return abbrev or "UNK"
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the deterministic, name-based UUID v5 for a GHCID string.

    The namespace is the standard DNS namespace
    (6ba7b810-9dad-11d1-80b4-00c04fd430c8), not the URL namespace. It must
    stay as it is: a different namespace would change every derived UUID.
    """
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, ghcid_string))
def generate_ghcid_sha256(ghcid_string: str) -> str:
    """Return a UUID v8-style identifier built from the SHA-256 of a GHCID.

    The first 16 bytes of the digest are used as the UUID value, with the
    version nibble forced to 8 and the two variant bits forced to binary 10.
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    value = int.from_bytes(digest[:16], 'big')

    # Version: the high nibble of byte 6, i.e. bits 76-79 of the 128-bit value.
    value = (value & ~(0xF << 76)) | (0x8 << 76)
    # Variant: the two top bits of byte 8, i.e. bits 62-63, set to 0b10.
    value = (value & ~(0x3 << 62)) | (0x2 << 62)

    return str(uuid.UUID(int=value))
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return an unsigned 64-bit integer ID derived from a GHCID string.

    The value is the first 8 bytes of the SHA-256 digest, read big-endian.
    """
    digest_hex = hashlib.sha256(ghcid_string.encode()).hexdigest()
    # 16 hex digits are the leading 8 bytes, most significant digit first.
    return int(digest_hex[:16], 16)
def lookup_city_geonames(
    country_code: str,
    city_name: str,
    db_path: Optional[Path] = None,
) -> Optional[Dict[str, Any]]:
    """Look up a city in the GeoNames SQLite database.

    Args:
        country_code: Value compared with the ``country_code`` column.
        city_name: City name, compared with both the ``name`` and the
            ``ascii_name`` column.
        db_path: Database file to query. Defaults to ``GEONAMES_DB``.

    Returns:
        A dict with the keys ``name``, ``ascii_name``, ``admin1_code``,
        ``admin1_name``, ``latitude``, ``longitude``, ``geonames_id``,
        ``population`` and ``feature_code`` for the most populous matching
        populated place, or ``None`` when the database file is missing,
        nothing matches, or the query fails (the error is printed).
    """
    database = GEONAMES_DB if db_path is None else Path(db_path)
    if not database.exists():
        return None

    columns = (
        'name', 'ascii_name', 'admin1_code', 'admin1_name',
        'latitude', 'longitude', 'geonames_id', 'population', 'feature_code',
    )
    # "= ? COLLATE NOCASE" keeps the comparison case-insensitive, like the
    # LIKE operator, but does not treat "%" and "_" in the city name as
    # wildcards, so only whole-name matches are returned.
    query = """
        SELECT name, ascii_name, admin1_code, admin1_name,
               latitude, longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = ?
          AND (name = ? COLLATE NOCASE OR ascii_name = ? COLLATE NOCASE)
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
    """

    conn = None
    try:
        conn = sqlite3.connect(str(database))
        row = conn.execute(query, (country_code, city_name, city_name)).fetchone()
    except sqlite3.Error as e:
        # Best effort: a failing lookup must not abort the caller.
        print(f"GeoNames lookup error: {e}")
        return None
    finally:
        # Close the connection on the error path as well.
        if conn is not None:
            conn.close()

    if row is None:
        return None
    return dict(zip(columns, row))
def generate_city_code(city_name: str) -> str:
    """Generate a short upper-case city code from a city name.

    Accented letters are reduced to their base letter (so "Zürich" behaves
    like "Zurich"), and everything that is not an ASCII letter or whitespace
    is then removed. A single-word name yields its first three letters; a
    multi-word name yields the initials of its first three words, so a
    two-word name gives a two-letter code.

    Returns:
        The code, or ``"XXX"`` when the name is empty or contains no usable
        letters.
    """
    import unicodedata

    if not city_name:
        return "XXX"

    # Split accented letters into base letter + combining mark and drop the
    # marks, so the ASCII filter below keeps the base letter.
    decomposed = unicodedata.normalize("NFKD", city_name)
    unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    words = re.sub(r'[^a-zA-Z\s]', '', unaccented).split()
    if not words:
        return "XXX"
    if len(words) == 1:
        # Single word: first 3 letters of the word itself, never of the
        # padded string.
        return words[0][:3].upper()
    # Multiple words: initials
    return ''.join(w[0].upper() for w in words[:3])
# GeoNames admin1 code -> region code, per country.
# Outer keys are the two-letter country codes used elsewhere in this script;
# inner keys are admin1 codes as strings (leading zeros are significant).
# get_region_code() returns 'XX' for an admin1 code missing from a listed
# country, and falls back to the raw admin1 code for countries not listed here.
# NOTE(review): the values are meant to be ISO 3166-2 subdivision codes, but the
# table is hand-maintained and several entries are flagged below - verify
# against the GeoNames admin1 code list before relying on it.
ADMIN1_TO_REGION = {
    # Netherlands (NL)
    'NL': {
        '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI',
        '06': 'NB', '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH',
        '15': 'OV', '16': 'FL',
    },
    # Germany (DE)
    'DE': {
        '01': 'SH', '02': 'HH', '03': 'NI', '04': 'HB', '05': 'NW',
        '06': 'HE', '07': 'RP', '08': 'BW', '09': 'BY', '10': 'SL',
        '11': 'BE', '12': 'BB', '13': 'MV', '14': 'SN', '15': 'ST', '16': 'TH',
    },
    # France (FR) - Régions
    'FR': {
        '11': 'IDF',  # Île-de-France (Paris)
        '24': 'CVL',  # Centre-Val de Loire
        '27': 'BFC',  # Bourgogne-Franche-Comté
        '28': 'NOR',  # Normandie
        '32': 'HDF',  # Hauts-de-France
        '44': 'GES',  # Grand Est
        '52': 'PDL',  # Pays de la Loire
        '53': 'BRE',  # Bretagne
        '75': 'NAQ',  # Nouvelle-Aquitaine
        '76': 'OCC',  # Occitanie
        '84': 'ARA',  # Auvergne-Rhône-Alpes
        '93': 'PAC',  # Provence-Alpes-Côte d'Azur
        '94': 'COR',  # Corse
        # Overseas
        '01': 'GP',  # Guadeloupe
        '02': 'MQ',  # Martinique
        '03': 'GF',  # Guyane
        '04': 'RE',  # La Réunion
        '06': 'YT',  # Mayotte
    },
    # United Kingdom (GB) - keyed by three-letter nation codes, not numbers
    'GB': {
        'ENG': 'EN',  # England
        'SCT': 'SC',  # Scotland
        'WLS': 'WA',  # Wales
        'NIR': 'NI',  # Northern Ireland
    },
    # Spain (ES) - Comunidades Autónomas
    'ES': {
        '01': 'AN',  # Andalucía
        '02': 'AR',  # Aragón
        '03': 'AS',  # Asturias
        '04': 'IB',  # Illes Balears
        '05': 'CN',  # Canarias
        '06': 'CB',  # Cantabria
        '07': 'CL',  # Castilla y León
        '08': 'CM',  # Castilla-La Mancha
        '09': 'CT',  # Cataluña
        '10': 'VC',  # Comunitat Valenciana
        '11': 'EX',  # Extremadura
        '12': 'GA',  # Galicia
        '13': 'MD',  # Madrid
        '14': 'MC',  # Murcia
        '15': 'NC',  # Navarra
        '16': 'PV',  # País Vasco
        '17': 'RI',  # La Rioja
        '18': 'CE',  # Ceuta
        '19': 'ML',  # Melilla
        # GeoNames alternate codes
        # NOTE(review): apart from Madrid, these map to province-level codes
        # while the rest of the table uses autonomous communities - confirm.
        '29': 'MD',  # Madrid (alternate)
        '34': 'AV',  # Ávila (Castilla y León)
        '55': 'SA',  # Salamanca
        '56': 'GI',  # Girona (Cataluña)
        '58': 'OR',  # Ourense (Galicia)
    },
    # Italy (IT) - Regioni
    'IT': {
        '01': 'PIE',  # Piemonte
        '02': 'VDA',  # Valle d'Aosta
        '03': 'LOM',  # Lombardia
        '04': 'TAA',  # Trentino-Alto Adige
        '05': 'VEN',  # Veneto
        '06': 'FVG',  # Friuli-Venezia Giulia
        '07': 'LIG',  # Liguria
        '08': 'EMR',  # Emilia-Romagna
        '09': 'TOS',  # Toscana
        '10': 'UMB',  # Umbria
        '11': 'MAR',  # Marche
        '12': 'LAZ',  # Lazio
        '13': 'ABR',  # Abruzzo
        '14': 'MOL',  # Molise
        '15': 'CAM',  # Campania
        '16': 'PUG',  # Puglia
        '17': 'BAS',  # Basilicata
        '18': 'CAL',  # Calabria
        '19': 'SIC',  # Sicilia
        '20': 'SAR',  # Sardegna
    },
    # Switzerland (CH) - Cantons
    'CH': {
        '01': 'AG',  # Aargau
        '02': 'AI',  # Appenzell Innerrhoden
        '03': 'AR',  # Appenzell Ausserrhoden
        '04': 'BE',  # Bern
        '05': 'BL',  # Basel-Landschaft
        '06': 'BS',  # Basel-Stadt
        '07': 'FR',  # Fribourg
        '08': 'GE',  # Genève
        '09': 'GL',  # Glarus
        '10': 'GR',  # Graubünden
        '11': 'JU',  # Jura
        '12': 'LU',  # Luzern
        '13': 'NE',  # Neuchâtel
        '14': 'NW',  # Nidwalden
        '15': 'OW',  # Obwalden
        '16': 'SG',  # St. Gallen
        '17': 'SH',  # Schaffhausen
        '18': 'SO',  # Solothurn
        '19': 'SZ',  # Schwyz
        '20': 'TG',  # Thurgau
        '21': 'TI',  # Ticino
        '22': 'UR',  # Uri
        '23': 'VD',  # Vaud
        '24': 'VS',  # Valais
        '25': 'ZG',  # Zug
        '26': 'ZH',  # Zürich
    },
    # Austria (AT) - Bundesländer
    'AT': {
        '01': 'B',  # Burgenland
        '02': 'K',  # Kärnten
        '03': 'NO',  # Niederösterreich
        '04': 'OO',  # Oberösterreich
        '05': 'S',  # Salzburg
        '06': 'ST',  # Steiermark
        '07': 'T',  # Tirol
        '08': 'V',  # Vorarlberg
        '09': 'W',  # Wien
    },
    # Belgium (BE) - regions and provinces; every key maps to itself
    'BE': {
        'BRU': 'BRU',  # Brussels-Capital Region
        'VLG': 'VLG',  # Flemish Region
        'WAL': 'WAL',  # Walloon Region
        'VAN': 'VAN',  # Antwerpen
        'VBR': 'VBR',  # Vlaams-Brabant
        'VLI': 'VLI',  # Limburg
        'VOV': 'VOV',  # Oost-Vlaanderen
        'VWV': 'VWV',  # West-Vlaanderen
        'WBR': 'WBR',  # Brabant wallon
        'WHT': 'WHT',  # Hainaut
        'WLG': 'WLG',  # Liège
        'WLX': 'WLX',  # Luxembourg
        'WNA': 'WNA',  # Namur
    },
    # Poland (PL) - Voivodeships
    'PL': {
        '72': 'DS',  # Dolnośląskie
        '73': 'KP',  # Kujawsko-pomorskie
        '74': 'LU',  # Lubelskie
        '75': 'LB',  # Lubuskie
        '76': 'LD',  # Łódzkie
        '77': 'MA',  # Małopolskie
        '78': 'MZ',  # Mazowieckie
        '79': 'OP',  # Opolskie
        '80': 'PK',  # Podkarpackie
        '81': 'PD',  # Podlaskie
        '82': 'PM',  # Pomorskie
        '83': 'SL',  # Śląskie
        '84': 'SK',  # Świętokrzyskie
        '85': 'WN',  # Warmińsko-mazurskie
        '86': 'WP',  # Wielkopolskie
        '87': 'ZP',  # Zachodniopomorskie
    },
    # South Korea (KR) - Provinces and Special Cities
    'KR': {
        '01': 'SO',  # Seoul
        '02': 'BS',  # Busan
        '03': 'DG',  # Daegu
        '04': 'IC',  # Incheon
        '05': 'GJ',  # Gwangju
        '06': 'DJ',  # Daejeon
        '07': 'US',  # Ulsan
        '08': 'GG',  # Gyeonggi-do
        '09': 'GW',  # Gangwon-do
        '10': 'CB',  # Chungcheongbuk-do
        '11': 'CN',  # Chungcheongnam-do
        '12': 'JB',  # Jeollabuk-do
        '13': 'JN',  # Jeollanam-do
        '14': 'GB',  # Gyeongsangbuk-do
        '15': 'GN',  # Gyeongsangnam-do
        '16': 'JJ',  # Jeju-do
        '17': 'SJ',  # Sejong
    },
    # Japan (JP) - Prefectures (partial, major ones)
    'JP': {
        '01': 'HKD',  # Hokkaido
        '02': 'AOM',  # Aomori
        '04': 'MYG',  # Miyagi
        '07': 'FKS',  # Fukushima
        '08': 'IBR',  # Ibaraki
        '09': 'TCG',  # Tochigi
        '10': 'GNM',  # Gunma
        '11': 'SIT',  # Saitama
        '12': 'CHB',  # Chiba
        '13': 'TKY',  # Tokyo
        '14': 'KNG',  # Kanagawa
        '15': 'NGT',  # Niigata
        '17': 'ISK',  # Ishikawa
        '20': 'NGN',  # Nagano
        '21': 'GIF',  # Gifu
        '22': 'SZO',  # Shizuoka
        '23': 'AIC',  # Aichi
        '24': 'MIE',  # Mie
        '25': 'SHG',  # Shiga
        '26': 'KYT',  # Kyoto
        '27': 'OSK',  # Osaka
        '28': 'HYG',  # Hyogo
        '29': 'NAR',  # Nara
        '30': 'WKY',  # Wakayama
        '31': 'TTR',  # Tottori
        '32': 'SMN',  # Shimane
        '33': 'OKY',  # Okayama
        '34': 'HRS',  # Hiroshima
        '35': 'YMG',  # Yamaguchi
        '36': 'TKS',  # Tokushima
        '37': 'KGW',  # Kagawa
        '38': 'EHM',  # Ehime
        '39': 'KOC',  # Kochi
        '40': 'FKO',  # Fukuoka
        '41': 'SAG',  # Saga
        '42': 'NGS',  # Nagasaki
        '43': 'KMM',  # Kumamoto
        '44': 'OIT',  # Oita
        '45': 'MYZ',  # Miyazaki
        '46': 'KGS',  # Kagoshima
        '47': 'OKN',  # Okinawa
    },
    # China (CN) - Provinces (partial)
    # NOTE(review): 'HN' is used for both Henan ('09') and Hunan ('11'), and
    # 'HB' for both Hebei ('10') and Hubei ('12'), so those pairs cannot be
    # told apart in a GHCID. 'HI' (labelled Heilongjiang) and 'HL' (labelled
    # Hainan) also look swapped - confirm against ISO 3166-2:CN.
    'CN': {
        '01': 'AH',  # Anhui
        '02': 'ZJ',  # Zhejiang
        '03': 'JX',  # Jiangxi
        '04': 'JS',  # Jiangsu
        '05': 'JL',  # Jilin
        '06': 'QH',  # Qinghai
        '07': 'FJ',  # Fujian
        '08': 'HI',  # Heilongjiang
        '09': 'HN',  # Henan
        '10': 'HB',  # Hebei
        '11': 'HN',  # Hunan
        '12': 'HB',  # Hubei
        '13': 'XZ',  # Tibet (Xizang)
        '14': 'XZ',  # Tibet (alternate)
        '15': 'GS',  # Gansu
        '16': 'GZ',  # Guizhou
        '18': 'SC',  # Sichuan
        '19': 'YN',  # Yunnan
        '20': 'HL',  # Hainan
        '21': 'TW',  # Taiwan (claimed)
        '22': 'BJ',  # Beijing
        '23': 'SH',  # Shanghai
        '25': 'NM',  # Inner Mongolia (Nei Mongol)
        '26': 'NX',  # Ningxia
        '28': 'XJ',  # Xinjiang
        '30': 'GD',  # Guangdong
        '31': 'HK',  # Hong Kong
        '32': 'MO',  # Macau
        '33': 'TJ',  # Tianjin
        '36': 'SX',  # Shaanxi
        '37': 'SD',  # Shandong
    },
    # Australia (AU) - States and Territories
    'AU': {
        '01': 'ACT',  # Australian Capital Territory
        '02': 'NSW',  # New South Wales
        '03': 'NT',  # Northern Territory
        '04': 'QLD',  # Queensland
        '05': 'SA',  # South Australia
        '06': 'TAS',  # Tasmania
        '07': 'VIC',  # Victoria
        '08': 'WA',  # Western Australia
    },
    # Canada (CA) - Provinces and Territories
    'CA': {
        '01': 'AB',  # Alberta
        '02': 'BC',  # British Columbia
        '03': 'MB',  # Manitoba
        '04': 'NB',  # New Brunswick
        '05': 'NL',  # Newfoundland and Labrador
        '07': 'NS',  # Nova Scotia
        '08': 'ON',  # Ontario
        '09': 'PE',  # Prince Edward Island
        '10': 'QC',  # Quebec
        '11': 'SK',  # Saskatchewan
        '12': 'NT',  # Northwest Territories
        '13': 'NU',  # Nunavut
        '14': 'YT',  # Yukon
    },
    # Brazil (BR) - States
    # NOTE(review): key '11' appears twice below. In a dict literal the later
    # entry wins, so '11' resolves to 'MS' and Maranhão ('MA') is unreachable.
    # Maranhão's real admin1 code needs to be confirmed and its key corrected.
    'BR': {
        '01': 'AC',  # Acre
        '02': 'AL',  # Alagoas
        '03': 'AP',  # Amapá
        '04': 'AM',  # Amazonas
        '05': 'BA',  # Bahia
        '06': 'CE',  # Ceará
        '07': 'DF',  # Distrito Federal
        '08': 'ES',  # Espírito Santo
        '29': 'GO',  # Goiás
        '11': 'MA',  # Maranhão (overwritten by the second '11' entry)
        '14': 'MT',  # Mato Grosso
        '11': 'MS',  # Mato Grosso do Sul
        '15': 'MG',  # Minas Gerais
        '16': 'PA',  # Pará
        '17': 'PB',  # Paraíba
        '18': 'PR',  # Paraná
        '19': 'PE',  # Pernambuco
        '20': 'PI',  # Piauí
        '21': 'RJ',  # Rio de Janeiro
        '22': 'RN',  # Rio Grande do Norte
        '23': 'RS',  # Rio Grande do Sul
        '24': 'RO',  # Rondônia
        '25': 'RR',  # Roraima
        '26': 'SC',  # Santa Catarina
        '27': 'SP',  # São Paulo
        '28': 'SE',  # Sergipe
        '31': 'TO',  # Tocantins
    },
    # India (IN) - States (partial, major ones)
    # NOTE(review): key '07' appears twice below. The later entry wins, so '07'
    # resolves to 'DL' and Gujarat ('GJ') is unreachable. The keys run in
    # alphabetical order of state name, which suggests they may not be real
    # GeoNames admin1 codes - verify the whole block.
    'IN': {
        '01': 'AP',  # Andhra Pradesh
        '02': 'AR',  # Arunachal Pradesh
        '03': 'AS',  # Assam
        '04': 'BR',  # Bihar
        '05': 'CT',  # Chhattisgarh
        '06': 'GA',  # Goa
        '07': 'GJ',  # Gujarat (overwritten by the second '07' entry)
        '08': 'HR',  # Haryana
        '09': 'HP',  # Himachal Pradesh
        '10': 'JK',  # Jammu and Kashmir
        '11': 'JH',  # Jharkhand
        '12': 'KA',  # Karnataka
        '13': 'KL',  # Kerala
        '14': 'MP',  # Madhya Pradesh
        '15': 'MH',  # Maharashtra
        '16': 'MN',  # Manipur
        '17': 'ML',  # Meghalaya
        '18': 'MZ',  # Mizoram
        '19': 'NL',  # Nagaland
        '20': 'OD',  # Odisha
        '21': 'PB',  # Punjab
        '22': 'RJ',  # Rajasthan
        '23': 'SK',  # Sikkim
        '24': 'TN',  # Tamil Nadu
        '25': 'TS',  # Telangana
        '26': 'TR',  # Tripura
        '27': 'UP',  # Uttar Pradesh
        '28': 'UK',  # Uttarakhand
        '29': 'WB',  # West Bengal
        '07': 'DL',  # Delhi (National Capital Territory)
        '36': 'VA',  # Varanasi region (approximation)
    },
    # Russia (RU) - Federal subjects (partial)
    'RU': {
        '01': 'AD',  # Adygea
        '48': 'MOW',  # Moscow
        '66': 'SPE',  # Saint Petersburg
        '47': 'MOS',  # Moscow Oblast
        '78': 'LEN',  # Leningrad Oblast
    },
    # Turkey (TR) - Provinces (partial)
    # NOTE(review): 'AN' is used for both Ankara ('06', '68') and Antalya
    # ('07'), so the two provinces share one region code - confirm intent.
    'TR': {
        '01': 'AD',  # Adana
        '06': 'AN',  # Ankara
        '07': 'AN',  # Antalya
        '16': 'BU',  # Bursa
        '32': 'IS',  # Istanbul
        '34': 'IS',  # Istanbul (alternate)
        '38': 'KA',  # Kayseri
        '68': 'AN',  # Ankara (alternate)
        '71': 'KO',  # Konya
    },
    # Sweden (SE) - Counties (län)
    'SE': {
        '01': 'AB',  # Stockholm
        '03': 'C',  # Uppsala
        '04': 'D',  # Södermanland
        '05': 'E',  # Östergötland
        '06': 'F',  # Jönköping
        '07': 'G',  # Kronoberg
        '08': 'H',  # Kalmar
        '09': 'I',  # Gotland
        '10': 'K',  # Blekinge
        '12': 'M',  # Skåne
        '13': 'N',  # Halland
        '14': 'O',  # Västra Götaland
        '17': 'S',  # Värmland
        '18': 'T',  # Örebro
        '19': 'U',  # Västmanland
        '20': 'W',  # Dalarna
        '21': 'X',  # Gävleborg
        '22': 'Y',  # Västernorrland
        '23': 'Z',  # Jämtland
        '24': 'AC',  # Västerbotten
        '25': 'BD',  # Norrbotten
        '26': 'AB',  # Stockholm (alternate)
    },
    # Norway (NO) - Counties (fylker)
    'NO': {
        '02': 'VK',  # Viken
        '03': 'OS',  # Oslo
        '09': 'AG',  # Agder
        '11': 'RO',  # Rogaland
        '12': 'VL',  # Vestland
        '15': 'MR',  # Møre og Romsdal
        '18': 'NO',  # Nordland
        '50': 'TR',  # Trøndelag
        '54': 'TF',  # Troms og Finnmark
    },
    # Denmark (DK) - Regions
    'DK': {
        '17': 'HS',  # Hovedstaden
        '18': 'MJ',  # Midtjylland
        '19': 'NJ',  # Nordjylland
        '20': 'SJ',  # Sjælland
        '21': 'SD',  # Syddanmark
    },
    # Finland (FI) - Regions (maakunta)
    'FI': {
        '01': 'UU',  # Uusimaa (Helsinki)
        '02': 'VS',  # Varsinais-Suomi
        '03': 'SA',  # Satakunta
        '05': 'PI',  # Pirkanmaa
        '06': 'PH',  # Päijät-Häme
        '07': 'KY',  # Kymenlaakso
        '08': 'EK',  # Etelä-Karjala
        '09': 'ES',  # Etelä-Savo
        '10': 'PS',  # Pohjois-Savo
        '11': 'PK',  # Pohjois-Karjala
        '12': 'KE',  # Keski-Suomi
        '13': 'EP',  # Etelä-Pohjanmaa
        '14': 'PO',  # Pohjanmaa
        '15': 'KP',  # Keski-Pohjanmaa
        '16': 'PP',  # Pohjois-Pohjanmaa
        '17': 'KA',  # Kainuu
        '18': 'LA',  # Lappi
        '19': 'AX',  # Åland
    },
    # Ireland (IE) - Provinces; every key maps to itself
    'IE': {
        'C': 'C',  # Connacht
        'L': 'L',  # Leinster
        'M': 'M',  # Munster
        'U': 'U',  # Ulster (IE portion)
    },
    # Czech Republic (CZ) - Regions (kraje)
    'CZ': {
        '52': 'PR',  # Prague
        '78': 'JM',  # Jihomoravský (South Moravian)
        '79': 'OL',  # Olomoucký
        '80': 'MS',  # Moravskoslezský
        '81': 'PL',  # Plzeňský
        '82': 'KA',  # Karlovarský
        '83': 'UL',  # Ústecký
        '84': 'LI',  # Liberecký
        '85': 'HK',  # Královéhradecký
        '86': 'PA',  # Pardubický
        '87': 'VY',  # Vysočina
        '88': 'SC',  # Středočeský
        '89': 'JC',  # Jihočeský
        '90': 'ZL',  # Zlínský
    },
    # Hungary (HU) - Counties (megye)
    'HU': {
        '01': 'BK',  # Bács-Kiskun
        '02': 'BA',  # Baranya
        '03': 'BE',  # Békés
        '04': 'BZ',  # Borsod-Abaúj-Zemplén
        '05': 'BU',  # Budapest
        '06': 'CS',  # Csongrád-Csanád
        '07': 'FE',  # Fejér
        '08': 'GS',  # Győr-Moson-Sopron
        '09': 'HB',  # Hajdú-Bihar
        '10': 'HE',  # Heves
        '11': 'JN',  # Jász-Nagykun-Szolnok
        '12': 'KO',  # Komárom-Esztergom
        '13': 'NO',  # Nógrád
        '14': 'PE',  # Pest
        '15': 'SO',  # Somogy
        '16': 'SZ',  # Szabolcs-Szatmár-Bereg
        '17': 'TO',  # Tolna
        '18': 'VA',  # Vas
        '19': 'VE',  # Veszprém
        '20': 'ZA',  # Zala
    },
    # Portugal (PT) - Districts
    'PT': {
        '01': 'AV',  # Aveiro
        '02': 'BE',  # Beja
        '03': 'BR',  # Braga
        '04': 'BG',  # Bragança
        '05': 'CB',  # Castelo Branco
        '06': 'CO',  # Coimbra
        '07': 'EV',  # Évora
        '08': 'FA',  # Faro
        '09': 'GU',  # Guarda
        '10': 'LE',  # Leiria
        '11': 'LI',  # Lisboa
        '12': 'PO',  # Portalegre
        '13': 'PT',  # Porto
        '14': 'LI',  # Lisboa (alternate)
        '15': 'SA',  # Santarém
        '16': 'SE',  # Setúbal
        '17': 'VC',  # Viana do Castelo
        '18': 'VR',  # Vila Real
        '19': 'VI',  # Viseu
        '20': 'AC',  # Açores
        '21': 'MA',  # Madeira
    },
    # Mexico (MX) - States
    'MX': {
        '01': 'AG',  # Aguascalientes
        '02': 'BC',  # Baja California
        '03': 'BS',  # Baja California Sur
        '04': 'CM',  # Campeche
        '05': 'CO',  # Coahuila
        '06': 'CL',  # Colima
        '07': 'CS',  # Chiapas
        '08': 'CH',  # Chihuahua
        '09': 'CMX',  # Ciudad de México
        '10': 'DG',  # Durango
        '11': 'GT',  # Guanajuato
        '12': 'GR',  # Guerrero
        '13': 'HG',  # Hidalgo
        '14': 'JA',  # Jalisco
        '15': 'EM',  # Estado de México
        '16': 'MI',  # Michoacán
        '17': 'MO',  # Morelos
        '18': 'NA',  # Nayarit
        '19': 'NL',  # Nuevo León
        '20': 'OA',  # Oaxaca
        '21': 'PU',  # Puebla
        '22': 'QT',  # Querétaro
        '23': 'QR',  # Quintana Roo
        '24': 'SL',  # San Luis Potosí
        '25': 'SI',  # Sinaloa
        '26': 'SO',  # Sonora
        '27': 'TB',  # Tabasco
        '28': 'TM',  # Tamaulipas
        '29': 'TL',  # Tlaxcala
        '30': 'VE',  # Veracruz
        '31': 'YU',  # Yucatán
        '32': 'ZA',  # Zacatecas
    },
    # Argentina (AR) - Provinces
    'AR': {
        '01': 'BA',  # Buenos Aires Province
        '02': 'CA',  # Catamarca
        '03': 'CH',  # Chaco
        '04': 'CT',  # Chubut
        '05': 'CB',  # Córdoba
        '06': 'CR',  # Corrientes
        '07': 'CF',  # Ciudad Autónoma de Buenos Aires
        '08': 'ER',  # Entre Ríos
        '09': 'FM',  # Formosa
        '10': 'JY',  # Jujuy
        '11': 'LP',  # La Pampa
        '12': 'LR',  # La Rioja
        '13': 'MZ',  # Mendoza
        '14': 'MN',  # Misiones
        '15': 'NQ',  # Neuquén
        '16': 'RN',  # Río Negro
        '17': 'SA',  # Salta
        '18': 'SJ',  # San Juan
        '19': 'SL',  # San Luis
        '20': 'SC',  # Santa Cruz
        '21': 'SF',  # Santa Fe
        '22': 'SE',  # Santiago del Estero
        '23': 'TF',  # Tierra del Fuego
        '24': 'TM',  # Tucumán
    },
    # New Zealand (NZ) - Regions (partial; keys are alphanumeric admin1 codes)
    'NZ': {
        'G2': 'WGN',  # Wellington
        'F7': 'AUK',  # Auckland
        'E7': 'WKO',  # Waikato
        'F4': 'BOP',  # Bay of Plenty
        'G1': 'TAS',  # Tasman
        'F6': 'MWT',  # Manawatū-Whanganui
        'F3': 'HKB',  # Hawke's Bay
        'E8': 'CAN',  # Canterbury
        'F9': 'OTA',  # Otago
        'G3': 'STL',  # Southland
    },
    # South Africa (ZA) - Provinces
    'ZA': {
        '01': 'EC',  # Eastern Cape
        '02': 'FS',  # Free State
        '03': 'GT',  # Gauteng
        '04': 'KZN',  # KwaZulu-Natal
        '05': 'LP',  # Limpopo
        '06': 'MP',  # Mpumalanga
        '07': 'NC',  # Northern Cape
        '08': 'NW',  # North West
        '09': 'WC',  # Western Cape
        '11': 'WC',  # Western Cape (alternate - Cape Town)
    },
}
def get_region_code(country_code: str, admin1_code: str) -> str:
    """Translate a GeoNames admin1 code into a region code.

    For a country listed in ``ADMIN1_TO_REGION`` the mapped value is returned,
    or ``'XX'`` when the admin1 code has no entry. For any other country the
    first two characters of the admin1 code, upper-cased, are returned as they
    are, or ``'XX'`` when the admin1 code is empty.
    """
    country_table = ADMIN1_TO_REGION.get(country_code)
    if country_table is not None:
        return country_table.get(admin1_code, 'XX')

    # No table for this country: fall back to the raw admin1 code.
    if not admin1_code:
        return 'XX'
    return admin1_code[:2].upper()
def _strip_none(value):
    """Recursively drop dict entries whose value is None; list items are kept."""
    if isinstance(value, dict):
        return {k: _strip_none(v) for k, v in value.items() if v is not None}
    if isinstance(value, list):
        return [_strip_none(item) for item in value]
    return value


def _parse_wkt_point(coordinates: str) -> Optional[Tuple[float, float]]:
    """Parse a ``Point(lon lat)`` literal into ``(lat, lon)``; None if it is malformed."""
    match = re.match(r'Point\(([^\s]+)\s+([^\)]+)\)', coordinates)
    if not match:
        return None
    try:
        lon, lat = float(match.group(1)), float(match.group(2))
    except ValueError:
        return None
    return lat, lon


def _existing_wikidata_id(path: Path) -> Optional[str]:
    """Return the Wikidata ID stored in an existing custodian file, if it can be read."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            existing = yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        return None
    if not isinstance(existing, dict):
        return None
    for section, key in (('original_entry', 'wikidata_id'),
                         ('wikidata_enrichment', 'wikidata_entity_id')):
        block = existing.get(section)
        if isinstance(block, dict) and block.get(key):
            return block[key]
    return None


def create_custodian_yaml(wikidata_id: str, info: Dict[str, Any], dry_run: bool = False) -> Optional[Path]:
    """Create a custodian YAML file from MoW-derived Wikidata data.

    Args:
        wikidata_id: Wikidata entity ID of the institution.
        info: Enriched record. The keys read are ``name``, ``country``,
            ``city``, ``coordinates`` (a ``Point(lon lat)`` string),
            ``instance_of``, ``all_types`` and ``inscriptions`` (a list of
            dicts with ``wikidata_id``, ``name`` and ``country``).
        dry_run: When true, report the file name but write nothing.

    Returns:
        The path of the file that was (or, in a dry run, would be) written, or
        ``None`` when the record is skipped: it has no name, a file for the
        same Wikidata entity already exists, or every candidate file name is
        already taken.

    The GHCID is ``<country>-<region>-<city>-<type>-<abbreviation>``. If a file
    with that name already exists for a different entity, a suffix derived
    from the name is appended, and all derived identifiers (UUIDs and numeric
    ID) are computed from the suffixed GHCID so they stay unique per record.
    ``ghcid_original`` keeps the unsuffixed value.
    """
    name = info.get('name', '')
    country = info.get('country', '')
    city = info.get('city', '')
    coordinates = info.get('coordinates', '')
    instance_of = info.get('instance_of', '')
    # A key that is present but null must behave like a missing key.
    all_types = info.get('all_types') or []
    inscriptions = info.get('inscriptions') or []

    if not name:
        print(f"  Skipping {wikidata_id}: no name")
        return None

    # Get country code
    country_code = get_country_code(country)
    if country_code == "XX":
        print(f"  Warning: Unknown country '{country}' for {wikidata_id}")

    # Get institution type and abbreviation
    inst_type = detect_institution_type(instance_of, all_types)
    abbrev = generate_abbreviation(name)

    # Look up city in GeoNames
    region_code = "XX"
    city_code = "XXX"
    geonames_data = None
    if city and country_code != "XX":
        geonames_data = lookup_city_geonames(country_code, city)
        if geonames_data:
            region_code = get_region_code(country_code, geonames_data.get('admin1_code', ''))
            # Fall back to the Wikidata label when GeoNames has no ASCII name.
            city_code = generate_city_code(geonames_data.get('ascii_name') or city)
        else:
            city_code = generate_city_code(city)

    # Generate GHCID and resolve file-name collisions before deriving any ID.
    base_ghcid = f"{country_code}-{region_code}-{city_code}-{inst_type}-{abbrev}"
    ghcid = base_ghcid
    history_reason = 'Initial GHCID from UNESCO MoW Wikidata data (Dec 2025)'
    filepath = CUSTODIAN_DIR / f"{ghcid}.yaml"
    if filepath.exists():
        if _existing_wikidata_id(filepath) == wikidata_id:
            # Same entity as a previous run: do not create a duplicate record.
            print(f"  Skipping {wikidata_id}: already exists as {filepath.name}")
            return None
        # Add name suffix; use the Wikidata ID when the name has no ASCII
        # letters or digits to build a suffix from.
        name_suffix = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')[:50]
        if not name_suffix:
            name_suffix = wikidata_id.lower()
        ghcid = f"{base_ghcid}-{name_suffix}"
        history_reason = 'Name suffix added to resolve GHCID collision'
        filepath = CUSTODIAN_DIR / f"{ghcid}.yaml"
        if filepath.exists():
            # Never overwrite an existing record silently.
            print(f"  Skipping {wikidata_id}: {filepath.name} already exists")
            return None
    filename = filepath.name

    # Identifiers are derived from the final GHCID so that two records that
    # collided on the base GHCID do not share them.
    ghcid_uuid = generate_ghcid_uuid(ghcid)
    ghcid_sha256 = generate_ghcid_sha256(ghcid)
    ghcid_numeric = generate_ghcid_numeric(ghcid)
    # uuid7 not in standard library; use uuid4 for record_id (time-ordered not required for DB ID)
    record_id = str(uuid.uuid4())
    timestamp = datetime.now(timezone.utc).isoformat()

    # Build the custodian YAML structure
    custodian = {
        'original_entry': {
            'name': name,
            'source': 'UNESCO Memory of the World (via Wikidata)',
            'wikidata_id': wikidata_id,
            'mow_inscriptions': [
                {
                    'wikidata_id': insc.get('wikidata_id'),
                    'name': insc.get('name'),
                    'country': insc.get('country'),
                }
                for insc in inscriptions
            ],
        },
        'entry_index': None,
        'processing_timestamp': timestamp,
        'wikidata_enrichment': {
            'wikidata_entity_id': wikidata_id,
            'wikidata_label_en': name,
            'wikidata_description_en': "Heritage institution holding UNESCO Memory of the World inscribed documents",
            'instance_of': instance_of,
            'all_types': all_types if all_types else None,
        },
        'ghcid': {
            'ghcid_current': ghcid,
            'ghcid_original': base_ghcid,
            'ghcid_uuid': ghcid_uuid,
            'ghcid_uuid_sha256': ghcid_sha256,
            'ghcid_numeric': ghcid_numeric,
            'record_id': record_id,
            'generation_timestamp': timestamp,
            'ghcid_history': [
                {
                    'ghcid': ghcid,
                    'ghcid_numeric': ghcid_numeric,
                    'valid_from': timestamp,
                    'valid_to': None,
                    'reason': history_reason,
                }
            ],
            'location_resolution': {
                'method': 'WIKIDATA_LOCATION',
                'country_code': country_code,
                'country_label': country,
                'region_code': region_code,
                'city_code': city_code,
                'city_label': city,
            },
        },
        'custodian_name': {
            'claim_type': 'custodian_name',
            'claim_value': name,
            'source_type': 'wikidata',
        },
        'unesco_mow_enrichment': {
            'is_mow_custodian': True,
            'inscription_count': len(inscriptions),
            'inscriptions': [
                {
                    'wikidata_id': insc.get('wikidata_id'),
                    'name': insc.get('name'),
                    'inscription_country': insc.get('country'),
                }
                for insc in inscriptions
            ],
            'enrichment_timestamp': timestamp,
            'data_source': 'Wikidata SPARQL (UNESCO has no MoW API)',
        },
    }

    # Add GeoNames data if available
    if geonames_data:
        custodian['ghcid']['location_resolution'].update({
            'geonames_id': geonames_data['geonames_id'],
            'geonames_name': geonames_data['name'],
            'feature_code': geonames_data['feature_code'],
            'population': geonames_data['population'],
            'admin1_code': geonames_data['admin1_code'],
        })
        custodian['ghcid']['geonames_id'] = geonames_data['geonames_id']

    # Add coordinates if available ("Point(lon lat)" format)
    if coordinates:
        point = _parse_wkt_point(coordinates)
        if point is not None:
            lat, lon = point
            custodian['ghcid']['location_resolution']['source_coordinates'] = {
                'latitude': lat,
                'longitude': lon,
                'source': 'wikidata',
            }
        else:
            # A malformed coordinate should not cost us the whole record.
            print(f"  Warning: Unparseable coordinates '{coordinates}' for {wikidata_id}")

    # Remove None values (this also drops 'entry_index' and 'valid_to')
    custodian = _strip_none(custodian)

    if dry_run:
        print(f"  Would create: {filename}")
        return filepath

    # Write YAML file
    CUSTODIAN_DIR.mkdir(parents=True, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(custodian, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"  Created: {filename}")
    return filepath
def main():
    """Command-line entry point: create custodian files for all MoW custodians.

    Reads ``MOW_ENRICHED_FILE`` (a JSON object keyed by Wikidata ID), calls
    ``create_custodian_yaml`` for each entry and prints a summary.

    Returns:
        ``0`` after processing, or ``1`` when the input file is missing or
        cannot be used.
    """
    parser = argparse.ArgumentParser(description='Create custodian files from MoW Wikidata data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be created')
    parser.add_argument('--limit', type=int, help='Limit number of files to create')
    args = parser.parse_args()

    # A negative limit would silently drop entries from the end of the list.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    # Load enriched MoW custodians
    if not MOW_ENRICHED_FILE.exists():
        print(f"Error: {MOW_ENRICHED_FILE} not found. Run the enrichment query first.")
        return 1

    # Institution names are not ASCII-only, so the encoding must not depend
    # on the platform default.
    try:
        with open(MOW_ENRICHED_FILE, 'r', encoding='utf-8') as f:
            custodians = json.load(f)
    except (OSError, ValueError) as e:
        print(f"Error: could not read {MOW_ENRICHED_FILE}: {e}")
        return 1

    if not isinstance(custodians, dict):
        print(f"Error: {MOW_ENRICHED_FILE} must contain a JSON object keyed by Wikidata ID.")
        return 1

    print(f"Loaded {len(custodians)} MoW custodians")

    if args.dry_run:
        print("DRY RUN - no files will be created")

    entries = list(custodians.items())
    if args.limit is not None:
        # The limit applies to entries processed, including skipped ones.
        entries = entries[:args.limit]

    # Create custodian files
    created = 0
    skipped = 0
    errors = 0

    for wikidata_id, info in entries:
        try:
            result = create_custodian_yaml(wikidata_id, info, dry_run=args.dry_run)
        except Exception as e:
            # Top-level boundary: one bad record must not stop the batch.
            print(f"  Error processing {wikidata_id}: {e}")
            errors += 1
        else:
            if result:
                created += 1
            else:
                skipped += 1

    print(f"\nSummary:")
    print(f"  Created: {created}")
    print(f"  Skipped: {skipped}")
    print(f"  Errors: {errors}")

    return 0


if __name__ == "__main__":
    # SystemExit works without the site-provided exit() helper.
    raise SystemExit(main())