#!/usr/bin/env python3
"""
Create custodian YAML files from UNESCO Memory of the World (MoW) derived institutions.

This script creates new custodian entries for heritage institutions that hold
MoW-inscribed documentary heritage but don't yet exist in our custodian database.

The MoW data was sourced from Wikidata (UNESCO has no public MoW API).

Usage:
    python scripts/create_mow_custodians.py [--dry-run] [--limit N]

Data Sources:
    - /tmp/mow_custodians_enriched.json - Wikidata-enriched MoW custodian data
    - data/reference/geonames.db (relative to the project root) - GeoNames
      database for location resolution
"""
import argparse
import hashlib
import json
import os
import re
import sqlite3
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

import yaml
# Paths
# Resolve the script path first: with a bare relative ``__file__`` (for example
# when started from inside scripts/), ``Path(__file__).parent.parent`` would
# collapse to "." and point at the wrong directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Directory holding the custodian YAML files.
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
# SQLite GeoNames database used for city/region resolution.
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
# Wikidata-enriched MoW custodian data (JSON input of this script).
MOW_ENRICHED_FILE = Path("/tmp/mow_custodians_enriched.json")
# Country code mapping (Wikidata labels to ISO 3166-1 alpha-2)
# Includes territories, historical names, and alternate spellings.
# Lookups are by exact label; labels not listed here are reported as "XX"
# by get_country_code().
COUNTRY_CODE_MAP = {
    # Major countries
    "Germany": "DE", "France": "FR", "South Korea": "KR", "United Kingdom": "GB",
    "People's Republic of China": "CN", "China": "CN", "Switzerland": "CH",
    "Spain": "ES", "United States of America": "US", "United States": "US",
    "Brazil": "BR", "India": "IN", "Iran": "IR", "Poland": "PL", "Turkey": "TR",
    "Sweden": "SE", "Italy": "IT", "Japan": "JP", "Canada": "CA", "Russia": "RU",
    "Czech Republic": "CZ", "Czechia": "CZ", "Mexico": "MX", "Austria": "AT",
    "Portugal": "PT", "Netherlands": "NL", "Belgium": "BE", "Denmark": "DK",
    "Norway": "NO", "Finland": "FI", "Australia": "AU", "New Zealand": "NZ",
    "Argentina": "AR", "Chile": "CL", "Colombia": "CO", "Peru": "PE",
    "Venezuela": "VE", "Egypt": "EG", "South Africa": "ZA", "Israel": "IL",
    "Indonesia": "ID", "Malaysia": "MY", "Thailand": "TH", "Vietnam": "VN",
    "Philippines": "PH", "Singapore": "SG", "Taiwan": "TW", "Hong Kong": "HK",
    "Pakistan": "PK", "Bangladesh": "BD", "Sri Lanka": "LK", "Nepal": "NP",
    "Greece": "GR", "Hungary": "HU", "Romania": "RO", "Bulgaria": "BG",
    "Slovakia": "SK", "Slovenia": "SI", "Croatia": "HR", "Serbia": "RS",
    "Ukraine": "UA", "Belarus": "BY", "Lithuania": "LT", "Latvia": "LV",
    # Scotland and Wales are folded into the United Kingdom code.
    "Estonia": "EE", "Ireland": "IE", "Scotland": "GB", "Wales": "GB",
    "Morocco": "MA", "Tunisia": "TN", "Algeria": "DZ", "Nigeria": "NG",
    "Kenya": "KE", "Ghana": "GH", "Ethiopia": "ET", "Tanzania": "TZ",
    "Saudi Arabia": "SA", "United Arab Emirates": "AE", "Qatar": "QA",
    "Kuwait": "KW", "Bahrain": "BH", "Oman": "OM", "Jordan": "JO",
    "Lebanon": "LB", "Syria": "SY", "Iraq": "IQ", "Yemen": "YE",
    "Afghanistan": "AF", "Kazakhstan": "KZ", "Uzbekistan": "UZ",
    "Turkmenistan": "TM", "Tajikistan": "TJ", "Kyrgyzstan": "KG",
    "Azerbaijan": "AZ", "Armenia": "AM", "Georgia": "GE", "Mongolia": "MN",
    "North Korea": "KP", "Cuba": "CU", "Jamaica": "JM", "Haiti": "HT",
    "Dominican Republic": "DO", "Puerto Rico": "PR", "Trinidad and Tobago": "TT",
    "Barbados": "BB", "Guatemala": "GT", "Honduras": "HN",
    "El Salvador": "SV", "Nicaragua": "NI", "Costa Rica": "CR", "Panama": "PA",
    "Ecuador": "EC", "Bolivia": "BO", "Paraguay": "PY", "Uruguay": "UY",
    "Luxembourg": "LU", "Liechtenstein": "LI", "Monaco": "MC", "Andorra": "AD",
    "San Marino": "SM", "Vatican City": "VA", "Malta": "MT", "Cyprus": "CY",
    "Iceland": "IS", "Greenland": "GL", "Faroe Islands": "FO",
    "North Macedonia": "MK", "Bosnia and Herzegovina": "BA", "Albania": "AL",
    # NOTE(review): "XK" for Kosovo is a user-assigned code, not an officially
    # assigned ISO 3166-1 entry - confirm downstream consumers accept it.
    "Montenegro": "ME", "Kosovo": "XK", "Moldova": "MD",
    "Democratic Republic of the Congo": "CD", "Republic of the Congo": "CG",
    "Cameroon": "CM", "Senegal": "SN", "Mali": "ML", "Burkina Faso": "BF",
    "Niger": "NE", "Chad": "TD", "Sudan": "SD", "South Sudan": "SS",
    "Uganda": "UG", "Rwanda": "RW", "Burundi": "BI", "Malawi": "MW",
    "Zambia": "ZM", "Zimbabwe": "ZW", "Botswana": "BW", "Namibia": "NA",
    "Mozambique": "MZ", "Madagascar": "MG", "Mauritius": "MU",
    "Seychelles": "SC", "Comoros": "KM", "Réunion": "RE",
    "Cambodia": "KH", "Laos": "LA", "Myanmar": "MM", "Brunei": "BN",
    "East Timor": "TL", "Timor-Leste": "TL", "Papua New Guinea": "PG",
    "Fiji": "FJ", "Samoa": "WS", "Tonga": "TO", "Vanuatu": "VU",
    "New Caledonia": "NC", "French Polynesia": "PF", "Guam": "GU",

    # Further countries and territories, alphabetical (mostly Caribbean,
    # plus a few African states) (ISO 3166-1 alpha-2)
    "Angola": "AO",
    "Anguilla": "AI",
    "Antigua and Barbuda": "AG",
    "Aruba": "AW",
    "Bahamas": "BS", "The Bahamas": "BS",
    "Belize": "BZ",
    "Benin": "BJ",
    "Cape Verde": "CV", "Cabo Verde": "CV",
    "Curaçao": "CW", "Curacao": "CW",
    "Guyana": "GY",
    "Montserrat": "MS",
    "Saint Lucia": "LC", "St. Lucia": "LC", "St Lucia": "LC",
    "Sint Maarten": "SX",
    "Suriname": "SR",

    # Historical/dissolved entities (map to successor or use special codes)
    # Netherlands Antilles dissolved in 2010 - use CW (Curaçao) as primary successor
    "Netherlands Antilles": "CW",

    # Additional Caribbean
    "Dominica": "DM",
    "Grenada": "GD",
    "Saint Kitts and Nevis": "KN", "St. Kitts and Nevis": "KN",
    "Saint Vincent and the Grenadines": "VC", "St. Vincent and the Grenadines": "VC",
    "British Virgin Islands": "VG",
    "U.S. Virgin Islands": "VI", "US Virgin Islands": "VI",
    "Cayman Islands": "KY",
    "Turks and Caicos Islands": "TC",
    "Bermuda": "BM",

    # Additional Africa
    "Ivory Coast": "CI", "Côte d'Ivoire": "CI",
    "Gabon": "GA",
    "Equatorial Guinea": "GQ",
    "Guinea": "GN",
    "Guinea-Bissau": "GW",
    "Liberia": "LR",
    "Sierra Leone": "SL",
    "Togo": "TG",
    "Central African Republic": "CF",
    "Eritrea": "ER",
    "Djibouti": "DJ",
    "Somalia": "SO",
    "Lesotho": "LS",
    "Eswatini": "SZ", "Swaziland": "SZ",
    "São Tomé and Príncipe": "ST",
    "Gambia": "GM", "The Gambia": "GM",
    "Mauritania": "MR",

    # Additional Asia/Pacific
    "Bhutan": "BT",
    "Maldives": "MV",
    "Solomon Islands": "SB",
    "Kiribati": "KI",
    "Marshall Islands": "MH",
    "Micronesia": "FM", "Federated States of Micronesia": "FM",
    "Nauru": "NR",
    "Palau": "PW",
    "Tuvalu": "TV",
    "Cook Islands": "CK",
    "Niue": "NU",
    "Tokelau": "TK",
    "American Samoa": "AS",
    "Northern Mariana Islands": "MP",
    "Macau": "MO", "Macao": "MO",
}
# Institution type mapping based on Wikidata instance_of labels
# Keys are keywords, values are single-letter institution type codes.
# Insertion order matters: detect_institution_type() walks this dict in order
# and returns on the first keyword contained in the label, so e.g. "library"
# must stay ahead of "university" for "university library" to resolve to "L".
INSTANCE_TYPE_MAP = {
    "archive": "A", "archives": "A", "national archive": "A", "national archives": "A",
    "state archive": "A", "state archives": "A", "city archive": "A", "municipal archive": "A",
    "library": "L", "national library": "L", "public library": "L", "university library": "L",
    "research library": "L", "academic library": "L", "special library": "L",
    "museum": "M", "art museum": "M", "history museum": "M", "national museum": "M",
    "cultural institution": "M", "heritage institution": "M",
    "gallery": "G", "art gallery": "G",
    "university": "E", "college": "E", "research institute": "R", "research center": "R",
    # "NGO" is kept for backward compatibility, but as an upper-case key it
    # cannot be found inside a lower-cased label; the spelled-out lower-case
    # form is what actually matches.
    "foundation": "N", "non-profit organization": "N", "NGO": "N",
    "non-governmental organization": "N",
    "government agency": "O", "government organization": "O", "public body": "O",
    "religious organization": "H", "church": "H", "monastery": "H", "abbey": "H",
    "cathedral": "H", "temple": "H", "mosque": "H", "synagogue": "H",
    "botanical garden": "B", "zoo": "B", "zoological garden": "B",
    "television station": "C", "broadcasting company": "C", "newspaper": "C",
}
# Skipped words for abbreviation generation
# Lower-case articles, prepositions and conjunctions that do not contribute an
# initial. Grouped roughly by language; a word shared by several languages is
# listed only once (under the first group it appears in).
# NOTE(review): Italian "di" and Spanish/Italian "una"/"uno" are not listed -
# confirm whether that is intentional before adding them, since it changes
# the abbreviations generated for new records.
SKIP_WORDS = {
    # Dutch
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    "'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
    'door', 'en', 'of',
    # English
    'a', 'an', 'the', 'at', 'on', 'to', 'for',
    'with', 'from', 'by', 'as', 'under', 'and', 'or', 'but',
    # French
    'le', 'la', 'les',
    'un', 'une', 'du', 'à', 'au', 'aux', 'dans', 'sur', 'sous',
    'pour', 'par', 'avec', "l'", 'et', 'ou',
    # German
    'die', 'das',
    'dem', 'ein', 'eine', 'einer', 'einem', 'einen', 'von', 'zu', 'für',
    'mit', 'bei', 'nach', 'aus', 'vor', 'über', 'unter', 'durch', 'und', 'oder',
    # Spanish
    'el', 'los', 'las', 'unos', 'unas', 'del', 'al', 'con', 'por', 'para',
    'sobre', 'bajo', 'y', 'o', 'e', 'u',
    # Portuguese
    'os', 'um', 'uma', 'uns',
    'umas', 'do', 'da', 'dos', 'em', 'no', 'na', 'nos', 'nas', 'com',
    'sob',
    # Italian
    'il', 'lo', 'i', 'gli', 'dello', 'della', 'dei', 'degli', 'delle',
    'allo', 'alla', 'ai', 'agli', 'alle', 'dal', 'dallo', 'dalla', 'dai',
    'dagli', 'dalle', 'nel', 'nello', 'nella', 'nei', 'negli', 'nelle',
    'sul', 'sullo', 'sulla', 'sui', 'sugli', 'sulle', 'per', 'tra', 'fra', 'ed', 'od',
}
def get_country_code(country_label: str) -> str:
    """Convert a country label to its ISO 3166-1 alpha-2 code.

    The label is looked up in ``COUNTRY_CODE_MAP``: first verbatim, then
    ignoring surrounding whitespace and letter case.

    Args:
        country_label: Country name, e.g. a Wikidata label.

    Returns:
        The two-letter code, or ``"XX"`` when the label is empty, not a
        string, or unknown.
    """
    if not country_label or not isinstance(country_label, str):
        return "XX"

    # Fast path: exact label match.
    exact = COUNTRY_CODE_MAP.get(country_label)
    if exact is not None:
        return exact

    # Fallback: tolerate stray whitespace and case differences in source data.
    wanted = country_label.strip().casefold()
    for label, iso_code in COUNTRY_CODE_MAP.items():
        if label.casefold() == wanted:
            return iso_code
    return "XX"
def detect_institution_type(instance_of: str, all_types: Optional[List[str]] = None) -> str:
    """Detect the institution type code from Wikidata instance_of labels.

    Labels are checked in order (``instance_of`` first, then ``all_types``).
    For each label the keywords of ``INSTANCE_TYPE_MAP`` are tried in dict
    order using a case-insensitive substring test; the first hit wins.

    Args:
        instance_of: Primary instance_of label (may be empty).
        all_types: Optional further type labels.

    Returns:
        The single-letter type code of the first matching keyword, or ``"A"``
        when nothing matches.
    """
    # Unpacking accepts any iterable for all_types, not just a list.
    candidates = [instance_of, *(all_types or [])]

    for label in candidates:
        # Skip empty entries and anything that is not text.
        if not label or not isinstance(label, str):
            continue
        lowered = label.lower()
        for keyword, code in INSTANCE_TYPE_MAP.items():
            # Lower-case the keyword too, otherwise upper-case keys such as
            # "NGO" could never be found inside the lower-cased label.
            if keyword.lower() in lowered:
                return code

    # Default to Archive for MoW custodians (they hold documentary heritage)
    return "A"
def generate_abbreviation(name: str) -> str:
    """Generate an institution abbreviation from its name.

    The name is split on whitespace, hyphens and slashes. Words found in
    ``SKIP_WORDS`` (compared lower-case) and purely numeric words are
    dropped, and the upper-cased initial of up to ten remaining words is
    concatenated. If every word was dropped, the first three words are used.

    Args:
        name: Institution name.

    Returns:
        The abbreviation; ``"UNK"`` for an empty or blank name. When fewer
        than two initials are found, the first three alphanumeric characters
        of the name (upper-cased) are returned instead.
    """
    if not name or not name.strip():
        return "UNK"

    # Split into words, discarding empty fragments from repeated separators.
    words = [w for w in re.split(r'[\s\-/]+', name.strip()) if w]

    # Filter out skip words and bare numbers.
    significant_words = [
        w for w in words
        if w.lower() not in SKIP_WORDS and not w.isdigit()
    ]
    if not significant_words:
        significant_words = words[:3]

    # Use the first alphanumeric character of each word so that leading
    # punctuation (quotes, brackets) never ends up in the identifier.
    initials = []
    for word in significant_words[:10]:
        first = next((ch for ch in word if ch.isalnum()), None)
        if first is not None:
            initials.append(first.upper())
    abbrev = ''.join(initials)

    # Ensure at least 2 characters. Only alphanumerics are considered, so the
    # fallback cannot contain spaces (a plain name[:3] turns "De Hallen"
    # into "DE ").
    if len(abbrev) < 2:
        compact = ''.join(ch for ch in name if ch.isalnum())
        abbrev = compact[:3].upper() or "UNK"

    return abbrev
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate a deterministic UUID v5 from a GHCID string.

    Args:
        ghcid_string: The GHCID to derive the UUID from.

    Returns:
        The canonical string form of the name-based (SHA-1) UUID.
    """
    # The namespace previously hard-coded here,
    # 6ba7b810-9dad-11d1-80b4-00c04fd430c8, is the predefined DNS namespace
    # (the URL namespace is 6ba7b811-...). The value is kept unchanged so
    # that generated UUIDs stay the same.
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, ghcid_string))
def generate_ghcid_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8-style identifier from the SHA-256 hash of a GHCID.

    The first 16 bytes of the SHA-256 digest of the UTF-8 encoded string are
    used as the UUID bytes, with the version nibble forced to 8 and the two
    variant bits forced to ``10``.

    Args:
        ghcid_string: The GHCID to hash.

    Returns:
        The canonical string form of the resulting UUID.
    """
    raw = bytearray(hashlib.sha256(ghcid_string.encode("utf-8")).digest()[:16])
    # High nibble of byte 6 carries the UUID version.
    raw[6] = (raw[6] & 0x0f) | 0x80  # version 8
    # Top two bits of byte 8 carry the variant.
    raw[8] = (raw[8] & 0x3f) | 0x80  # variant
    return str(uuid.UUID(bytes=bytes(raw)))
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID from a GHCID string.

    Args:
        ghcid_string: The GHCID to hash.

    Returns:
        The first 8 bytes of the SHA-256 digest of the UTF-8 encoded string,
        read as an unsigned big-endian integer (0 .. 2**64 - 1).
    """
    # NOTE(review): the result is unsigned and can exceed 2**63 - 1; confirm
    # that whatever stores it does not assume a signed 64-bit integer.
    digest = hashlib.sha256(ghcid_string.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], 'big')
def lookup_city_geonames(country_code: str, city_name: str) -> Optional[Dict[str, Any]]:
    """Look up a city in the GeoNames SQLite database.

    Searches the ``cities`` table for a populated place in the given country
    whose ``name`` or ``ascii_name`` equals ``city_name`` (compared with SQL
    ``LIKE``, without wildcards), preferring the most populous match.

    Args:
        country_code: Country code stored in the ``country_code`` column.
        city_name: City name to search for.

    Returns:
        A dict with the keys ``name``, ``ascii_name``, ``admin1_code``,
        ``admin1_name``, ``latitude``, ``longitude``, ``geonames_id``,
        ``population`` and ``feature_code``; ``None`` when the database file
        is missing, the name is empty, nothing matches, or the query fails
        (the error is printed).
    """
    if not city_name or not isinstance(city_name, str):
        return None
    if not GEONAMES_DB.exists():
        return None

    # Escape LIKE wildcards so a '%' or '_' in the name is matched literally
    # instead of acting as a pattern.
    pattern = (
        city_name.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    )
    columns = (
        'name', 'ascii_name', 'admin1_code', 'admin1_name',
        'latitude', 'longitude', 'geonames_id', 'population', 'feature_code',
    )

    conn = None
    try:
        conn = sqlite3.connect(str(GEONAMES_DB))
        row = conn.execute("""
            SELECT name, ascii_name, admin1_code, admin1_name,
                   latitude, longitude, geonames_id, population, feature_code
            FROM cities
            WHERE country_code = ?
              AND (name LIKE ? ESCAPE '\\' OR ascii_name LIKE ? ESCAPE '\\')
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, pattern, pattern)).fetchone()
    except sqlite3.Error as e:
        # Best effort: a failed lookup is reported and treated as "not found".
        print(f"GeoNames lookup error: {e}")
        return None
    finally:
        # Close the connection on every path, including query failures.
        if conn is not None:
            conn.close()

    return dict(zip(columns, row)) if row else None
def generate_city_code(city_name: str) -> str:
    """Generate a short upper-case city code from a city name.

    Everything except ASCII letters and whitespace is removed first. A
    single remaining word yields its first three letters; several words
    yield the initials of the first three words (so a two-word name gives a
    two-letter code).

    Args:
        city_name: City name, ideally already ASCII.

    Returns:
        The city code, or ``"XXX"`` when the name is empty or contains no
        ASCII letters.
    """
    if not city_name:
        return "XXX"

    # NOTE(review): accented letters are dropped, not transliterated
    # ("Zürich" -> "ZRI"); callers should pass an ASCII name where available.
    words = re.sub(r'[^a-zA-Z\s]', '', city_name).split()

    # Nothing usable left (e.g. a name written entirely in another script).
    if not words:
        return "XXX"

    if len(words) == 1:
        # Single word: first 3 letters (of the word itself, so surrounding
        # whitespace cannot leak into the code).
        return words[0][:3].upper()

    # Multiple words: initials
    return ''.join(w[0].upper() for w in words[:3])
# Admin1 code to ISO 3166-2 region code mapping
# GeoNames admin1 codes -> ISO 3166-2 subdivision codes
# Outer key: ISO 3166-1 alpha-2 country code; inner key: admin1 code as a
# string; value: region code used in the GHCID.
# NOTE(review): several tables contain entries that look unverified (see the
# notes at BR, IN, CN and TR). Values are left as they were so that generated
# identifiers do not change; verify against the GeoNames admin1 code list
# before correcting them.
ADMIN1_TO_REGION = {
    # Netherlands (NL)
    'NL': {
        '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI',
        '06': 'NB', '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH',
        '15': 'OV', '16': 'FL',
    },
    # Germany (DE)
    'DE': {
        '01': 'SH', '02': 'HH', '03': 'NI', '04': 'HB', '05': 'NW',
        '06': 'HE', '07': 'RP', '08': 'BW', '09': 'BY', '10': 'SL',
        '11': 'BE', '12': 'BB', '13': 'MV', '14': 'SN', '15': 'ST', '16': 'TH',
    },
    # France (FR) - Régions
    'FR': {
        '11': 'IDF',  # Île-de-France (Paris)
        '24': 'CVL',  # Centre-Val de Loire
        '27': 'BFC',  # Bourgogne-Franche-Comté
        '28': 'NOR',  # Normandie
        '32': 'HDF',  # Hauts-de-France
        '44': 'GES',  # Grand Est
        '52': 'PDL',  # Pays de la Loire
        '53': 'BRE',  # Bretagne
        '75': 'NAQ',  # Nouvelle-Aquitaine
        '76': 'OCC',  # Occitanie
        '84': 'ARA',  # Auvergne-Rhône-Alpes
        '93': 'PAC',  # Provence-Alpes-Côte d'Azur
        '94': 'COR',  # Corse
        # Overseas
        '01': 'GP',  # Guadeloupe
        '02': 'MQ',  # Martinique
        '03': 'GF',  # Guyane
        '04': 'RE',  # La Réunion
        '06': 'YT',  # Mayotte
    },
    # United Kingdom (GB)
    'GB': {
        'ENG': 'EN',  # England
        'SCT': 'SC',  # Scotland
        'WLS': 'WA',  # Wales
        'NIR': 'NI',  # Northern Ireland
    },
    # Spain (ES) - Comunidades Autónomas
    'ES': {
        '01': 'AN',  # Andalucía
        '02': 'AR',  # Aragón
        '03': 'AS',  # Asturias
        '04': 'IB',  # Illes Balears
        '05': 'CN',  # Canarias
        '06': 'CB',  # Cantabria
        '07': 'CL',  # Castilla y León
        '08': 'CM',  # Castilla-La Mancha
        '09': 'CT',  # Cataluña
        '10': 'VC',  # Comunitat Valenciana
        '11': 'EX',  # Extremadura
        '12': 'GA',  # Galicia
        '13': 'MD',  # Madrid
        '14': 'MC',  # Murcia
        '15': 'NC',  # Navarra
        '16': 'PV',  # País Vasco
        '17': 'RI',  # La Rioja
        '18': 'CE',  # Ceuta
        '19': 'ML',  # Melilla
        # GeoNames alternate codes
        '29': 'MD',  # Madrid (alternate)
        '34': 'AV',  # Ávila (Castilla y León)
        '55': 'SA',  # Salamanca
        '56': 'GI',  # Girona (Cataluña)
        '58': 'OR',  # Ourense (Galicia)
    },
    # Italy (IT) - Regioni
    'IT': {
        '01': 'PIE',  # Piemonte
        '02': 'VDA',  # Valle d'Aosta
        '03': 'LOM',  # Lombardia
        '04': 'TAA',  # Trentino-Alto Adige
        '05': 'VEN',  # Veneto
        '06': 'FVG',  # Friuli-Venezia Giulia
        '07': 'LIG',  # Liguria
        '08': 'EMR',  # Emilia-Romagna
        '09': 'TOS',  # Toscana
        '10': 'UMB',  # Umbria
        '11': 'MAR',  # Marche
        '12': 'LAZ',  # Lazio
        '13': 'ABR',  # Abruzzo
        '14': 'MOL',  # Molise
        '15': 'CAM',  # Campania
        '16': 'PUG',  # Puglia
        '17': 'BAS',  # Basilicata
        '18': 'CAL',  # Calabria
        '19': 'SIC',  # Sicilia
        '20': 'SAR',  # Sardegna
    },
    # Switzerland (CH) - Cantons
    'CH': {
        '01': 'AG',  # Aargau
        '02': 'AI',  # Appenzell Innerrhoden
        '03': 'AR',  # Appenzell Ausserrhoden
        '04': 'BE',  # Bern
        '05': 'BL',  # Basel-Landschaft
        '06': 'BS',  # Basel-Stadt
        '07': 'FR',  # Fribourg
        '08': 'GE',  # Genève
        '09': 'GL',  # Glarus
        '10': 'GR',  # Graubünden
        '11': 'JU',  # Jura
        '12': 'LU',  # Luzern
        '13': 'NE',  # Neuchâtel
        '14': 'NW',  # Nidwalden
        '15': 'OW',  # Obwalden
        '16': 'SG',  # St. Gallen
        '17': 'SH',  # Schaffhausen
        '18': 'SO',  # Solothurn
        '19': 'SZ',  # Schwyz
        '20': 'TG',  # Thurgau
        '21': 'TI',  # Ticino
        '22': 'UR',  # Uri
        '23': 'VD',  # Vaud
        '24': 'VS',  # Valais
        '25': 'ZG',  # Zug
        '26': 'ZH',  # Zürich
    },
    # Austria (AT) - Bundesländer
    'AT': {
        '01': 'B',  # Burgenland
        '02': 'K',  # Kärnten
        '03': 'NO',  # Niederösterreich
        '04': 'OO',  # Oberösterreich
        '05': 'S',  # Salzburg
        '06': 'ST',  # Steiermark
        '07': 'T',  # Tirol
        '08': 'V',  # Vorarlberg
        '09': 'W',  # Wien
    },
    # Belgium (BE) - Provinces
    'BE': {
        'BRU': 'BRU',  # Brussels-Capital Region
        'VLG': 'VLG',  # Flemish Region
        'WAL': 'WAL',  # Walloon Region
        'VAN': 'VAN',  # Antwerpen
        'VBR': 'VBR',  # Vlaams-Brabant
        'VLI': 'VLI',  # Limburg
        'VOV': 'VOV',  # Oost-Vlaanderen
        'VWV': 'VWV',  # West-Vlaanderen
        'WBR': 'WBR',  # Brabant wallon
        'WHT': 'WHT',  # Hainaut
        'WLG': 'WLG',  # Liège
        'WLX': 'WLX',  # Luxembourg
        'WNA': 'WNA',  # Namur
    },
    # Poland (PL) - Voivodeships
    'PL': {
        '72': 'DS',  # Dolnośląskie
        '73': 'KP',  # Kujawsko-pomorskie
        '74': 'LU',  # Lubelskie
        '75': 'LB',  # Lubuskie
        '76': 'LD',  # Łódzkie
        '77': 'MA',  # Małopolskie
        '78': 'MZ',  # Mazowieckie
        '79': 'OP',  # Opolskie
        '80': 'PK',  # Podkarpackie
        '81': 'PD',  # Podlaskie
        '82': 'PM',  # Pomorskie
        '83': 'SL',  # Śląskie
        '84': 'SK',  # Świętokrzyskie
        '85': 'WN',  # Warmińsko-mazurskie
        '86': 'WP',  # Wielkopolskie
        '87': 'ZP',  # Zachodniopomorskie
    },
    # South Korea (KR) - Provinces and Special Cities
    'KR': {
        '01': 'SO',  # Seoul
        '02': 'BS',  # Busan
        '03': 'DG',  # Daegu
        '04': 'IC',  # Incheon
        '05': 'GJ',  # Gwangju
        '06': 'DJ',  # Daejeon
        '07': 'US',  # Ulsan
        '08': 'GG',  # Gyeonggi-do
        '09': 'GW',  # Gangwon-do
        '10': 'CB',  # Chungcheongbuk-do
        '11': 'CN',  # Chungcheongnam-do
        '12': 'JB',  # Jeollabuk-do
        '13': 'JN',  # Jeollanam-do
        '14': 'GB',  # Gyeongsangbuk-do
        '15': 'GN',  # Gyeongsangnam-do
        '16': 'JJ',  # Jeju-do
        '17': 'SJ',  # Sejong
    },
    # Japan (JP) - Prefectures (partial, major ones)
    'JP': {
        '01': 'HKD',  # Hokkaido
        '02': 'AOM',  # Aomori
        '04': 'MYG',  # Miyagi
        '07': 'FKS',  # Fukushima
        '08': 'IBR',  # Ibaraki
        '09': 'TCG',  # Tochigi
        '10': 'GNM',  # Gunma
        '11': 'SIT',  # Saitama
        '12': 'CHB',  # Chiba
        '13': 'TKY',  # Tokyo
        '14': 'KNG',  # Kanagawa
        '15': 'NGT',  # Niigata
        '17': 'ISK',  # Ishikawa
        '20': 'NGN',  # Nagano
        '21': 'GIF',  # Gifu
        '22': 'SZO',  # Shizuoka
        '23': 'AIC',  # Aichi
        '24': 'MIE',  # Mie
        '25': 'SHG',  # Shiga
        '26': 'KYT',  # Kyoto
        '27': 'OSK',  # Osaka
        '28': 'HYG',  # Hyogo
        '29': 'NAR',  # Nara
        '30': 'WKY',  # Wakayama
        '31': 'TTR',  # Tottori
        '32': 'SMN',  # Shimane
        '33': 'OKY',  # Okayama
        '34': 'HRS',  # Hiroshima
        '35': 'YMG',  # Yamaguchi
        '36': 'TKS',  # Tokushima
        '37': 'KGW',  # Kagawa
        '38': 'EHM',  # Ehime
        '39': 'KOC',  # Kochi
        '40': 'FKO',  # Fukuoka
        '41': 'SAG',  # Saga
        '42': 'NGS',  # Nagasaki
        '43': 'KMM',  # Kumamoto
        '44': 'OIT',  # Oita
        '45': 'MYZ',  # Miyazaki
        '46': 'KGS',  # Kagoshima
        '47': 'OKN',  # Okinawa
    },
    # China (CN) - Provinces (partial)
    # NOTE(review): Henan/Hunan both map to 'HN', Hebei/Hubei both map to
    # 'HB', and '13'/'14' both map to 'XZ', so distinct provinces can yield
    # the same region code - confirm the intended codes.
    'CN': {
        '01': 'AH',  # Anhui
        '02': 'ZJ',  # Zhejiang
        '03': 'JX',  # Jiangxi
        '04': 'JS',  # Jiangsu
        '05': 'JL',  # Jilin
        '06': 'QH',  # Qinghai
        '07': 'FJ',  # Fujian
        '08': 'HI',  # Heilongjiang
        '09': 'HN',  # Henan
        '10': 'HB',  # Hebei
        '11': 'HN',  # Hunan
        '12': 'HB',  # Hubei
        '13': 'XZ',  # Tibet (Xizang)
        '14': 'XZ',  # Tibet (alternate)
        '15': 'GS',  # Gansu
        '16': 'GZ',  # Guizhou
        '18': 'SC',  # Sichuan
        '19': 'YN',  # Yunnan
        '20': 'HL',  # Hainan
        '21': 'TW',  # Taiwan (claimed)
        '22': 'BJ',  # Beijing
        '23': 'SH',  # Shanghai
        '25': 'NM',  # Inner Mongolia (Nei Mongol)
        '26': 'NX',  # Ningxia
        '28': 'XJ',  # Xinjiang
        '30': 'GD',  # Guangdong
        '31': 'HK',  # Hong Kong
        '32': 'MO',  # Macau
        '33': 'TJ',  # Tianjin
        '36': 'SX',  # Shaanxi
        '37': 'SD',  # Shandong
    },
    # Australia (AU) - States and Territories
    'AU': {
        '01': 'ACT',  # Australian Capital Territory
        '02': 'NSW',  # New South Wales
        '03': 'NT',  # Northern Territory
        '04': 'QLD',  # Queensland
        '05': 'SA',  # South Australia
        '06': 'TAS',  # Tasmania
        '07': 'VIC',  # Victoria
        '08': 'WA',  # Western Australia
    },
    # Canada (CA) - Provinces and Territories
    'CA': {
        '01': 'AB',  # Alberta
        '02': 'BC',  # British Columbia
        '03': 'MB',  # Manitoba
        '04': 'NB',  # New Brunswick
        '05': 'NL',  # Newfoundland and Labrador
        '07': 'NS',  # Nova Scotia
        '08': 'ON',  # Ontario
        '09': 'PE',  # Prince Edward Island
        '10': 'QC',  # Quebec
        '11': 'SK',  # Saskatchewan
        '12': 'NT',  # Northwest Territories
        '13': 'NU',  # Nunavut
        '14': 'YT',  # Yukon
    },
    # Brazil (BR) - States
    'BR': {
        '01': 'AC',  # Acre
        '02': 'AL',  # Alagoas
        '03': 'AP',  # Amapá
        '04': 'AM',  # Amazonas
        '05': 'BA',  # Bahia
        '06': 'CE',  # Ceará
        '07': 'DF',  # Distrito Federal
        '08': 'ES',  # Espírito Santo
        '29': 'GO',  # Goiás
        # NOTE(review): this table used to list '11' twice ('MA' for Maranhão,
        # then 'MS'); the later entry always won, so Maranhão never resolved.
        # The dead entry was dropped - Maranhão still needs its correct
        # admin1 code.
        '14': 'MT',  # Mato Grosso
        '11': 'MS',  # Mato Grosso do Sul
        '15': 'MG',  # Minas Gerais
        '16': 'PA',  # Pará
        '17': 'PB',  # Paraíba
        '18': 'PR',  # Paraná
        '19': 'PE',  # Pernambuco
        '20': 'PI',  # Piauí
        '21': 'RJ',  # Rio de Janeiro
        '22': 'RN',  # Rio Grande do Norte
        '23': 'RS',  # Rio Grande do Sul
        '24': 'RO',  # Rondônia
        '25': 'RR',  # Roraima
        '26': 'SC',  # Santa Catarina
        '27': 'SP',  # São Paulo
        '28': 'SE',  # Sergipe
        '31': 'TO',  # Tocantins
    },
    # India (IN) - States (partial, major ones)
    'IN': {
        '01': 'AP',  # Andhra Pradesh
        '02': 'AR',  # Arunachal Pradesh
        '03': 'AS',  # Assam
        '04': 'BR',  # Bihar
        '05': 'CT',  # Chhattisgarh
        '06': 'GA',  # Goa
        # NOTE(review): this table used to list '07' twice ('GJ' for Gujarat,
        # then 'DL'); the later entry always won, so Gujarat never resolved.
        # The dead entry was dropped - Gujarat still needs its correct admin1
        # code.
        '08': 'HR',  # Haryana
        '09': 'HP',  # Himachal Pradesh
        '10': 'JK',  # Jammu and Kashmir
        '11': 'JH',  # Jharkhand
        '12': 'KA',  # Karnataka
        '13': 'KL',  # Kerala
        '14': 'MP',  # Madhya Pradesh
        '15': 'MH',  # Maharashtra
        '16': 'MN',  # Manipur
        '17': 'ML',  # Meghalaya
        '18': 'MZ',  # Mizoram
        '19': 'NL',  # Nagaland
        '20': 'OD',  # Odisha
        '21': 'PB',  # Punjab
        '22': 'RJ',  # Rajasthan
        '23': 'SK',  # Sikkim
        '24': 'TN',  # Tamil Nadu
        '25': 'TS',  # Telangana
        '26': 'TR',  # Tripura
        '27': 'UP',  # Uttar Pradesh
        '28': 'UK',  # Uttarakhand
        '29': 'WB',  # West Bengal
        '07': 'DL',  # Delhi (National Capital Territory)
        '36': 'VA',  # Varanasi region (approximation)
    },
    # Russia (RU) - Federal subjects (partial)
    'RU': {
        '01': 'AD',  # Adygea
        '48': 'MOW',  # Moscow
        '66': 'SPE',  # Saint Petersburg
        '47': 'MOS',  # Moscow Oblast
        '78': 'LEN',  # Leningrad Oblast
    },
    # Turkey (TR) - Provinces
    # NOTE(review): Ankara and Antalya both map to 'AN' - confirm the
    # intended codes.
    'TR': {
        '01': 'AD',  # Adana
        '06': 'AN',  # Ankara
        '07': 'AN',  # Antalya
        '16': 'BU',  # Bursa
        '32': 'IS',  # Istanbul
        '34': 'IS',  # Istanbul (alternate)
        '38': 'KA',  # Kayseri
        '68': 'AN',  # Ankara (alternate)
        '71': 'KO',  # Konya
    },
    # Sweden (SE) - Counties (län)
    'SE': {
        '01': 'AB',  # Stockholm
        '03': 'C',  # Uppsala
        '04': 'D',  # Södermanland
        '05': 'E',  # Östergötland
        '06': 'F',  # Jönköping
        '07': 'G',  # Kronoberg
        '08': 'H',  # Kalmar
        '09': 'I',  # Gotland
        '10': 'K',  # Blekinge
        '12': 'M',  # Skåne
        '13': 'N',  # Halland
        '14': 'O',  # Västra Götaland
        '17': 'S',  # Värmland
        '18': 'T',  # Örebro
        '19': 'U',  # Västmanland
        '20': 'W',  # Dalarna
        '21': 'X',  # Gävleborg
        '22': 'Y',  # Västernorrland
        '23': 'Z',  # Jämtland
        '24': 'AC',  # Västerbotten
        '25': 'BD',  # Norrbotten
        '26': 'AB',  # Stockholm (alternate)
    },
    # Norway (NO) - Counties (fylker)
    'NO': {
        '02': 'VK',  # Viken
        '03': 'OS',  # Oslo
        '09': 'AG',  # Agder
        '11': 'RO',  # Rogaland
        '12': 'VL',  # Vestland
        '15': 'MR',  # Møre og Romsdal
        '18': 'NO',  # Nordland
        '50': 'TR',  # Trøndelag
        '54': 'TF',  # Troms og Finnmark
    },
    # Denmark (DK) - Regions
    'DK': {
        '17': 'HS',  # Hovedstaden
        '18': 'MJ',  # Midtjylland
        '19': 'NJ',  # Nordjylland
        '20': 'SJ',  # Sjælland
        '21': 'SD',  # Syddanmark
    },
    # Finland (FI) - Regions (maakunta)
    'FI': {
        '01': 'UU',  # Uusimaa (Helsinki)
        '02': 'VS',  # Varsinais-Suomi
        '03': 'SA',  # Satakunta
        '05': 'PI',  # Pirkanmaa
        '06': 'PH',  # Päijät-Häme
        '07': 'KY',  # Kymenlaakso
        '08': 'EK',  # Etelä-Karjala
        '09': 'ES',  # Etelä-Savo
        '10': 'PS',  # Pohjois-Savo
        '11': 'PK',  # Pohjois-Karjala
        '12': 'KE',  # Keski-Suomi
        '13': 'EP',  # Etelä-Pohjanmaa
        '14': 'PO',  # Pohjanmaa
        '15': 'KP',  # Keski-Pohjanmaa
        '16': 'PP',  # Pohjois-Pohjanmaa
        '17': 'KA',  # Kainuu
        '18': 'LA',  # Lappi
        '19': 'AX',  # Åland
    },
    # Ireland (IE) - Provinces
    'IE': {
        'C': 'C',  # Connacht
        'L': 'L',  # Leinster
        'M': 'M',  # Munster
        'U': 'U',  # Ulster (IE portion)
    },
    # Czech Republic (CZ) - Regions (kraje)
    'CZ': {
        '52': 'PR',  # Prague
        '78': 'JM',  # Jihomoravský (South Moravian)
        '79': 'OL',  # Olomoucký
        '80': 'MS',  # Moravskoslezský
        '81': 'PL',  # Plzeňský
        '82': 'KA',  # Karlovarský
        '83': 'UL',  # Ústecký
        '84': 'LI',  # Liberecký
        '85': 'HK',  # Královéhradecký
        '86': 'PA',  # Pardubický
        '87': 'VY',  # Vysočina
        '88': 'SC',  # Středočeský
        '89': 'JC',  # Jihočeský
        '90': 'ZL',  # Zlínský
    },
    # Hungary (HU) - Counties (megye)
    'HU': {
        '01': 'BK',  # Bács-Kiskun
        '02': 'BA',  # Baranya
        '03': 'BE',  # Békés
        '04': 'BZ',  # Borsod-Abaúj-Zemplén
        '05': 'BU',  # Budapest
        '06': 'CS',  # Csongrád-Csanád
        '07': 'FE',  # Fejér
        '08': 'GS',  # Győr-Moson-Sopron
        '09': 'HB',  # Hajdú-Bihar
        '10': 'HE',  # Heves
        '11': 'JN',  # Jász-Nagykun-Szolnok
        '12': 'KO',  # Komárom-Esztergom
        '13': 'NO',  # Nógrád
        '14': 'PE',  # Pest
        '15': 'SO',  # Somogy
        '16': 'SZ',  # Szabolcs-Szatmár-Bereg
        '17': 'TO',  # Tolna
        '18': 'VA',  # Vas
        '19': 'VE',  # Veszprém
        '20': 'ZA',  # Zala
    },
    # Portugal (PT) - Districts
    'PT': {
        '01': 'AV',  # Aveiro
        '02': 'BE',  # Beja
        '03': 'BR',  # Braga
        '04': 'BG',  # Bragança
        '05': 'CB',  # Castelo Branco
        '06': 'CO',  # Coimbra
        '07': 'EV',  # Évora
        '08': 'FA',  # Faro
        '09': 'GU',  # Guarda
        '10': 'LE',  # Leiria
        '11': 'LI',  # Lisboa
        '12': 'PO',  # Portalegre
        '13': 'PT',  # Porto
        '14': 'LI',  # Lisboa (alternate)
        '15': 'SA',  # Santarém
        '16': 'SE',  # Setúbal
        '17': 'VC',  # Viana do Castelo
        '18': 'VR',  # Vila Real
        '19': 'VI',  # Viseu
        '20': 'AC',  # Açores
        '21': 'MA',  # Madeira
    },
    # Mexico (MX) - States
    'MX': {
        '01': 'AG',  # Aguascalientes
        '02': 'BC',  # Baja California
        '03': 'BS',  # Baja California Sur
        '04': 'CM',  # Campeche
        '05': 'CO',  # Coahuila
        '06': 'CL',  # Colima
        '07': 'CS',  # Chiapas
        '08': 'CH',  # Chihuahua
        '09': 'CMX',  # Ciudad de México
        '10': 'DG',  # Durango
        '11': 'GT',  # Guanajuato
        '12': 'GR',  # Guerrero
        '13': 'HG',  # Hidalgo
        '14': 'JA',  # Jalisco
        '15': 'EM',  # Estado de México
        '16': 'MI',  # Michoacán
        '17': 'MO',  # Morelos
        '18': 'NA',  # Nayarit
        '19': 'NL',  # Nuevo León
        '20': 'OA',  # Oaxaca
        '21': 'PU',  # Puebla
        '22': 'QT',  # Querétaro
        '23': 'QR',  # Quintana Roo
        '24': 'SL',  # San Luis Potosí
        '25': 'SI',  # Sinaloa
        '26': 'SO',  # Sonora
        '27': 'TB',  # Tabasco
        '28': 'TM',  # Tamaulipas
        '29': 'TL',  # Tlaxcala
        '30': 'VE',  # Veracruz
        '31': 'YU',  # Yucatán
        '32': 'ZA',  # Zacatecas
    },
    # Argentina (AR) - Provinces
    'AR': {
        '01': 'BA',  # Buenos Aires Province
        '02': 'CA',  # Catamarca
        '03': 'CH',  # Chaco
        '04': 'CT',  # Chubut
        '05': 'CB',  # Córdoba
        '06': 'CR',  # Corrientes
        '07': 'CF',  # Ciudad Autónoma de Buenos Aires
        '08': 'ER',  # Entre Ríos
        '09': 'FM',  # Formosa
        '10': 'JY',  # Jujuy
        '11': 'LP',  # La Pampa
        '12': 'LR',  # La Rioja
        '13': 'MZ',  # Mendoza
        '14': 'MN',  # Misiones
        '15': 'NQ',  # Neuquén
        '16': 'RN',  # Río Negro
        '17': 'SA',  # Salta
        '18': 'SJ',  # San Juan
        '19': 'SL',  # San Luis
        '20': 'SC',  # Santa Cruz
        '21': 'SF',  # Santa Fe
        '22': 'SE',  # Santiago del Estero
        '23': 'TF',  # Tierra del Fuego
        '24': 'TM',  # Tucumán
    },
    # New Zealand (NZ) - Regions
    'NZ': {
        'G2': 'WGN',  # Wellington
        'F7': 'AUK',  # Auckland
        'E7': 'WKO',  # Waikato
        'F4': 'BOP',  # Bay of Plenty
        'G1': 'TAS',  # Tasman
        'F6': 'MWT',  # Manawatū-Whanganui
        'F3': 'HKB',  # Hawke's Bay
        'E8': 'CAN',  # Canterbury
        'F9': 'OTA',  # Otago
        'G3': 'STL',  # Southland
    },
    # South Africa (ZA) - Provinces
    'ZA': {
        '01': 'EC',  # Eastern Cape
        '02': 'FS',  # Free State
        '03': 'GT',  # Gauteng
        '04': 'KZN',  # KwaZulu-Natal
        '05': 'LP',  # Limpopo
        '06': 'MP',  # Mpumalanga
        '07': 'NC',  # Northern Cape
        '08': 'NW',  # North West
        '09': 'WC',  # Western Cape
        '11': 'WC',  # Western Cape (alternate - Cape Town)
    },
}
def get_region_code(country_code: str, admin1_code: str) -> str:
    """Get the region code for a GeoNames admin1 code.

    For countries present in ``ADMIN1_TO_REGION`` the code is translated via
    that table. For all other countries the first two characters of the
    admin1 code (upper-cased) are used as-is.

    Args:
        country_code: ISO 3166-1 alpha-2 country code.
        admin1_code: GeoNames admin1 code; ``None``, an empty value, or a
            number (e.g. ``7`` for ``'07'``) is tolerated.

    Returns:
        The region code, or ``'XX'`` when the admin1 code is missing or not
        listed for a known country.
    """
    # Normalise to text so that None or integer codes do not raise.
    code = '' if admin1_code is None else str(admin1_code).strip()

    regions = ADMIN1_TO_REGION.get(country_code)
    if regions is not None:
        if code in regions:
            return regions[code]
        # The table uses zero-padded numeric keys; retry with padding in case
        # the code arrived without its leading zero.
        if code.isdigit():
            return regions.get(code.zfill(2), 'XX')
        return 'XX'

    # Default: use admin1_code directly or XX
    return code[:2].upper() if code else 'XX'
def create_custodian_yaml(wikidata_id: str, info: Dict[str, Any], dry_run: bool = False) -> Optional[Path]:
    """Create a custodian YAML file from MoW-derived Wikidata data.

    Args:
        wikidata_id: Wikidata entity ID of the institution.
        info: Enriched record for the institution. The keys read here are
            ``name``, ``country``, ``city``, ``coordinates`` (a
            ``Point(lon lat)`` string), ``instance_of``, ``all_types`` and
            ``inscriptions`` (a list of dicts with ``wikidata_id``, ``name``
            and ``country``).
        dry_run: When true, only report the file that would be written.

    Returns:
        The path of the created (or, in a dry run, would-be) file, or ``None``
        when the record has no name and is skipped.

    Raises:
        ValueError: If ``coordinates`` matches the ``Point(...)`` pattern but
            its components are not valid floats.
    """
    name = info.get('name', '')
    country = info.get('country', '')
    city = info.get('city', '')
    coordinates = info.get('coordinates', '')
    instance_of = info.get('instance_of', '')
    all_types = info.get('all_types', [])
    # A JSON null for inscriptions would break len()/iteration below.
    inscriptions = info.get('inscriptions') or []

    if not name:
        print(f" Skipping {wikidata_id}: no name")
        return None

    # Get country code
    country_code = get_country_code(country)
    if country_code == "XX":
        print(f" Warning: Unknown country '{country}' for {wikidata_id}")

    # Get institution type
    inst_type = detect_institution_type(instance_of, all_types)

    # Generate abbreviation
    abbrev = generate_abbreviation(name)

    # Look up city in GeoNames
    region_code = "XX"
    city_code = "XXX"
    geonames_data = None

    if city and country_code != "XX":
        geonames_data = lookup_city_geonames(country_code, city)
        if geonames_data:
            region_code = get_region_code(country_code, geonames_data.get('admin1_code', ''))
            city_code = generate_city_code(geonames_data.get('ascii_name', city))
        else:
            # No GeoNames match: derive the city code from the Wikidata label.
            city_code = generate_city_code(city)

    # Generate GHCID
    ghcid = f"{country_code}-{region_code}-{city_code}-{inst_type}-{abbrev}"
    ghcid_current = ghcid
    history_reason = 'Initial GHCID from UNESCO MoW Wikidata data (Dec 2025)'

    # Determine filename and resolve a collision *before* deriving the
    # identifiers, so that the UUID / SHA-256 / numeric forms are computed
    # from the GHCID this record actually ends up with.
    filename = f"{ghcid}.yaml"
    filepath = CUSTODIAN_DIR / filename

    if filepath.exists():
        # Add name suffix
        name_suffix = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')[:50]
        if not name_suffix:
            # Names without ASCII alphanumerics would give an empty suffix.
            name_suffix = wikidata_id.lower()
        ghcid_current = f"{ghcid}-{name_suffix}"
        filename = f"{ghcid_current}.yaml"
        filepath = CUSTODIAN_DIR / filename
        history_reason = 'Name suffix added to resolve GHCID collision'

    ghcid_uuid = generate_ghcid_uuid(ghcid_current)
    ghcid_sha256 = generate_ghcid_sha256(ghcid_current)
    ghcid_numeric = generate_ghcid_numeric(ghcid_current)
    # uuid7 not in standard library; use uuid4 for record_id (time-ordered not required for DB ID)
    record_id = str(uuid.uuid4())

    timestamp = datetime.now(timezone.utc).isoformat()

    # Build the custodian YAML structure (key order is preserved on dump)
    custodian = {
        'original_entry': {
            'name': name,
            'source': 'UNESCO Memory of the World (via Wikidata)',
            'wikidata_id': wikidata_id,
            'mow_inscriptions': [
                {
                    'wikidata_id': insc.get('wikidata_id'),
                    'name': insc.get('name'),
                    'country': insc.get('country'),
                }
                for insc in inscriptions
            ],
        },
        'entry_index': None,
        'processing_timestamp': timestamp,
        'wikidata_enrichment': {
            'wikidata_entity_id': wikidata_id,
            'wikidata_label_en': name,
            'wikidata_description_en': "Heritage institution holding UNESCO Memory of the World inscribed documents",
            'instance_of': instance_of,
            'all_types': all_types if all_types else None,
        },
        'ghcid': {
            'ghcid_current': ghcid_current,
            'ghcid_original': ghcid,
            'ghcid_uuid': ghcid_uuid,
            'ghcid_uuid_sha256': ghcid_sha256,
            'ghcid_numeric': ghcid_numeric,
            'record_id': record_id,
            'generation_timestamp': timestamp,
            'ghcid_history': [
                {
                    'ghcid': ghcid_current,
                    'ghcid_numeric': ghcid_numeric,
                    'valid_from': timestamp,
                    'valid_to': None,
                    'reason': history_reason,
                }
            ],
            'location_resolution': {
                'method': 'WIKIDATA_LOCATION',
                'country_code': country_code,
                'country_label': country,
                'region_code': region_code,
                'city_code': city_code,
                'city_label': city,
            },
        },
        'custodian_name': {
            'claim_type': 'custodian_name',
            'claim_value': name,
            'source_type': 'wikidata',
        },
        'unesco_mow_enrichment': {
            'is_mow_custodian': True,
            'inscription_count': len(inscriptions),
            'inscriptions': [
                {
                    'wikidata_id': insc.get('wikidata_id'),
                    'name': insc.get('name'),
                    'inscription_country': insc.get('country'),
                }
                for insc in inscriptions
            ],
            'enrichment_timestamp': timestamp,
            'data_source': 'Wikidata SPARQL (UNESCO has no MoW API)',
        },
    }

    # Add GeoNames data if available
    if geonames_data:
        custodian['ghcid']['location_resolution'].update({
            'geonames_id': geonames_data['geonames_id'],
            'geonames_name': geonames_data['name'],
            'feature_code': geonames_data['feature_code'],
            'population': geonames_data['population'],
            'admin1_code': geonames_data['admin1_code'],
        })
        custodian['ghcid']['geonames_id'] = geonames_data['geonames_id']

    # Add coordinates if available
    if coordinates:
        # Parse "Point(lon lat)" format
        match = re.match(r'Point\(([^\s]+)\s+([^\)]+)\)', coordinates)
        if match:
            lon, lat = float(match.group(1)), float(match.group(2))
            custodian['ghcid']['location_resolution']['source_coordinates'] = {
                'latitude': lat,
                'longitude': lon,
                'source': 'wikidata',
            }

    # Remove None values
    def remove_none(d):
        """Recursively drop dict entries whose value is None."""
        if isinstance(d, dict):
            return {k: remove_none(v) for k, v in d.items() if v is not None}
        elif isinstance(d, list):
            return [remove_none(i) for i in d]
        return d

    custodian = remove_none(custodian)

    if dry_run:
        print(f" Would create: {filename}")
        return filepath

    # Write YAML file
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(custodian, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f" Created: {filename}")
    return filepath
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: create custodian files from enriched MoW data.

    Parses ``--dry-run`` and ``--limit``, loads the JSON mapping of Wikidata
    ID to institution record from ``MOW_ENRICHED_FILE``, calls
    ``create_custodian_yaml`` for each entry and prints a summary.

    Returns:
        1 when the enriched input file is missing, otherwise 0. Failures on
        individual records are printed and counted, not propagated.
    """
    parser = argparse.ArgumentParser(description='Create custodian files from MoW Wikidata data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be created')
    parser.add_argument('--limit', type=int, help='Limit number of files to create')
    args = parser.parse_args()

    # Load enriched MoW custodians
    if not MOW_ENRICHED_FILE.exists():
        print(f"Error: {MOW_ENRICHED_FILE} not found. Run the enrichment query first.")
        return 1

    # Read explicitly as UTF-8: institution names are frequently non-ASCII and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(MOW_ENRICHED_FILE, 'r', encoding='utf-8') as f:
        custodians = json.load(f)

    print(f"Loaded {len(custodians)} MoW custodians")

    if args.dry_run:
        print("DRY RUN - no files will be created")

    # Create custodian files
    created = 0
    skipped = 0
    errors = 0

    # args.limit is None when --limit is omitted, which slices the full list.
    for wikidata_id, info in list(custodians.items())[:args.limit]:
        try:
            result = create_custodian_yaml(wikidata_id, info, dry_run=args.dry_run)
            if result:
                created += 1
            else:
                skipped += 1
        except Exception as e:
            # Best-effort batch: one bad record must not stop the run.
            print(f" Error processing {wikidata_id}: {e}")
            errors += 1

    print("\nSummary:")
    print(f" Created: {created}")
    print(f" Skipped: {skipped}")
    print(f" Errors: {errors}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Raise SystemExit directly: the bare ``exit`` helper is injected by the
    # ``site`` module for interactive use and is missing under ``python -S``.
    raise SystemExit(main())
|