#!/usr/bin/env python3
"""
Create custodian YAML files from UNESCO Memory of the World (MoW) derived institutions.

This script creates new custodian entries for heritage institutions that hold
MoW-inscribed documentary heritage but don't yet exist in our custodian database.

The MoW data was sourced from Wikidata (UNESCO has no public MoW API).

Usage:
    python scripts/create_mow_custodians.py [--dry-run] [--limit N]

Data Sources:
    - /tmp/mow_custodians_enriched.json - Wikidata-enriched MoW custodian data
    - data/reference/geonames.db (under the project root) - GeoNames database
      for location resolution
"""

import argparse
import hashlib
import json
import os
import re
import sqlite3
import sys
import unicodedata
import uuid
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

try:
    import yaml
except ImportError:  # PyYAML is only needed when files are actually written
    yaml = None

# Paths
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
MOW_ENRICHED_FILE = Path("/tmp/mow_custodians_enriched.json")

# Country code mapping (Wikidata labels to ISO 3166-1 alpha-2)
# Includes territories, historical names, and alternate spellings
COUNTRY_CODE_MAP = {
    # Major countries
    "Germany": "DE", "France": "FR", "South Korea": "KR", "United Kingdom": "GB",
    "People's Republic of China": "CN", "China": "CN", "Switzerland": "CH",
    "Spain": "ES", "United States of America": "US", "United States": "US",
    "Brazil": "BR", "India": "IN", "Iran": "IR", "Poland": "PL", "Turkey": "TR",
    "Sweden": "SE", "Italy": "IT", "Japan": "JP", "Canada": "CA", "Russia": "RU",
    "Czech Republic": "CZ", "Czechia": "CZ", "Mexico": "MX", "Austria": "AT",
    "Portugal": "PT", "Netherlands": "NL", "Belgium": "BE", "Denmark": "DK",
    "Norway": "NO", "Finland": "FI", "Australia": "AU", "New Zealand": "NZ",
    "Argentina": "AR", "Chile": "CL", "Colombia": "CO", "Peru": "PE",
    "Venezuela": "VE", "Egypt": "EG", "South Africa": "ZA", "Israel": "IL",
    "Indonesia": "ID", "Malaysia": "MY", "Thailand": "TH", "Vietnam": "VN",
    "Philippines": "PH", "Singapore": "SG", "Taiwan": "TW", "Hong Kong": "HK",
    "Pakistan": "PK", "Bangladesh": "BD", "Sri Lanka": "LK", "Nepal": "NP",
    "Greece": "GR", "Hungary": "HU", "Romania": "RO", "Bulgaria": "BG",
    "Slovakia": "SK", "Slovenia": "SI", "Croatia": "HR", "Serbia": "RS",
    "Ukraine": "UA", "Belarus": "BY", "Lithuania": "LT", "Latvia": "LV",
    "Estonia": "EE", "Ireland": "IE", "Scotland": "GB", "Wales": "GB",
    "Morocco": "MA", "Tunisia": "TN", "Algeria": "DZ", "Nigeria": "NG",
    "Kenya": "KE", "Ghana": "GH", "Ethiopia": "ET", "Tanzania": "TZ",
    "Saudi Arabia": "SA", "United Arab Emirates": "AE", "Qatar": "QA",
    "Kuwait": "KW", "Bahrain": "BH", "Oman": "OM", "Jordan": "JO",
    "Lebanon": "LB", "Syria": "SY", "Iraq": "IQ", "Yemen": "YE",
    "Afghanistan": "AF", "Kazakhstan": "KZ", "Uzbekistan": "UZ",
    "Turkmenistan": "TM", "Tajikistan": "TJ", "Kyrgyzstan": "KG",
    "Azerbaijan": "AZ", "Armenia": "AM", "Georgia": "GE", "Mongolia": "MN",
    "North Korea": "KP", "Cuba": "CU", "Jamaica": "JM", "Haiti": "HT",
    "Dominican Republic": "DO", "Puerto Rico": "PR", "Trinidad and Tobago": "TT",
    "Barbados": "BB", "Guatemala": "GT", "Honduras": "HN", "El Salvador": "SV",
    "Nicaragua": "NI", "Costa Rica": "CR", "Panama": "PA", "Ecuador": "EC",
    "Bolivia": "BO", "Paraguay": "PY", "Uruguay": "UY", "Luxembourg": "LU",
    "Liechtenstein": "LI", "Monaco": "MC", "Andorra": "AD", "San Marino": "SM",
    "Vatican City": "VA", "Malta": "MT", "Cyprus": "CY", "Iceland": "IS",
    "Greenland": "GL", "Faroe Islands": "FO", "North Macedonia": "MK",
    "Bosnia and Herzegovina": "BA", "Albania": "AL", "Montenegro": "ME",
    "Kosovo": "XK", "Moldova": "MD", "Democratic Republic of the Congo": "CD",
    "Republic of the Congo": "CG", "Cameroon": "CM", "Senegal": "SN",
    "Mali": "ML", "Burkina Faso": "BF", "Niger": "NE", "Chad": "TD",
    "Sudan": "SD", "South Sudan": "SS", "Uganda": "UG", "Rwanda": "RW",
    "Burundi": "BI", "Malawi": "MW", "Zambia": "ZM", "Zimbabwe": "ZW",
    "Botswana": "BW", "Namibia": "NA", "Mozambique": "MZ", "Madagascar": "MG",
    "Mauritius": "MU", "Seychelles": "SC", "Comoros": "KM", "Réunion": "RE",
    "Cambodia": "KH", "Laos": "LA", "Myanmar": "MM", "Brunei": "BN",
    "East Timor": "TL", "Timor-Leste": "TL", "Papua New Guinea": "PG",
    "Fiji": "FJ", "Samoa": "WS", "Tonga": "TO", "Vanuatu": "VU",
    "New Caledonia": "NC", "French Polynesia": "PF", "Guam": "GU",
    # Caribbean territories and other small states (ISO 3166-1 alpha-2)
    "Angola": "AO", "Anguilla": "AI", "Antigua and Barbuda": "AG", "Aruba": "AW",
    "Bahamas": "BS", "The Bahamas": "BS", "Belize": "BZ", "Benin": "BJ",
    "Cape Verde": "CV", "Cabo Verde": "CV", "Curaçao": "CW", "Curacao": "CW",
    "Guyana": "GY", "Montserrat": "MS", "Saint Lucia": "LC", "St. Lucia": "LC",
    "St Lucia": "LC", "Sint Maarten": "SX", "Suriname": "SR",
    # Historical/dissolved entities (map to successor or use special codes)
    # Netherlands Antilles dissolved in 2010 - use CW (Curaçao) as primary successor
    "Netherlands Antilles": "CW",
    # Additional Caribbean
    "Dominica": "DM", "Grenada": "GD", "Saint Kitts and Nevis": "KN",
    "St. Kitts and Nevis": "KN", "Saint Vincent and the Grenadines": "VC",
    "St. Vincent and the Grenadines": "VC", "British Virgin Islands": "VG",
    "U.S. Virgin Islands": "VI", "US Virgin Islands": "VI",
    "Cayman Islands": "KY", "Turks and Caicos Islands": "TC", "Bermuda": "BM",
    # Additional Africa
    "Ivory Coast": "CI", "Côte d'Ivoire": "CI", "Gabon": "GA",
    "Equatorial Guinea": "GQ", "Guinea": "GN", "Guinea-Bissau": "GW",
    "Liberia": "LR", "Sierra Leone": "SL", "Togo": "TG",
    "Central African Republic": "CF", "Eritrea": "ER", "Djibouti": "DJ",
    "Somalia": "SO", "Lesotho": "LS", "Eswatini": "SZ", "Swaziland": "SZ",
    "São Tomé and Príncipe": "ST", "Gambia": "GM", "The Gambia": "GM",
    "Mauritania": "MR",
    # Additional Asia/Pacific
    "Bhutan": "BT", "Maldives": "MV", "Solomon Islands": "SB", "Kiribati": "KI",
    "Marshall Islands": "MH", "Micronesia": "FM",
    "Federated States of Micronesia": "FM", "Nauru": "NR", "Palau": "PW",
    "Tuvalu": "TV", "Cook Islands": "CK", "Niue": "NU", "Tokelau": "TK",
    "American Samoa": "AS", "Northern Mariana Islands": "MP", "Macau": "MO",
    "Macao": "MO",
}

# Institution type mapping based on Wikidata instance_of labels.
# Order matters: detect_institution_type() returns the first keyword that matches.
INSTANCE_TYPE_MAP = {
    "archive": "A", "archives": "A", "national archive": "A",
    "national archives": "A", "state archive": "A", "state archives": "A",
    "city archive": "A", "municipal archive": "A",
    "library": "L", "national library": "L", "public library": "L",
    "university library": "L", "research library": "L",
    "academic library": "L", "special library": "L",
    "museum": "M", "art museum": "M", "history museum": "M",
    "national museum": "M", "cultural institution": "M",
    "heritage institution": "M",
    "gallery": "G", "art gallery": "G",
    "university": "E", "college": "E",
    "research institute": "R", "research center": "R",
    "foundation": "N", "non-profit organization": "N", "NGO": "N",
    "government agency": "O", "government organization": "O", "public body": "O",
    "religious organization": "H", "church": "H", "monastery": "H",
    "abbey": "H", "cathedral": "H", "temple": "H", "mosque": "H",
    "synagogue": "H",
    "botanical garden": "B", "zoo": "B", "zoological garden": "B",
    "television station": "C", "broadcasting company": "C", "newspaper": "C",
}

# Skipped words for abbreviation generation (articles, prepositions, conjunctions)
SKIP_WORDS = {
    # Dutch
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    "'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
    'door', 'en', 'of',
    # English
    'a', 'an', 'the', 'of', 'in', 'at', 'on', 'to', 'for', 'with', 'from',
    'by', 'as', 'under', 'and', 'or', 'but',
    # French
    'le', 'la', 'les', 'un', 'une', 'des', 'du', 'à', 'au', 'aux', 'dans',
    'sur', 'sous', 'pour', 'par', 'avec', "l'", 'et', 'ou',
    # German
    'der', 'die', 'das', 'den', 'dem', 'ein', 'eine', 'einer', 'einem',
    'einen', 'von', 'zu', 'für', 'mit', 'bei', 'nach', 'aus', 'vor', 'über',
    'unter', 'durch', 'und', 'oder',
    # Spanish
    'el', 'los', 'las', 'unos', 'unas', 'del', 'al', 'con', 'por', 'para',
    'sobre', 'bajo', 'y', 'o', 'e', 'u',
    # Portuguese
    'o', 'os', 'as', 'um', 'uma', 'uns', 'umas', 'do', 'da', 'dos', 'das',
    'em', 'no', 'na', 'nos', 'nas', 'com', 'sob',
    # Italian
    'il', 'lo', 'i', 'gli', 'dello', 'della', 'dei', 'degli', 'delle',
    'allo', 'alla', 'ai', 'agli', 'alle', 'dal', 'dallo', 'dalla', 'dai',
    'dagli', 'dalle', 'nel', 'nello', 'nella', 'nei', 'negli', 'nelle',
    'sul', 'sullo', 'sulla', 'sui', 'sugli', 'sulle', 'per', 'tra', 'fra',
    'ed', 'od',
}


def get_country_code(country_label: str) -> str:
    """Convert a country label to its ISO 3166-1 alpha-2 code.

    Surrounding whitespace is ignored. Returns "XX" when the label is empty
    or is not present in COUNTRY_CODE_MAP.
    """
    if not country_label:
        return "XX"
    return COUNTRY_CODE_MAP.get(country_label.strip(), "XX")


def _keyword_matches(keyword: str, text: str) -> bool:
    """Return True if lower-cased *keyword* occurs in lower-cased *text*.

    Keywords of three characters or fewer (e.g. "ngo", "zoo") must start at a
    word boundary, so that "ngo" does not match inside "Mongolia" or "Congo".
    """
    if len(keyword) <= 3:
        return re.search(r'\b' + re.escape(keyword), text) is not None
    return keyword in text


def detect_institution_type(instance_of: str, all_types: Optional[List[str]] = None) -> str:
    """Detect the institution type code from Wikidata instance_of labels.

    Checks *instance_of* first, then each label in *all_types*, and returns
    the code of the first INSTANCE_TYPE_MAP keyword found in a label
    (case-insensitive). Falls back to "A" (archive) when nothing matches.
    """
    for type_label in [instance_of, *(all_types or [])]:
        if not type_label:
            continue
        type_lower = type_label.lower()
        for keyword, code in INSTANCE_TYPE_MAP.items():
            # Keywords are lower-cased here because the map contains "NGO",
            # which could never match an already lower-cased label.
            if _keyword_matches(keyword.lower(), type_lower):
                return code
    # Default to Archive for MoW custodians (they hold documentary heritage)
    return "A"


def generate_abbreviation(name: str) -> str:
    """Generate an institution abbreviation from its name.

    The name is split on whitespace, hyphens and slashes; skip words, pure
    numbers and leading punctuation are dropped, and the upper-cased first
    character of up to ten remaining words is joined. Returns "UNK" for an
    empty name. If fewer than two characters result, the first three
    alphanumeric characters of the name are used instead.
    """
    if not name:
        return "UNK"

    # Strip leading punctuation so "(BnF)" contributes "B", not "(" -- the
    # abbreviation ends up in the GHCID and in the output filename.
    cleaned = [re.sub(r'^[\W_]+', '', raw) for raw in re.split(r'[\s\-/]+', name)]
    cleaned = [word for word in cleaned if word]

    significant = [
        word for word, raw in (
            (re.sub(r'^[\W_]+', '', raw), raw)
            for raw in re.split(r'[\s\-/]+', name)
        )
        if word
        and raw.lower() not in SKIP_WORDS
        and word.lower() not in SKIP_WORDS
        and not word.isdigit()
    ]
    if not significant:
        significant = cleaned[:3]

    # Take first letter of each significant word
    abbrev = ''.join(word[0].upper() for word in significant[:10])

    # Ensure at least 2 characters
    if len(abbrev) < 2:
        abbrev = re.sub(r'[\W_]+', '', name)[:3].upper() or "UNK"

    return abbrev


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate a UUID v5 from a GHCID string.

    The namespace is uuid.NAMESPACE_DNS (6ba7b810-9dad-11d1-80b4-00c04fd430c8),
    the same value the script has always used, so existing IDs are unchanged.
    """
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, ghcid_string))


def generate_ghcid_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8-style identifier from the SHA-256 hash of a GHCID."""
    hash_bytes = bytearray(hashlib.sha256(ghcid_string.encode()).digest()[:16])
    hash_bytes[6] = (hash_bytes[6] & 0x0f) | 0x80  # version nibble = 8
    hash_bytes[8] = (hash_bytes[8] & 0x3f) | 0x80  # variant bits = 10
    return str(uuid.UUID(bytes=bytes(hash_bytes)))


def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID from the first 8 bytes of the SHA-256 hash."""
    hash_bytes = hashlib.sha256(ghcid_string.encode()).digest()
    return int.from_bytes(hash_bytes[:8], 'big')


# Column order of the SELECT in lookup_city_geonames(); also the result keys.
_GEONAMES_COLUMNS = (
    'name', 'ascii_name', 'admin1_code', 'admin1_name', 'latitude',
    'longitude', 'geonames_id', 'population', 'feature_code',
)


def lookup_city_geonames(country_code: str, city_name: str) -> Optional[Dict[str, Any]]:
    """Look up a city in the GeoNames database.

    Matches *city_name* case-insensitively against the `name` and
    `ascii_name` columns of the `cities` table for *country_code*, restricted
    to populated-place feature codes, and returns the most populous match as
    a dict keyed by column name. Returns None when the database file is
    missing, nothing matches, or the query fails (the error is printed).
    """
    if not city_name or not GEONAMES_DB.exists():
        return None

    # "= ? COLLATE NOCASE" keeps the case-insensitive match LIKE provided but
    # does not treat '%' and '_' in the city name as wildcards.
    query = """
        SELECT name, ascii_name, admin1_code, admin1_name, latitude,
               longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = ?
          AND (name = ? COLLATE NOCASE OR ascii_name = ? COLLATE NOCASE)
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
    """
    try:
        # closing() guarantees the connection is released even if the query raises.
        with closing(sqlite3.connect(str(GEONAMES_DB))) as conn:
            row = conn.execute(query, (country_code, city_name, city_name)).fetchone()
    except sqlite3.Error as e:
        print(f"GeoNames lookup error: {e}")
        return None

    if row is None:
        return None
    return dict(zip(_GEONAMES_COLUMNS, row))


def generate_city_code(city_name: str) -> str:
    """Generate a city code of up to three letters from a city name.

    Diacritics are folded to their base letters (so "Zürich" yields "ZUR",
    matching what a GeoNames ascii_name would give), then every character
    other than ASCII letters and whitespace is dropped. A single word yields
    its first three letters; several words yield the initials of the first
    three. Returns "XXX" when the name is empty or contains no ASCII letters.
    """
    if not city_name:
        return "XXX"

    decomposed = unicodedata.normalize('NFKD', city_name)
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    words = re.sub(r'[^a-zA-Z\s]', '', folded).split()

    if not words:
        return "XXX"
    if len(words) == 1:
        return words[0][:3].upper()
    return ''.join(word[0].upper() for word in words[:3])


# Admin1 code to region code mapping
# GeoNames admin1 codes -> ISO 3166-2 style subdivision codes
ADMIN1_TO_REGION = {
    # Netherlands (NL)
    'NL': {
        '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI', '06': 'NB',
        '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH', '15': 'OV', '16': 'FL',
    },
    # Germany (DE)
    'DE': {
        '01': 'SH', '02': 'HH', '03': 'NI', '04': 'HB', '05': 'NW', '06': 'HE',
        '07': 'RP', '08': 'BW', '09': 'BY', '10': 'SL', '11': 'BE', '12': 'BB',
        '13': 'MV', '14': 'SN', '15': 'ST', '16': 'TH',
    },
    # France (FR) - Régions
    'FR': {
        '11': 'IDF',  # Île-de-France (Paris)
        '24': 'CVL',  # Centre-Val de Loire
        '27': 'BFC',  # Bourgogne-Franche-Comté
        '28': 'NOR',  # Normandie
        '32': 'HDF',  # Hauts-de-France
        '44': 'GES',  # Grand Est
        '52': 'PDL',  # Pays de la Loire
        '53': 'BRE',  # Bretagne
        '75': 'NAQ',  # Nouvelle-Aquitaine
        '76': 'OCC',  # Occitanie
        '84': 'ARA',  # Auvergne-Rhône-Alpes
        '93': 'PAC',  # Provence-Alpes-Côte d'Azur
        '94': 'COR',  # Corse
        # Overseas
        '01': 'GP',  # Guadeloupe
        '02': 'MQ',  # Martinique
        '03': 'GF',  # Guyane
        '04': 'RE',  # La Réunion
        '06': 'YT',  # Mayotte
    },
    # United Kingdom (GB)
    'GB': {
        'ENG': 'EN',  # England
        'SCT': 'SC',  # Scotland
        'WLS': 'WA',  # Wales
        'NIR': 'NI',  # Northern Ireland
    },
    # Spain (ES) - Comunidades Autónomas
    'ES': {
        '01': 'AN',  # Andalucía
        '02': 'AR',  # Aragón
        '03': 'AS',  # Asturias
        '04': 'IB',  # Illes Balears
        '05': 'CN',  # Canarias
        '06': 'CB',  # Cantabria
        '07': 'CL',  # Castilla y León
        '08': 'CM',  # Castilla-La Mancha
        '09': 'CT',  # Cataluña
        '10': 'VC',  # Comunitat Valenciana
        '11': 'EX',  # Extremadura
        '12': 'GA',  # Galicia
        '13': 'MD',  # Madrid
        '14': 'MC',  # Murcia
        '15': 'NC',  # Navarra
        '16': 'PV',  # País Vasco
        '17': 'RI',  # La Rioja
        '18': 'CE',  # Ceuta
        '19': 'ML',  # Melilla
        # GeoNames alternate codes
        '29': 'MD',  # Madrid (alternate)
        '34': 'AV',  # Ávila (Castilla y León)
        '55': 'SA',  # Salamanca
        '56': 'GI',  # Girona (Cataluña)
        '58': 'OR',  # Ourense (Galicia)
    },
    # Italy (IT) - Regioni
    'IT': {
        '01': 'PIE',  # Piemonte
        '02': 'VDA',  # Valle d'Aosta
        '03': 'LOM',  # Lombardia
        '04': 'TAA',  # Trentino-Alto Adige
        '05': 'VEN',  # Veneto
        '06': 'FVG',  # Friuli-Venezia Giulia
        '07': 'LIG',  # Liguria
        '08': 'EMR',  # Emilia-Romagna
        '09': 'TOS',  # Toscana
        '10': 'UMB',  # Umbria
        '11': 'MAR',  # Marche
        '12': 'LAZ',  # Lazio
        '13': 'ABR',  # Abruzzo
        '14': 'MOL',  # Molise
        '15': 'CAM',  # Campania
        '16': 'PUG',  # Puglia
        '17': 'BAS',  # Basilicata
        '18': 'CAL',  # Calabria
        '19': 'SIC',  # Sicilia
        '20': 'SAR',  # Sardegna
    },
    # Switzerland (CH) - Cantons
    'CH': {
        '01': 'AG',  # Aargau
        '02': 'AI',  # Appenzell Innerrhoden
        '03': 'AR',  # Appenzell Ausserrhoden
        '04': 'BE',  # Bern
        '05': 'BL',  # Basel-Landschaft
        '06': 'BS',  # Basel-Stadt
        '07': 'FR',  # Fribourg
        '08': 'GE',  # Genève
        '09': 'GL',  # Glarus
        '10': 'GR',  # Graubünden
        '11': 'JU',  # Jura
        '12': 'LU',  # Luzern
        '13': 'NE',  # Neuchâtel
        '14': 'NW',  # Nidwalden
        '15': 'OW',  # Obwalden
        '16': 'SG',  # St. Gallen
        '17': 'SH',  # Schaffhausen
        '18': 'SO',  # Solothurn
        '19': 'SZ',  # Schwyz
        '20': 'TG',  # Thurgau
        '21': 'TI',  # Ticino
        '22': 'UR',  # Uri
        '23': 'VD',  # Vaud
        '24': 'VS',  # Valais
        '25': 'ZG',  # Zug
        '26': 'ZH',  # Zürich
    },
    # Austria (AT) - Bundesländer
    'AT': {
        '01': 'B',   # Burgenland
        '02': 'K',   # Kärnten
        '03': 'NO',  # Niederösterreich
        '04': 'OO',  # Oberösterreich
        '05': 'S',   # Salzburg
        '06': 'ST',  # Steiermark
        '07': 'T',   # Tirol
        '08': 'V',   # Vorarlberg
        '09': 'W',   # Wien
    },
    # Belgium (BE) - Regions and provinces
    'BE': {
        'BRU': 'BRU',  # Brussels-Capital Region
        'VLG': 'VLG',  # Flemish Region
        'WAL': 'WAL',  # Walloon Region
        'VAN': 'VAN',  # Antwerpen
        'VBR': 'VBR',  # Vlaams-Brabant
        'VLI': 'VLI',  # Limburg
        'VOV': 'VOV',  # Oost-Vlaanderen
        'VWV': 'VWV',  # West-Vlaanderen
        'WBR': 'WBR',  # Brabant wallon
        'WHT': 'WHT',  # Hainaut
        'WLG': 'WLG',  # Liège
        'WLX': 'WLX',  # Luxembourg
        'WNA': 'WNA',  # Namur
    },
    # Poland (PL) - Voivodeships
    'PL': {
        '72': 'DS',  # Dolnośląskie
        '73': 'KP',  # Kujawsko-pomorskie
        '74': 'LU',  # Lubelskie
        '75': 'LB',  # Lubuskie
        '76': 'LD',  # Łódzkie
        '77': 'MA',  # Małopolskie
        '78': 'MZ',  # Mazowieckie
        '79': 'OP',  # Opolskie
        '80': 'PK',  # Podkarpackie
        '81': 'PD',  # Podlaskie
        '82': 'PM',  # Pomorskie
        '83': 'SL',  # Śląskie
        '84': 'SK',  # Świętokrzyskie
        '85': 'WN',  # Warmińsko-mazurskie
        '86': 'WP',  # Wielkopolskie
        '87': 'ZP',  # Zachodniopomorskie
    },
    # South Korea (KR) - Provinces and Special Cities
    'KR': {
        '01': 'SO',  # Seoul
        '02': 'BS',  # Busan
        '03': 'DG',  # Daegu
        '04': 'IC',  # Incheon
        '05': 'GJ',  # Gwangju
        '06': 'DJ',  # Daejeon
        '07': 'US',  # Ulsan
        '08': 'GG',  # Gyeonggi-do
        '09': 'GW',  # Gangwon-do
        '10': 'CB',  # Chungcheongbuk-do
        '11': 'CN',  # Chungcheongnam-do
        '12': 'JB',  # Jeollabuk-do
        '13': 'JN',  # Jeollanam-do
        '14': 'GB',  # Gyeongsangbuk-do
        '15': 'GN',  # Gyeongsangnam-do
        '16': 'JJ',  # Jeju-do
        '17': 'SJ',  # Sejong
    },
    # Japan (JP) - Prefectures (partial, major ones)
    'JP': {
        '01': 'HKD',  # Hokkaido
        '02': 'AOM',  # Aomori
        '04': 'MYG',  # Miyagi
        '07': 'FKS',  # Fukushima
        '08': 'IBR',  # Ibaraki
        '09': 'TCG',  # Tochigi
        '10': 'GNM',  # Gunma
        '11': 'SIT',  # Saitama
        '12': 'CHB',  # Chiba
        '13': 'TKY',  # Tokyo
        '14': 'KNG',  # Kanagawa
        '15': 'NGT',  # Niigata
        '17': 'ISK',  # Ishikawa
        '20': 'NGN',  # Nagano
        '21': 'GIF',  # Gifu
        '22': 'SZO',  # Shizuoka
        '23': 'AIC',  # Aichi
        '24': 'MIE',  # Mie
        '25': 'SHG',  # Shiga
        '26': 'KYT',  # Kyoto
        '27': 'OSK',  # Osaka
        '28': 'HYG',  # Hyogo
        '29': 'NAR',  # Nara
        '30': 'WKY',  # Wakayama
        '31': 'TTR',  # Tottori
        '32': 'SMN',  # Shimane
        '33': 'OKY',  # Okayama
        '34': 'HRS',  # Hiroshima
        '35': 'YMG',  # Yamaguchi
        '36': 'TKS',  # Tokushima
        '37': 'KGW',  # Kagawa
        '38': 'EHM',  # Ehime
        '39': 'KOC',  # Kochi
        '40': 'FKO',  # Fukuoka
        '41': 'SAG',  # Saga
        '42': 'NGS',  # Nagasaki
        '43': 'KMM',  # Kumamoto
        '44': 'OIT',  # Oita
        '45': 'MYZ',  # Miyazaki
        '46': 'KGS',  # Kagoshima
        '47': 'OKN',  # Okinawa
    },
    # China (CN) - Provinces (partial)
    # NOTE(review): several distinct provinces share a target code here
    # (HN for Henan and Hunan, HB for Hebei and Hubei), so GHCIDs cannot tell
    # them apart. Left unchanged because existing GHCIDs depend on these
    # values -- confirm against ISO 3166-2:CN before altering.
    'CN': {
        '01': 'AH',  # Anhui
        '02': 'ZJ',  # Zhejiang
        '03': 'JX',  # Jiangxi
        '04': 'JS',  # Jiangsu
        '05': 'JL',  # Jilin
        '06': 'QH',  # Qinghai
        '07': 'FJ',  # Fujian
        '08': 'HI',  # Heilongjiang
        '09': 'HN',  # Henan
        '10': 'HB',  # Hebei
        '11': 'HN',  # Hunan
        '12': 'HB',  # Hubei
        '13': 'XZ',  # Tibet (Xizang)
        '14': 'XZ',  # Tibet (alternate)
        '15': 'GS',  # Gansu
        '16': 'GZ',  # Guizhou
        '18': 'SC',  # Sichuan
        '19': 'YN',  # Yunnan
        '20': 'HL',  # Hainan
        '21': 'TW',  # Taiwan (claimed)
        '22': 'BJ',  # Beijing
        '23': 'SH',  # Shanghai
        '25': 'NM',  # Inner Mongolia (Nei Mongol)
        '26': 'NX',  # Ningxia
        '28': 'XJ',  # Xinjiang
        '30': 'GD',  # Guangdong
        '31': 'HK',  # Hong Kong
        '32': 'MO',  # Macau
        '33': 'TJ',  # Tianjin
        '36': 'SX',  # Shaanxi
        '37': 'SD',  # Shandong
    },
    # Australia (AU) - States and Territories
    'AU': {
        '01': 'ACT',  # Australian Capital Territory
        '02': 'NSW',  # New South Wales
        '03': 'NT',   # Northern Territory
        '04': 'QLD',  # Queensland
        '05': 'SA',   # South Australia
        '06': 'TAS',  # Tasmania
        '07': 'VIC',  # Victoria
        '08': 'WA',   # Western Australia
    },
    # Canada (CA) - Provinces and Territories
    'CA': {
        '01': 'AB',  # Alberta
        '02': 'BC',  # British Columbia
        '03': 'MB',  # Manitoba
        '04': 'NB',  # New Brunswick
        '05': 'NL',  # Newfoundland and Labrador
        '07': 'NS',  # Nova Scotia
        '08': 'ON',  # Ontario
        '09': 'PE',  # Prince Edward Island
        '10': 'QC',  # Quebec
        '11': 'SK',  # Saskatchewan
        '12': 'NT',  # Northwest Territories
        '13': 'NU',  # Nunavut
        '14': 'YT',  # Yukon
    },
    # Brazil (BR) - States
    # NOTE(review): the original literal listed key '11' twice (first 'MA'
    # for Maranhão, then 'MS'); Python kept the last one, so Maranhão has
    # never had a mapping. The dead entry is removed here; its correct
    # GeoNames code still needs to be confirmed and added.
    'BR': {
        '01': 'AC',  # Acre
        '02': 'AL',  # Alagoas
        '03': 'AP',  # Amapá
        '04': 'AM',  # Amazonas
        '05': 'BA',  # Bahia
        '06': 'CE',  # Ceará
        '07': 'DF',  # Distrito Federal
        '08': 'ES',  # Espírito Santo
        '29': 'GO',  # Goiás
        '14': 'MT',  # Mato Grosso
        '11': 'MS',  # Mato Grosso do Sul
        '15': 'MG',  # Minas Gerais
        '16': 'PA',  # Pará
        '17': 'PB',  # Paraíba
        '18': 'PR',  # Paraná
        '19': 'PE',  # Pernambuco
        '20': 'PI',  # Piauí
        '21': 'RJ',  # Rio de Janeiro
        '22': 'RN',  # Rio Grande do Norte
        '23': 'RS',  # Rio Grande do Sul
        '24': 'RO',  # Rondônia
        '25': 'RR',  # Roraima
        '26': 'SC',  # Santa Catarina
        '27': 'SP',  # São Paulo
        '28': 'SE',  # Sergipe
        '31': 'TO',  # Tocantins
    },
    # India (IN) - States (partial, major ones)
    # NOTE(review): the original literal listed key '07' twice (first 'GJ'
    # for Gujarat, then 'DL'); Python kept the last one, so Gujarat has never
    # had a mapping. The dead entry is removed here; its correct GeoNames
    # code still needs to be confirmed and added.
    'IN': {
        '01': 'AP',  # Andhra Pradesh
        '02': 'AR',  # Arunachal Pradesh
        '03': 'AS',  # Assam
        '04': 'BR',  # Bihar
        '05': 'CT',  # Chhattisgarh
        '06': 'GA',  # Goa
        '08': 'HR',  # Haryana
        '09': 'HP',  # Himachal Pradesh
        '10': 'JK',  # Jammu and Kashmir
        '11': 'JH',  # Jharkhand
        '12': 'KA',  # Karnataka
        '13': 'KL',  # Kerala
        '14': 'MP',  # Madhya Pradesh
        '15': 'MH',  # Maharashtra
        '16': 'MN',  # Manipur
        '17': 'ML',  # Meghalaya
        '18': 'MZ',  # Mizoram
        '19': 'NL',  # Nagaland
        '20': 'OD',  # Odisha
        '21': 'PB',  # Punjab
        '22': 'RJ',  # Rajasthan
        '23': 'SK',  # Sikkim
        '24': 'TN',  # Tamil Nadu
        '25': 'TS',  # Telangana
        '26': 'TR',  # Tripura
        '27': 'UP',  # Uttar Pradesh
        '28': 'UK',  # Uttarakhand
        '29': 'WB',  # West Bengal
        '07': 'DL',  # Delhi (National Capital Territory)
        '36': 'VA',  # Varanasi region (approximation)
    },
    # Russia (RU) - Federal subjects (partial)
    'RU': {
        '01': 'AD',   # Adygea
        '48': 'MOW',  # Moscow
        '66': 'SPE',  # Saint Petersburg
        '47': 'MOS',  # Moscow Oblast
        '78': 'LEN',  # Leningrad Oblast
    },
    # Turkey (TR) - Provinces
    'TR': {
        '01': 'AD',  # Adana
        '06': 'AN',  # Ankara
        '07': 'AN',  # Antalya
        '16': 'BU',  # Bursa
        '32': 'IS',  # Istanbul
        '34': 'IS',  # Istanbul (alternate)
        '38': 'KA',  # Kayseri
        '68': 'AN',  # Ankara (alternate)
        '71': 'KO',  # Konya
    },
    # Sweden (SE) - Counties (län)
    'SE': {
        '01': 'AB',  # Stockholm
        '03': 'C',   # Uppsala
        '04': 'D',   # Södermanland
        '05': 'E',   # Östergötland
        '06': 'F',   # Jönköping
        '07': 'G',   # Kronoberg
        '08': 'H',   # Kalmar
        '09': 'I',   # Gotland
        '10': 'K',   # Blekinge
        '12': 'M',   # Skåne
        '13': 'N',   # Halland
        '14': 'O',   # Västra Götaland
        '17': 'S',   # Värmland
        '18': 'T',   # Örebro
        '19': 'U',   # Västmanland
        '20': 'W',   # Dalarna
        '21': 'X',   # Gävleborg
        '22': 'Y',   # Västernorrland
        '23': 'Z',   # Jämtland
        '24': 'AC',  # Västerbotten
        '25': 'BD',  # Norrbotten
        '26': 'AB',  # Stockholm (alternate)
    },
    # Norway (NO) - Counties (fylker)
    'NO': {
        '02': 'VK',  # Viken
        '03': 'OS',  # Oslo
        '09': 'AG',  # Agder
        '11': 'RO',  # Rogaland
        '12': 'VL',  # Vestland
        '15': 'MR',  # Møre og Romsdal
        '18': 'NO',  # Nordland
        '50': 'TR',  # Trøndelag
        '54': 'TF',  # Troms og Finnmark
    },
    # Denmark (DK) - Regions
    'DK': {
        '17': 'HS',  # Hovedstaden
        '18': 'MJ',  # Midtjylland
        '19': 'NJ',  # Nordjylland
        '20': 'SJ',  # Sjælland
        '21': 'SD',  # Syddanmark
    },
    # Finland (FI) - Regions (maakunta)
    'FI': {
        '01': 'UU',  # Uusimaa (Helsinki)
        '02': 'VS',  # Varsinais-Suomi
        '03': 'SA',  # Satakunta
        '05': 'PI',  # Pirkanmaa
        '06': 'PH',  # Päijät-Häme
        '07': 'KY',  # Kymenlaakso
        '08': 'EK',  # Etelä-Karjala
        '09': 'ES',  # Etelä-Savo
        '10': 'PS',  # Pohjois-Savo
        '11': 'PK',  # Pohjois-Karjala
        '12': 'KE',  # Keski-Suomi
        '13': 'EP',  # Etelä-Pohjanmaa
        '14': 'PO',  # Pohjanmaa
        '15': 'KP',  # Keski-Pohjanmaa
        '16': 'PP',  # Pohjois-Pohjanmaa
        '17': 'KA',  # Kainuu
        '18': 'LA',  # Lappi
        '19': 'AX',  # Åland
    },
    # Ireland (IE) - Provinces
    'IE': {
        'C': 'C',  # Connacht
        'L': 'L',  # Leinster
        'M': 'M',  # Munster
        'U': 'U',  # Ulster (IE portion)
    },
    # Czech Republic (CZ) - Regions (kraje)
    'CZ': {
        '52': 'PR',  # Prague
        '78': 'JM',  # Jihomoravský (South Moravian)
        '79': 'OL',  # Olomoucký
        '80': 'MS',  # Moravskoslezský
        '81': 'PL',  # Plzeňský
        '82': 'KA',  # Karlovarský
        '83': 'UL',  # Ústecký
        '84': 'LI',  # Liberecký
        '85': 'HK',  # Královéhradecký
        '86': 'PA',  # Pardubický
        '87': 'VY',  # Vysočina
        '88': 'SC',  # Středočeský
        '89': 'JC',  # Jihočeský
        '90': 'ZL',  # Zlínský
    },
    # Hungary (HU) - Counties (megye)
    'HU': {
        '01': 'BK',  # Bács-Kiskun
        '02': 'BA',  # Baranya
        '03': 'BE',  # Békés
        '04': 'BZ',  # Borsod-Abaúj-Zemplén
        '05': 'BU',  # Budapest
        '06': 'CS',  # Csongrád-Csanád
        '07': 'FE',  # Fejér
        '08': 'GS',  # Győr-Moson-Sopron
        '09': 'HB',  # Hajdú-Bihar
        '10': 'HE',  # Heves
        '11': 'JN',  # Jász-Nagykun-Szolnok
        '12': 'KO',  # Komárom-Esztergom
        '13': 'NO',  # Nógrád
        '14': 'PE',  # Pest
        '15': 'SO',  # Somogy
        '16': 'SZ',  # Szabolcs-Szatmár-Bereg
        '17': 'TO',  # Tolna
        '18': 'VA',  # Vas
        '19': 'VE',  # Veszprém
        '20': 'ZA',  # Zala
    },
    # Portugal (PT) - Districts
    'PT': {
        '01': 'AV',  # Aveiro
        '02': 'BE',  # Beja
        '03': 'BR',  # Braga
        '04': 'BG',  # Bragança
        '05': 'CB',  # Castelo Branco
        '06': 'CO',  # Coimbra
        '07': 'EV',  # Évora
        '08': 'FA',  # Faro
        '09': 'GU',  # Guarda
        '10': 'LE',  # Leiria
        '11': 'LI',  # Lisboa
        '12': 'PO',  # Portalegre
        '13': 'PT',  # Porto
        '14': 'LI',  # Lisboa (alternate)
        '15': 'SA',  # Santarém
        '16': 'SE',  # Setúbal
        '17': 'VC',  # Viana do Castelo
        '18': 'VR',  # Vila Real
        '19': 'VI',  # Viseu
        '20': 'AC',  # Açores
        '21': 'MA',  # Madeira
    },
    # Mexico (MX) - States
    'MX': {
        '01': 'AG',   # Aguascalientes
        '02': 'BC',   # Baja California
        '03': 'BS',   # Baja California Sur
        '04': 'CM',   # Campeche
        '05': 'CO',   # Coahuila
        '06': 'CL',   # Colima
        '07': 'CS',   # Chiapas
        '08': 'CH',   # Chihuahua
        '09': 'CMX',  # Ciudad de México
        '10': 'DG',   # Durango
        '11': 'GT',   # Guanajuato
        '12': 'GR',   # Guerrero
        '13': 'HG',   # Hidalgo
        '14': 'JA',   # Jalisco
        '15': 'EM',   # Estado de México
        '16': 'MI',   # Michoacán
        '17': 'MO',   # Morelos
        '18': 'NA',   # Nayarit
        '19': 'NL',   # Nuevo León
        '20': 'OA',   # Oaxaca
        '21': 'PU',   # Puebla
        '22': 'QT',   # Querétaro
        '23': 'QR',   # Quintana Roo
        '24': 'SL',   # San Luis Potosí
        '25': 'SI',   # Sinaloa
        '26': 'SO',   # Sonora
        '27': 'TB',   # Tabasco
        '28': 'TM',   # Tamaulipas
        '29': 'TL',   # Tlaxcala
        '30': 'VE',   # Veracruz
        '31': 'YU',   # Yucatán
        '32': 'ZA',   # Zacatecas
    },
    # Argentina (AR) - Provinces
    'AR': {
        '01': 'BA',  # Buenos Aires Province
        '02': 'CA',  # Catamarca
        '03': 'CH',  # Chaco
        '04': 'CT',  # Chubut
        '05': 'CB',  # Córdoba
        '06': 'CR',  # Corrientes
        '07': 'CF',  # Ciudad Autónoma de Buenos Aires
        '08': 'ER',  # Entre Ríos
        '09': 'FM',  # Formosa
        '10': 'JY',  # Jujuy
        '11': 'LP',  # La Pampa
        '12': 'LR',  # La Rioja
        '13': 'MZ',  # Mendoza
        '14': 'MN',  # Misiones
        '15': 'NQ',  # Neuquén
        '16': 'RN',  # Río Negro
        '17': 'SA',  # Salta
        '18': 'SJ',  # San Juan
        '19': 'SL',  # San Luis
        '20': 'SC',  # Santa Cruz
        '21': 'SF',  # Santa Fe
        '22': 'SE',  # Santiago del Estero
        '23': 'TF',  # Tierra del Fuego
        '24': 'TM',  # Tucumán
    },
    # New Zealand (NZ) - Regions
    'NZ': {
        'G2': 'WGN',  # Wellington
        'F7': 'AUK',  # Auckland
        'E7': 'WKO',  # Waikato
        'F4': 'BOP',  # Bay of Plenty
        'G1': 'TAS',  # Tasman
        'F6': 'MWT',  # Manawatū-Whanganui
        'F3': 'HKB',  # Hawke's Bay
        'E8': 'CAN',  # Canterbury
        'F9': 'OTA',  # Otago
        'G3': 'STL',  # Southland
    },
    # South Africa (ZA) - Provinces
    'ZA': {
        '01': 'EC',   # Eastern Cape
        '02': 'FS',   # Free State
        '03': 'GT',   # Gauteng
        '04': 'KZN',  # KwaZulu-Natal
        '05': 'LP',   # Limpopo
        '06': 'MP',   # Mpumalanga
        '07': 'NC',   # Northern Cape
        '08': 'NW',   # North West
        '09': 'WC',   # Western Cape
        '11': 'WC',   # Western Cape (alternate - Cape Town)
    },
}


def get_region_code(country_code: str, admin1_code: str) -> str:
    """Get the region code for a GeoNames admin1 code.

    For countries present in ADMIN1_TO_REGION the mapped code is returned, or
    'XX' when the admin1 code is not in that country's table. For any other
    country the first two characters of *admin1_code* are upper-cased and
    used directly, or 'XX' when it is empty.
    """
    country_table = ADMIN1_TO_REGION.get(country_code)
    if country_table is not None:
        return country_table.get(admin1_code, 'XX')
    # Default: use admin1_code directly or XX
    return admin1_code[:2].upper() if admin1_code else 'XX'


def _remove_none(value: Any) -> Any:
    """Recursively drop dict entries whose value is None (list items are kept)."""
    if isinstance(value, dict):
        return {k: _remove_none(v) for k, v in value.items() if v is not None}
    if isinstance(value, list):
        return [_remove_none(item) for item in value]
    return value


def _parse_wkt_point(coordinates: str) -> Optional[Tuple[float, float]]:
    """Parse a "Point(lon lat)" string into (lat, lon); None if malformed."""
    match = re.match(r'Point\(([^\s]+)\s+([^\)]+)\)', coordinates)
    if not match:
        return None
    try:
        lon, lat = float(match.group(1)), float(match.group(2))
    except ValueError:
        # A malformed number should not discard the whole custodian record.
        return None
    return lat, lon


def create_custodian_yaml(wikidata_id: str, info: Dict[str, Any], dry_run: bool = False) -> Optional[Path]:
    """Create a custodian YAML file from MoW-derived Wikidata data.

    Builds a GHCID of the form COUNTRY-REGION-CITY-TYPE-ABBREV from *info*
    (keys read: name, country, city, coordinates, instance_of, all_types,
    inscriptions), derives its UUID/SHA-256/numeric identifiers, and writes
    the record to CUSTODIAN_DIR/<ghcid>.yaml. If that file already exists, a
    slug of the name is appended to the GHCID and the identifiers are derived
    from the suffixed GHCID.

    Returns the target path (also in dry-run mode, where nothing is written),
    or None when *info* has no name.

    Raises:
        RuntimeError: if a file must be written but PyYAML is not installed.
    """
    name = info.get('name', '')
    if not name:
        print(f" Skipping {wikidata_id}: no name")
        return None

    country = info.get('country', '')
    city = info.get('city', '')
    coordinates = info.get('coordinates', '')
    instance_of = info.get('instance_of', '')
    all_types = info.get('all_types', [])
    inscriptions = info.get('inscriptions') or []

    # Get country code
    country_code = get_country_code(country)
    if country_code == "XX":
        print(f" Warning: Unknown country '{country}' for {wikidata_id}")

    inst_type = detect_institution_type(instance_of, all_types)
    abbrev = generate_abbreviation(name)

    # Look up city in GeoNames
    region_code = "XX"
    city_code = "XXX"
    geonames_data = None
    if city and country_code != "XX":
        geonames_data = lookup_city_geonames(country_code, city)
        if geonames_data:
            region_code = get_region_code(country_code, geonames_data.get('admin1_code', ''))
            # "or city" also covers a NULL ascii_name column
            city_code = generate_city_code(geonames_data.get('ascii_name') or city)
        else:
            city_code = generate_city_code(city)

    # Generate GHCID and resolve filename collisions *before* deriving the
    # identifiers, so a suffixed record does not share the UUID/numeric ID of
    # the record it collided with.
    ghcid_original = f"{country_code}-{region_code}-{city_code}-{inst_type}-{abbrev}"
    ghcid = ghcid_original
    history_reason = 'Initial GHCID from UNESCO MoW Wikidata data (Dec 2025)'
    if (CUSTODIAN_DIR / f"{ghcid}.yaml").exists():
        name_suffix = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')[:50]
        ghcid = f"{ghcid_original}-{name_suffix}"
        history_reason = 'Name suffix added to resolve GHCID collision'
    filename = f"{ghcid}.yaml"
    filepath = CUSTODIAN_DIR / filename

    ghcid_uuid = generate_ghcid_uuid(ghcid)
    ghcid_sha256 = generate_ghcid_sha256(ghcid)
    ghcid_numeric = generate_ghcid_numeric(ghcid)
    # uuid7 not in standard library; use uuid4 for record_id (time-ordered not required for DB ID)
    record_id = str(uuid.uuid4())
    timestamp = datetime.now(timezone.utc).isoformat()

    # Build the custodian YAML structure
    custodian = {
        'original_entry': {
            'name': name,
            'source': 'UNESCO Memory of the World (via Wikidata)',
            'wikidata_id': wikidata_id,
            'mow_inscriptions': [
                {
                    'wikidata_id': insc.get('wikidata_id'),
                    'name': insc.get('name'),
                    'country': insc.get('country'),
                }
                for insc in inscriptions
            ],
        },
        'entry_index': None,
        'processing_timestamp': timestamp,
        'wikidata_enrichment': {
            'wikidata_entity_id': wikidata_id,
            'wikidata_label_en': name,
            'wikidata_description_en': "Heritage institution holding UNESCO Memory of the World inscribed documents",
            'instance_of': instance_of,
            'all_types': all_types if all_types else None,
        },
        'ghcid': {
            'ghcid_current': ghcid,
            'ghcid_original': ghcid_original,
            'ghcid_uuid': ghcid_uuid,
            'ghcid_uuid_sha256': ghcid_sha256,
            'ghcid_numeric': ghcid_numeric,
            'record_id': record_id,
            'generation_timestamp': timestamp,
            'ghcid_history': [
                {
                    'ghcid': ghcid,
                    'ghcid_numeric': ghcid_numeric,
                    'valid_from': timestamp,
                    'valid_to': None,
                    'reason': history_reason,
                }
            ],
            'location_resolution': {
                'method': 'WIKIDATA_LOCATION',
                'country_code': country_code,
                'country_label': country,
                'region_code': region_code,
                'city_code': city_code,
                'city_label': city,
            },
        },
        'custodian_name': {
            'claim_type': 'custodian_name',
            'claim_value': name,
            'source_type': 'wikidata',
        },
        'unesco_mow_enrichment': {
            'is_mow_custodian': True,
            'inscription_count': len(inscriptions),
            'inscriptions': [
                {
                    'wikidata_id': insc.get('wikidata_id'),
                    'name': insc.get('name'),
                    'inscription_country': insc.get('country'),
                }
                for insc in inscriptions
            ],
            'enrichment_timestamp': timestamp,
            'data_source': 'Wikidata SPARQL (UNESCO has no MoW API)',
        },
    }

    # Add GeoNames data if available
    if geonames_data:
        custodian['ghcid']['location_resolution'].update({
            'geonames_id': geonames_data['geonames_id'],
            'geonames_name': geonames_data['name'],
            'feature_code': geonames_data['feature_code'],
            'population': geonames_data['population'],
            'admin1_code': geonames_data['admin1_code'],
        })
        custodian['ghcid']['geonames_id'] = geonames_data['geonames_id']

    # Add coordinates if available ("Point(lon lat)" format)
    if coordinates:
        point = _parse_wkt_point(coordinates)
        if point is not None:
            lat, lon = point
            custodian['ghcid']['location_resolution']['source_coordinates'] = {
                'latitude': lat,
                'longitude': lon,
                'source': 'wikidata',
            }

    custodian = _remove_none(custodian)

    if dry_run:
        print(f" Would create: {filename}")
        return filepath

    if yaml is None:
        raise RuntimeError("PyYAML is required to write custodian files (pip install pyyaml)")

    # Write YAML file
    CUSTODIAN_DIR.mkdir(parents=True, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(custodian, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f" Created: {filename}")
    return filepath


def main():
    """Command-line entry point.

    Loads MOW_ENRICHED_FILE, creates one custodian file per entry (honouring
    --dry-run and --limit) and prints a summary. Returns 1 when the input
    file is missing or any entry raised an error, otherwise 0.
    """
    parser = argparse.ArgumentParser(description='Create custodian files from MoW Wikidata data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be created')
    parser.add_argument('--limit', type=int, help='Limit number of files to create')
    args = parser.parse_args()

    # Load enriched MoW custodians
    if not MOW_ENRICHED_FILE.exists():
        print(f"Error: {MOW_ENRICHED_FILE} not found. Run the enrichment query first.")
        return 1

    with open(MOW_ENRICHED_FILE, 'r', encoding='utf-8') as f:
        custodians = json.load(f)

    print(f"Loaded {len(custodians)} MoW custodians")

    if args.dry_run:
        print("DRY RUN - no files will be created")

    # Create custodian files
    created = 0
    skipped = 0
    errors = 0

    # A limit of None slices the whole list.
    for wikidata_id, info in list(custodians.items())[:args.limit]:
        # Top-level boundary: one bad record must not abort the whole batch.
        try:
            result = create_custodian_yaml(wikidata_id, info, dry_run=args.dry_run)
        except Exception as e:
            print(f" Error processing {wikidata_id}: {e}")
            errors += 1
            continue
        if result:
            created += 1
        else:
            skipped += 1

    print("\nSummary:")
    print(f" Created: {created}")
    print(f" Skipped: {skipped}")
    print(f" Errors: {errors}")

    # Surface failures to the caller instead of always reporting success.
    return 1 if errors else 0


if __name__ == "__main__":
    sys.exit(main())