#!/usr/bin/env python3
"""
Fix institutions incorrectly assigned to NL (Netherlands) that are actually in other countries.

These institutions were imported from LinkedIn batch import but have wrong country codes.

Usage:
    python <this script> [custodian_dir]

When ``custodian_dir`` is omitted, ``DEFAULT_CUSTODIAN_DIR`` is used.
"""

import hashlib
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Directory processed when no directory is passed to main() or on the command line.
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# Placeholder GHCID format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
# Applied with fullmatch() so a GHCID with unexpected trailing characters is
# rejected instead of being silently truncated.
_OLD_GHCID_RE = re.compile(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?')

# Non-Dutch institutions to fix
# Verified via Exa web search 2025-12-17
NON_DUTCH_INSTITUTIONS = [
    {
        'old_filename': 'NL-XX-XXX-A-HAEU.yaml',
        'institution_name': 'Historical Archives of the European Union',
        'country': 'IT',
        'country_name': 'Italy',
        'region': 'Tuscany',
        'region_code': '52',  # Italian region code
        'city': 'Firenze',
        'city_code': 'FIR',
        'address': 'Via Bolognese 156, 50139 Firenze, Villa Salviati',
        'source_url': 'https://archives.eui.eu/en/repositories/1',
        'notes': 'Part of European University Institute, Florence'
    },
    {
        'old_filename': 'NL-XX-XXX-A-VZWADEB.yaml',
        'institution_name': 'v.z.w. Archief- en Documentatiecentrum Erfgoed Binnenvaart',
        'country': 'BE',
        'country_name': 'Belgium',
        'region': 'West-Vlaanderen',
        'region_code': 'VWV',
        'city': 'Oudenburg',
        'city_code': 'OUD',
        'address': 'Vaartdijk zuid 11, 8460 Oudenburg (aboard Museumschip Tordino)',
        'source_url': 'http://binnenvaarterfgoed.be/',
        'notes': 'Belgian v.z.w. (vzw = Belgian non-profit), located aboard museum ship'
    },
    {
        'old_filename': 'NL-XX-XXX-M-FM-ford_museum.yaml',
        'institution_name': 'Gerald R. Ford Presidential Museum',
        'country': 'US',
        'country_name': 'United States',
        'region': 'Michigan',
        'region_code': 'MI',
        'city': 'Grand Rapids',
        'city_code': 'GRA',
        'address': '303 Pearl Street NW, Grand Rapids, MI 49504',
        'source_url': 'https://www.fordlibrarymuseum.gov/visit/museum',
        'notes': 'Part of National Archives system, commemorates 38th US President',
        # Update institution_type from M to O (Official Institution - Presidential Library)
        'new_institution_type': 'O',
    },
    {
        'old_filename': 'NL-XX-XXX-M-DAJ.yaml',
        'institution_name': 'Diorama Arsip Jogja',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Daerah Istimewa Yogyakarta',
        'region_code': 'YO',
        'city': 'Bantul',
        'city_code': 'BAN',
        'address': 'LT 1 Gedung DEPO ARSIP, Jl. Janti, Banguntapan, Kabupaten Bantul, Yogyakarta 55198',
        'source_url': 'https://dioramaarsip.jogjaprov.go.id/home',
        'notes': 'Digital archive diorama of Yogyakarta history, opened February 2022',
        # It's actually an Archive (A), not Museum (M)
        'new_institution_type': 'A',
    },
    # Batch 2: Added 2025-12-17 - More Indonesian and Palestinian institutions
    {
        'old_filename': 'NL-XX-XXX-M-MBV.yaml',
        'institution_name': 'Museum Benteng Vredeburg',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Daerah Istimewa Yogyakarta',
        'region_code': 'YO',
        'city': 'Yogyakarta',
        'city_code': 'YOG',
        'address': 'Jl. Margo Mulyo No.6, Ngupasan, Kec. Gondomanan, Kota Yogyakarta 55122',
        'source_url': 'https://forevervacation.com/yogyakarta/museum-benteng-vredeburg',
        'notes': 'Dutch colonial fortress converted to museum in 1992, documents Indonesian independence struggle',
    },
    {
        'old_filename': 'NL-XX-XXX-M-MBP.yaml',
        'institution_name': 'Museum Batik Pekalongan',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Jawa Tengah',
        'region_code': 'JT',  # Central Java
        'city': 'Pekalongan',
        'city_code': 'PEK',
        'address': 'Jl. Jetayu No.1, Pekalongan 51152',
        'source_url': 'https://id.wikipedia.org/wiki/Museum_Batik_Pekalongan',
        'notes': 'UNESCO recognized museum for batik conservation, opened 12 July 2006 by President SBY',
    },
    {
        'old_filename': 'NL-XX-XXX-M-MG.yaml',
        'institution_name': 'Municipality of Gaza',
        'country': 'PS',
        'country_name': 'Palestine',
        'region': 'Gaza Strip',
        'region_code': 'GZ',
        'city': 'Gaza City',
        'city_code': 'GAZ',
        'address': None,  # Address not verifiable due to current situation
        'source_url': 'https://www.gaza-city.org',
        'notes': 'Municipal government, founded 1898. Type corrected from M (Museum) to O (Official Institution)',
        # It's a municipality (government), not a museum
        'new_institution_type': 'O',
    },
]


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate UUID v5 from GHCID string.

    Args:
        ghcid_string: The full GHCID, e.g. ``IT-52-FIR-A-HAEU``.

    Returns:
        The canonical string form of the name-based (SHA-1) UUID.
    """
    # NOTE: this value is identical to uuid.NAMESPACE_DNS. It must not be
    # changed, because every previously generated UUID depends on it.
    GLAM_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(GLAM_NAMESPACE, ghcid_string))


def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from GHCID string.

    Args:
        ghcid_string: The full GHCID.

    Returns:
        The first 8 bytes of the SHA-256 digest of the UTF-8 encoded GHCID,
        interpreted as an unsigned big-endian integer.
    """
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()
    return int.from_bytes(sha256_hash[:8], byteorder='big')


def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 (SHA-256 based) from GHCID string.

    Args:
        ghcid_string: The full GHCID.

    Returns:
        A UUID string built from the first 16 bytes of the SHA-256 digest,
        with the version nibble forced to 8 and the variant bits forced to
        binary ``10``.
    """
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()
    uuid_bytes = bytearray(sha256_hash[:16])
    uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80  # Version 8
    uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80  # Variant
    return str(uuid.UUID(bytes=bytes(uuid_bytes)))


def fix_institution(custodian_dir: Path, inst: dict) -> tuple[str | None, str | None]:
    """Fix a non-Dutch institution and return (old_path, new_path).

    Loads the custodian YAML named by ``inst['old_filename']``, rebuilds its
    GHCID with the corrected country/region/city codes, regenerates the
    derived identifiers, records the correction in the GHCID history and the
    provenance section, writes the record under its new filename and removes
    the old file.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
        inst: One entry of ``NON_DUTCH_INSTITUTIONS``. Required keys:
            ``old_filename``, ``country``, ``country_name``, ``region``,
            ``region_code``, ``city``, ``city_code``. Optional keys:
            ``address``, ``source_url``, ``notes``, ``new_institution_type``.

    Returns:
        ``(old_path, new_path)`` as strings on success. ``(None, None)`` when
        the record was skipped: file missing, no usable GHCID in the file,
        GHCID not in the expected placeholder format, or the target file
        already exists. A message explaining the skip is printed.
    """
    old_path = custodian_dir / inst['old_filename']

    if not old_path.exists():
        print(f" File not found: {old_path}")
        return None, None

    # Load YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file yields None and a damaged record may lack the ghcid
    # mapping; skip such records instead of aborting the whole batch.
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    old_ghcid = ghcid_block.get('ghcid_current') if isinstance(ghcid_block, dict) else None
    if not isinstance(old_ghcid, str):
        print(f" No current GHCID found in: {old_path}")
        return None, None

    # Parse old GHCID to get type and abbreviation
    match = _OLD_GHCID_RE.fullmatch(old_ghcid)
    if not match:
        print(f" Could not parse GHCID: {old_ghcid}")
        return None, None

    inst_type, abbrev, name_suffix = match.groups()  # name_suffix may be None

    # Check if we need to change institution type
    if inst.get('new_institution_type'):
        inst_type = inst['new_institution_type']
        # Also update the institution_type field
        data['institution_type'] = [inst_type]

    # Build new GHCID with correct country
    new_ghcid = f"{inst['country']}-{inst['region_code']}-{inst['city_code']}-{inst_type}-{abbrev}"
    if name_suffix:
        new_ghcid += f"-{name_suffix}"

    new_filename = new_ghcid.replace('/', '_') + '.yaml'
    new_path = custodian_dir / new_filename

    # Never clobber a different, already existing record that happens to
    # resolve to the same GHCID.
    if new_path != old_path and new_path.exists():
        print(f" Target already exists, not overwriting: {new_path}")
        return None, None

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)
    timestamp = datetime.now(timezone.utc).isoformat()

    has_address = bool(inst.get('address'))

    # Update location (replaces the previous, incorrect location entirely)
    data['location'] = {
        'city': inst['city'],
        'region': inst['region'],
        'country': inst['country'],
    }
    if has_address:
        data['location']['address'] = inst['address']

    # Close out old ghcid_history entries. A missing key and an explicit
    # null value are both treated as "no history yet".
    history = ghcid_block.get('ghcid_history') or []
    ghcid_block['ghcid_history'] = history
    for entry in history:
        if entry.get('valid_to') is None:
            entry['valid_to'] = timestamp

    # Add new history entry
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Country code corrected: NL -> {inst['country']} ({inst['country_name']}). "
                  f"Location: {inst['city']}, {inst['region']}"
    })

    # Update current GHCID
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['ghcid_original'] = new_ghcid  # Also update original since NL was wrong
    ghcid_block['ghcid_uuid'] = new_uuid
    ghcid_block['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid_block['ghcid_numeric'] = new_numeric

    # Add location_resolution
    ghcid_block['location_resolution'] = {
        'method': 'EXA_WEB_SEARCH',
        'city_code': inst['city_code'],
        'city_name': inst['city'],
        'region_code': inst['region_code'],
        'region_name': inst['region'],
        'country_code': inst['country'],
        'resolution_date': timestamp,
        'source_url': inst.get('source_url'),
        'notes': inst.get('notes'),
    }

    # Update provenance
    provenance = data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Country code corrected on {timestamp[:10]}: NL was incorrect, "
        f"institution is in {inst['country_name']} ({inst['country']})"
    )

    # Only claim the address was extracted when one was actually supplied.
    claims_extracted = ['country', 'region', 'city']
    if has_address:
        claims_extracted.append('address')

    # Add web search source to provenance
    sources = provenance.setdefault('sources', {})
    sources.setdefault('web_search', []).append({
        'source_type': 'exa_web_search',
        'data_tier': 'TIER_2_VERIFIED',  # Higher tier since we verified country
        'source_url': inst.get('source_url'),
        'extraction_timestamp': timestamp,
        'claims_extracted': claims_extracted,
    })

    # Write updated YAML to new filename
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file
    if old_path != new_path:
        old_path.unlink()

    return str(old_path), str(new_path)


def main(custodian_dir: Path | str | None = None) -> None:
    """Apply every correction in ``NON_DUTCH_INSTITUTIONS`` and print a report.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
            Defaults to ``DEFAULT_CUSTODIAN_DIR`` when ``None``.
    """
    custodian_dir = DEFAULT_CUSTODIAN_DIR if custodian_dir is None else Path(custodian_dir)

    print("=" * 70)
    print("Fixing Non-Dutch Institutions (Country Code Corrections)")
    print("=" * 70)

    success_count = 0

    for inst in NON_DUTCH_INSTITUTIONS:
        print(f"\nProcessing: {inst['old_filename']}")
        print(f" Institution: {inst['institution_name']}")
        print(f" Correction: NL -> {inst['country']} ({inst['country_name']})")
        print(f" Location: {inst['city']}, {inst['region']}")

        old_path, new_path = fix_institution(custodian_dir, inst)

        if old_path and new_path:
            old_name = os.path.basename(old_path)
            new_name = os.path.basename(new_path)
            print(f" Renamed: {old_name}")
            print(f" -> {new_name}")
            success_count += 1

    print("\n" + "=" * 70)
    print(f"Summary: {success_count}/{len(NON_DUTCH_INSTITUTIONS)} institutions corrected")
    print("=" * 70)


if __name__ == '__main__':
    # An optional first command-line argument overrides the default directory.
    main(sys.argv[1] if len(sys.argv) > 1 else None)