#!/usr/bin/env python3
"""
Apply verified location enrichments to XXX files and rename them.

This script:
1. Updates ghcid_current with the correct region/city codes
2. Updates location with city/region
3. Adds ghcid_history entry
4. Updates provenance notes
5. Renames file to match new GHCID
"""

import yaml
import os
import re
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path

# Verified enrichments from Exa web search
# History:
# - Batch 1 (2025-12-17): 4 files - Crypto Museum, Allard Pierson, DPM Rotterdam, Cow Museum
# - Batch 2 (2025-12-17): 8 files - Bierreclame, Cacao, Edah, Flessenscheepjes, Eddie the Eagle, Jopie Huisman, Fortuna, Atlantikwall
# - Batch 3 (2025-12-17): 7 files - Ajax, C1000, Moslim Archief, CODART, Blik Trommel, Klompenmakerij, CultuurSchakel
# - Batch 4 (2025-12-17): 7 files - Autoriteit Persoonsgegevens, Raad voor Cultuur, IJV, Erotisch Museum, Hollands Kaas Museum, Kresse Museum, Van Gogh Museum Enterprises
# - Batch 5 (2025-12-17): 4 files - Huis73, Dutch Directors Guild, Het Kaas Museum (Bodegraven), Stichting Abrahamdag
# - Batch 6 (2025-12-17): 2 files - Museum 1939-1945, Brandkas van Henny
# - Batch 7 (2025-12-17): 5 files - Frans Maas Museum, Museum Buitenlust, Museum De Canonije, Museum Dijkmagazijn De Heul, Museumboerderij Erve Hofman
# - Batch 8 (2025-12-17): 7 files - Museum Janning, Museum Geelvinck Hinlopen Huis, Museumboerderij De Grote Glind, Museum Galerie RAT, Museum Averlo-Frieswijk-Schalkhaar, Museum Ceuclum, Museum van Brabantse Mutsen en Poffers
# - Batch 9 (2025-12-17): 3 files - Museum Collectie ter Borg, Museum Dansant, Museum van alles wa
# - Batch 10 (2025-12-17): 6 files - Jocus Museum (Venlo), Museum Meermanno (Den Haag), Museum of the Mind (Haarlem), Museum Mariënkroon (Nieuwkuijk), het MAG museum (Ruurlo), Mineralogisch Museum (Grou)
# - Archived: 3 files - Museum Kempenland (closed 2012), Miniature Museum (UK), Museum Keris (Indonesia)
# Total enriched: 53 files
# Remaining: ~118 NL-XX-XXX files
#
# All previously processed entries have been removed from VERIFIED_ENRICHMENTS.
# Only add new entries that have not been processed yet.
VERIFIED_ENRICHMENTS = [
    # Batch 10 - 2025-12-17
    # Jocus Museum (carnival museum) - Venlo, Limburg
    # Note: File incorrectly named "museum_jocas" - correct name is "Jocus Museum"
    # Address: Dominicanenplein 25, 5911 JG Venlo
    # Oldest carnival society in the Netherlands (founded 1842)
    {
        'old_filename': 'NL-XX-XXX-M-MJ-museum_jocas.yaml',
        'institution_name': 'Jocus Museum (Carnival Museum)',
        'city': 'Venlo',
        'region': 'Limburg',
        'region_code': 'LI',
        'city_code': 'VEN',
        'address': 'Dominicanenplein 25, 5911 JG Venlo',
        'source': 'exa_web_search',
        'source_url': 'https://www.jocusvenlo.nl/',
    },
    # Museum Meermanno (Huis van het boek) - The Hague, Zuid-Holland
    # Address: Prinsessegracht 30, 2514 AP Den Haag
    # National museum of the book
    {
        'old_filename': 'NL-XX-XXX-M-MM-museum_meermanno.yaml',
        'institution_name': 'Museum Meermanno | Huis van het boek',
        'city': 'Den Haag',
        'region': 'Zuid-Holland',
        'region_code': 'ZH',
        'city_code': 'DHA',
        'address': 'Prinsessegracht 30, 2514 AP Den Haag',
        'source': 'exa_web_search',
        'source_url': 'https://www.huisvanhetboek.nl/',
    },
    # Museum of the Mind (Dolhuys) - Haarlem, Noord-Holland
    # Address: Schotersingel 2, 2023 EM Haarlem
    # Museum of the Year 2022
    {
        'old_filename': 'NL-XX-XXX-M-MM-museum_of_the_mind.yaml',
        'institution_name': 'Museum of the Mind (Dolhuys)',
        'city': 'Haarlem',
        'region': 'Noord-Holland',
        'region_code': 'NH',
        'city_code': 'HAA',
        'address': 'Schotersingel 2, 2023 EM Haarlem',
        'source': 'exa_web_search',
        'source_url': 'https://museumvandegeest.nl/',
    },
    # Museum Mariënkroon (Abbey Museum) - Nieuwkuijk, Noord-Brabant
    # Address: Abdijlaan 8, 5253 VP Nieuwkuijk
    # Former Norbertine abbey
    {
        'old_filename': 'NL-XX-XXX-M-MM-museum_marienkroon.yaml',
        'institution_name': 'Museum Mariënkroon',
        'city': 'Nieuwkuijk',
        'region': 'Noord-Brabant',
        'region_code': 'NB',
        'city_code': 'NIE',
        'address': 'Abdijlaan 8, 5253 VP Nieuwkuijk',
        'source': 'exa_web_search',
        'source_url': 'https://marienkroon.nl/',
    },
    # het MAG museum (Maastricht Aardewerk & Glas) - Ruurlo, Gelderland
    # Address: Borculoseweg 2, 7261 BJ Ruurlo
    # Note: Located at Kasteel Ruurlo (formerly Museum MORE annex)
    {
        'old_filename': 'NL-XX-XXX-M-MM-het_mag_museum.yaml',
        'institution_name': 'het MAG museum (Maastricht Aardewerk & Glas)',
        'city': 'Ruurlo',
        'region': 'Gelderland',
        'region_code': 'GE',
        'city_code': 'RUU',
        'address': 'Borculoseweg 2, 7261 BJ Ruurlo',
        'source': 'exa_web_search',
        'source_url': 'http://www.hetmagmuseum.nl/',
    },
    # Mineralogisch Museum - Grou, Friesland
    # Address: Leechlân 22, 9001 ZH Grou
    {
        'old_filename': 'NL-XX-XXX-M-MM-mineralogisch_museum.yaml',
        'institution_name': 'Mineralogisch Museum Grou',
        'city': 'Grou',
        'region': 'Friesland',
        'region_code': 'FR',
        'city_code': 'GRO',
        'address': 'Leechlân 22, 9001 ZH Grou',
        'source': 'exa_web_search',
        'source_url': 'https://mineralengrou.nl/',
    },
]

# Directory main() processes when no explicit directory is passed.
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# Placeholder GHCID layout: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
_PLACEHOLDER_GHCID_RE = re.compile(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?')


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate UUID v5 from GHCID string."""
    # Same value as uuid.NAMESPACE_DNS (DNS namespace).
    GLAM_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(GLAM_NAMESPACE, ghcid_string))


def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from GHCID string.

    The ID is the first 8 bytes of the SHA-256 digest of the string,
    interpreted as a big-endian unsigned integer.
    """
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()
    return int.from_bytes(sha256_hash[:8], byteorder='big')


def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 (SHA-256 based) from GHCID string.

    Takes the first 16 bytes of the SHA-256 digest and overwrites the
    version nibble and the variant bits.
    """
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()
    uuid_bytes = bytearray(sha256_hash[:16])
    uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80  # Version 8
    uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80  # Variant
    return str(uuid.UUID(bytes=bytes(uuid_bytes)))


def _build_new_ghcid(old_ghcid: str, region_code: str, city_code: str) -> str | None:
    """Swap the XX/XXX placeholders of *old_ghcid* for real region/city codes.

    Returns the new GHCID, keeping the institution type, abbreviation and
    optional name suffix of the old one, or None when *old_ghcid* does not
    start with the NL-XX-XXX placeholder layout.
    """
    match = _PLACEHOLDER_GHCID_RE.match(old_ghcid)
    if not match:
        return None
    inst_type, abbrev, name_suffix = match.groups()  # name_suffix may be None
    new_ghcid = f"NL-{region_code}-{city_code}-{inst_type}-{abbrev}"
    if name_suffix:
        new_ghcid += f"-{name_suffix}"
    return new_ghcid


def apply_enrichment(custodian_dir: Path, enrichment: dict) -> tuple[str | None, str | None]:
    """Apply enrichment to a file and return (old_path, new_path).

    Loads ``custodian_dir / enrichment['old_filename']``, replaces its
    ``location``, recomputes the GHCID and its derived identifiers, records
    the change in ``ghcid_history`` and ``provenance``, writes the result to
    a file named after the new GHCID and removes the old file.

    Args:
        custodian_dir: Directory holding the custodian YAML files.
        enrichment: One VERIFIED_ENRICHMENTS entry. Reads ``old_filename``,
            ``city``, ``region``, ``region_code``, ``city_code`` and the
            optional ``address`` and ``source_url``.

    Returns:
        ``(old_path, new_path)`` as strings on success, or ``(None, None)``
        when the file is missing, has no usable ``ghcid.ghcid_current``, or
        the GHCID cannot be parsed. Those cases are reported on stdout.
    """
    old_path = custodian_dir / enrichment['old_filename']
    if not old_path.exists():
        print(f" ❌ File not found: {old_path}")
        return None, None

    # Load YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty file; report and skip instead of
    # crashing the whole batch on a malformed record.
    ghcid = data.get('ghcid') if isinstance(data, dict) else None
    old_ghcid = ghcid.get('ghcid_current') if isinstance(ghcid, dict) else None
    if not isinstance(old_ghcid, str):
        print(f" ❌ Missing ghcid.ghcid_current in: {old_path}")
        return None, None

    new_ghcid = _build_new_ghcid(
        old_ghcid, enrichment['region_code'], enrichment['city_code']
    )
    if new_ghcid is None:
        print(f" ❌ Could not parse GHCID: {old_ghcid}")
        return None, None

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update location (replaces any existing location mapping)
    address = enrichment.get('address')
    data['location'] = {
        'city': enrichment['city'],
        'region': enrichment['region'],
        'country': 'NL',
    }
    if address:
        data['location']['address'] = address

    # The history list lives inside data['ghcid']; create it there when it is
    # missing (or empty/null) so the loop and append below always find it.
    history = ghcid.get('ghcid_history') or []
    ghcid['ghcid_history'] = history

    # Close out any entry that is still open
    for entry in history:
        if entry.get('valid_to') is None:
            entry['valid_to'] = timestamp

    # Add new history entry
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Location enriched via Exa web search - {enrichment['city']}, {enrichment['region']}"
    })

    # Update current GHCID
    ghcid['ghcid_current'] = new_ghcid
    ghcid['ghcid_uuid'] = new_uuid
    ghcid['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid['ghcid_numeric'] = new_numeric

    # Add location_resolution
    ghcid['location_resolution'] = {
        'method': 'EXA_WEB_SEARCH',
        'city_code': enrichment['city_code'],
        'city_name': enrichment['city'],
        'region_code': enrichment['region_code'],
        'region_name': enrichment['region'],
        'country_code': 'NL',
        'resolution_date': timestamp,
        'source_url': enrichment.get('source_url'),
    }

    # Update provenance notes (timestamp[:10] is the YYYY-MM-DD date part)
    provenance = data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Location enriched on {timestamp[:10]} via Exa web search: {enrichment['city']}, {enrichment['region']}"
    )

    # Add web search source to provenance; only claim the address when one
    # was actually supplied and written to the record.
    claims_extracted = ['city', 'region']
    if address:
        claims_extracted.append('address')
    sources = provenance.setdefault('sources', {})
    sources.setdefault('web_search', []).append({
        'source_type': 'exa_web_search',
        'data_tier': 'TIER_3_CROWD_SOURCED',
        'source_url': enrichment.get('source_url'),
        'extraction_timestamp': timestamp,
        'claims_extracted': claims_extracted,
    })

    # Write updated YAML to new filename
    new_filename = new_ghcid.replace('/', '_') + '.yaml'
    new_path = custodian_dir / new_filename
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file
    if old_path != new_path:
        old_path.unlink()

    return str(old_path), str(new_path)


def main(custodian_dir: Path | None = None) -> None:
    """Apply every entry of VERIFIED_ENRICHMENTS and print a summary.

    Args:
        custodian_dir: Directory holding the custodian YAML files; defaults
            to DEFAULT_CUSTODIAN_DIR when omitted.
    """
    if custodian_dir is None:
        custodian_dir = DEFAULT_CUSTODIAN_DIR

    print("=" * 60)
    print("Applying Verified Location Enrichments")
    print("=" * 60)

    if not VERIFIED_ENRICHMENTS:
        print("\nNo enrichments to process. Add entries to VERIFIED_ENRICHMENTS list.")
        return

    success_count = 0
    for enrichment in VERIFIED_ENRICHMENTS:
        print(f"\nProcessing: {enrichment['old_filename']}")
        print(f" → {enrichment['city']}, {enrichment['region']} ({enrichment['region_code']}-{enrichment['city_code']})")

        old_path, new_path = apply_enrichment(custodian_dir, enrichment)

        if old_path and new_path:
            old_name = os.path.basename(old_path)
            new_name = os.path.basename(new_path)
            print(f" ✅ Renamed: {old_name}")
            print(f" → {new_name}")
            success_count += 1

    print("\n" + "=" * 60)
    print(f"Summary: {success_count}/{len(VERIFIED_ENRICHMENTS)} files enriched and renamed")
    print("=" * 60)


if __name__ == '__main__':
    main()