#!/usr/bin/env python3
"""
Enrich XXX placeholder custodian files with known location data.

This script updates custodian YAML files that have placeholder region/city codes
(XX/XXX) with correct GeoNames-derived location data.
"""

import os
import uuid
import hashlib
import shutil
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml

# Province code mapping: GeoNames admin1_code -> ISO 3166-2-NL
ADMIN1_TO_ISO = {
    '01': 'DR',  # Drenthe
    '02': 'FR',  # Friesland
    '03': 'GE',  # Gelderland
    '04': 'GR',  # Groningen
    '05': 'LI',  # Limburg
    '06': 'NB',  # North Brabant
    '07': 'NH',  # North Holland
    '09': 'UT',  # Utrecht
    '10': 'ZE',  # Zeeland
    '11': 'ZH',  # South Holland
    '15': 'OV',  # Overijssel
    '16': 'FL',  # Flevoland
}

# City code generation rules (first 3 letters of ASCII name, or special cases)
CITY_CODES = {
    'Leeuwarden': 'LEE',
    'West-Terschelling': 'TER',  # Use TER for Terschelling island
    'Leiden': 'LEI',
    'Eindhoven': 'EIN',
    'Den Helder': 'DHE',  # D + HE
    'Breda': 'BRE',
    'Almere Stad': 'ALM',  # Use ALM for Almere
}

# Enrichment data for each file
ENRICHMENTS = [
    {
        'old_file': 'NL-XX-XXX-M-FLMD.yaml',
        'emic_name': 'Frysk Letterkundich Museum en Dokumintaasjesintrum',
        'city': 'Leeuwarden',
        'geonames_id': 2751792,
        'admin1_code': '02',
        'province_name': 'Friesland',
        'latitude': 53.2012,
        'longitude': 5.7999,
        'abbrev': 'FLMD',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-BMT.yaml',
        'emic_name': 'Bunker Museum Terschelling',
        'city': 'West-Terschelling',
        'geonames_id': 2744608,
        'admin1_code': '02',
        'province_name': 'Friesland',
        'latitude': 53.3605,
        'longitude': 5.2192,
        'abbrev': 'BMT',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-MLW.yaml',
        'emic_name': 'Museum het Leids Wevershuis',
        'city': 'Leiden',
        'geonames_id': 2751773,
        'admin1_code': '11',
        'province_name': 'South Holland',
        'latitude': 52.1583,
        'longitude': 4.4931,
        'abbrev': 'MLW',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-EMM.yaml',
        'emic_name': 'Stichting Eindhovens Muziek Museum',
        'city': 'Eindhoven',
        'geonames_id': 2756253,
        'admin1_code': '06',
        'province_name': 'North Brabant',
        'latitude': 51.4416,
        'longitude': 5.4697,
        'abbrev': 'EMM',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-HV.yaml',
        'emic_name': 'De Helderse Vallei',
        'city': 'Den Helder',
        'geonames_id': 2757220,
        'admin1_code': '07',
        'province_name': 'North Holland',
        'latitude': 52.9564,
        'longitude': 4.7600,
        'abbrev': 'HV',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-A-ALS.yaml',
        'emic_name': 'Stichting Archief Leids Studentenleven',
        'city': 'Leiden',
        'geonames_id': 2751773,
        'admin1_code': '11',
        'province_name': 'South Holland',
        'latitude': 52.1583,
        'longitude': 4.4931,
        'abbrev': 'ALS',
        'inst_type': 'A',
    },
    {
        'old_file': 'NL-XX-XXX-M-PM-princenhaags_museum.yaml',
        'emic_name': 'Princenhaags Museum',
        'city': 'Breda',  # Princenhage is part of Breda
        'geonames_id': 2758401,
        'admin1_code': '06',
        'province_name': 'North Brabant',
        'latitude': 51.5719,
        'longitude': 4.7683,
        'abbrev': 'PM',
        'inst_type': 'M',
        'name_suffix': 'princenhaags_museum',  # Keep collision suffix
    },
    {
        'old_file': 'NL-XX-XXX-M-DDAMAD.yaml',
        'emic_name': 'Dutch Digital Art Museum Almere (DDAMA)',
        'city': 'Almere Stad',
        'geonames_id': 2759879,
        'admin1_code': '16',
        'province_name': 'Flevoland',
        'latitude': 52.3508,
        'longitude': 5.2647,
        'abbrev': 'DDAMAD',
        'inst_type': 'M',
    },
]

# GHCID namespace UUID (same as in create_custodians_from_linkedin.py).
# This value is identical to the standard uuid.NAMESPACE_DNS constant.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Directory scanned by main() when no explicit base_dir is supplied.
DEFAULT_BASE_DIR = Path('/Users/kempersc/apps/glam/data/custodian')


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate deterministic UUID v5 from GHCID string."""
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string))


def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 (custom) from SHA-256 hash.

    The first 16 bytes of the SHA-256 digest of the UTF-8 encoded GHCID are
    used as the UUID payload; the version nibble and variant bits are then
    overwritten in place.
    """
    hash_bytes = bytearray(hashlib.sha256(ghcid_string.encode()).digest()[:16])
    # Set version to 8 (custom) and variant bits
    hash_bytes[6] = (hash_bytes[6] & 0x0F) | 0x80  # Version 8
    hash_bytes[8] = (hash_bytes[8] & 0x3F) | 0x80  # Variant 1
    return str(uuid.UUID(bytes=bytes(hash_bytes)))


def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from SHA-256.

    The result is the first 8 digest bytes read as an unsigned big-endian
    integer, so it may exceed the signed 64-bit range.
    """
    hash_bytes = hashlib.sha256(ghcid_string.encode()).digest()[:8]
    return int.from_bytes(hash_bytes, byteorder='big')


def _build_ghcid(enrichment: dict, province_code: str, city_code: str) -> str:
    """Assemble the GHCID string; the optional name suffix goes last."""
    parts = [
        'NL',
        province_code,
        city_code,
        enrichment['inst_type'],
        enrichment['abbrev'],
    ]
    if enrichment.get('name_suffix'):
        parts.append(enrichment['name_suffix'])
    return '-'.join(parts)


def enrich_file(enrichment: dict, base_dir: Path) -> tuple[Optional[str], Optional[str]]:
    """
    Enrich a single XXX placeholder file with location data.

    The record is loaded from ``base_dir / enrichment['old_file']``, its
    location and GHCID sections are rewritten, and the result is saved under a
    filename derived from the new GHCID. The old file is removed afterwards
    when the name changed.

    Args:
        enrichment: One entry of ENRICHMENTS.
        base_dir: Directory holding the custodian YAML files.

    Returns:
        (old_path, new_path) tuple on success, or (None, None) when the file
        was skipped (source missing, unexpected YAML structure, or the target
        filename is already taken by another file).

    Raises:
        KeyError: If the enrichment's admin1_code or city has no entry in
            ADMIN1_TO_ISO / CITY_CODES.
    """
    old_path = base_dir / enrichment['old_file']

    if not old_path.exists():
        print(f" ⚠️ File not found: {old_path}")
        return None, None

    # Load existing YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty or malformed file would otherwise fail with an opaque
    # TypeError/KeyError further down; skip it explicitly instead.
    if not isinstance(data, dict) or not isinstance(data.get('ghcid'), dict):
        print(f" ⚠️ Unexpected YAML structure (no 'ghcid' mapping): {old_path}")
        return None, None
    ghcid_section = data['ghcid']

    # Derive codes
    province_code = ADMIN1_TO_ISO[enrichment['admin1_code']]
    city_code = CITY_CODES[enrichment['city']]

    # Build new GHCID; the filename is always the GHCID plus ".yaml"
    new_ghcid = _build_ghcid(enrichment, province_code, city_code)
    new_filename = f"{new_ghcid}.yaml"
    new_path = base_dir / new_filename

    # Refuse to clobber a different, already existing record: writing here
    # would destroy that file and then delete the source as well.
    if new_path != old_path and new_path.exists():
        print(f" ⚠️ Target already exists, not overwriting: {new_path}")
        return None, None

    old_ghcid = ghcid_section.get('ghcid_current')

    # Generate new UUIDs
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    now = datetime.now(timezone.utc).isoformat()

    # Update location
    data['location'] = {
        'city': enrichment['city'],
        'region': province_code,
        'region_name': enrichment['province_name'],
        'country': 'NL',
        'latitude': enrichment['latitude'],
        'longitude': enrichment['longitude'],
    }

    # Update GHCID section
    # Mark old GHCID as ended; a missing or null history becomes an empty list
    history = ghcid_section.get('ghcid_history') or []
    ghcid_section['ghcid_history'] = history
    for history_entry in history:
        if history_entry.get('valid_to') is None:
            history_entry['valid_to'] = now

    # Add new GHCID to history
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': now,
        'valid_to': None,
        'reason': f'Location enrichment: {enrichment["city"]}, {enrichment["province_name"]} (GeoNames ID: {enrichment["geonames_id"]})'
    })

    # Update current GHCID
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid_section['ghcid_numeric'] = new_numeric

    # Update location_resolution
    ghcid_section['location_resolution'] = {
        'method': 'NAME_INFERENCE',
        'geonames_id': enrichment['geonames_id'],
        'geonames_name': enrichment['city'],
        'city_code': city_code,
        'admin1_code': enrichment['admin1_code'],
        'region_code': province_code,
        'country_code': 'NL',
        'resolution_date': now,
        'notes': f"City inferred from institution name containing '{enrichment['city'].split()[0]}'"
    }

    # Add enrichment note to provenance (now[:10] is the YYYY-MM-DD date part)
    provenance = data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Location enriched from institution name on {now[:10]}"
    )

    # Write to new file
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file (if different from new)
    if old_path != new_path:
        old_path.unlink()

    print(f" ✅ {enrichment['old_file']}")
    print(f" → {new_filename}")
    print(f" GHCID: {old_ghcid} → {new_ghcid}")

    return str(old_path), str(new_path)


def main(base_dir: Optional[Path] = None):
    """Main entry point.

    Args:
        base_dir: Directory containing the custodian YAML files. Defaults to
            DEFAULT_BASE_DIR when omitted.
    """
    base_dir = DEFAULT_BASE_DIR if base_dir is None else Path(base_dir)

    print("=" * 60)
    print("Enriching XXX placeholder files with location data")
    print("=" * 60)
    print()

    results = []
    for enrichment in ENRICHMENTS:
        old_path, new_path = enrich_file(enrichment, base_dir)
        if old_path and new_path:
            results.append((old_path, new_path))

    print()
    print("=" * 60)
    print(f"Summary: {len(results)} files enriched")
    print("=" * 60)

    # Check for collisions
    # NOTE(review): stems from a single-directory glob are unique by
    # construction, so this count can never exceed 1. The effective collision
    # guard is the "target already exists" check in enrich_file(); confirm
    # whether this post-hoc check should compare file contents instead.
    print()
    print("Checking for GHCID collisions...")
    stem_counts = Counter(f.stem for f in base_dir.glob('NL-*.yaml'))
    duplicates = [item for item, count in stem_counts.items() if count > 1]

    if duplicates:
        print(f" ⚠️ Found {len(duplicates)} duplicate GHCIDs:")
        for dup in duplicates:
            print(f" - {dup}")
    else:
        print(" ✅ No GHCID collisions found")


if __name__ == '__main__':
    main()