glam/scripts/enrich_xxx_placeholders.py
2025-12-17 10:11:56 +01:00

317 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""
Enrich XXX placeholder custodian files with known location data.
This script updates custodian YAML files that have placeholder region/city codes
(XX/XXX) with correct GeoNames-derived location data.
"""
import os
import uuid
import hashlib
import shutil
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Province code mapping: GeoNames admin1_code -> ISO 3166-2:NL province code.
# NOTE(review): codes '08' and '12'-'14' are absent — presumably not assigned
# by GeoNames for the Netherlands; confirm against the GeoNames admin1 table.
ADMIN1_TO_ISO = {
    '01': 'DR',  # Drenthe
    '02': 'FR',  # Friesland
    '03': 'GE',  # Gelderland
    '04': 'GR',  # Groningen
    '05': 'LI',  # Limburg
    '06': 'NB',  # North Brabant
    '07': 'NH',  # North Holland
    '09': 'UT',  # Utrecht
    '10': 'ZE',  # Zeeland
    '11': 'ZH',  # South Holland
    '15': 'OV',  # Overijssel
    '16': 'FL',  # Flevoland
}
# City code lookup used when building GHCIDs.  The convention is the first
# three letters of the ASCII city name, except for the special cases noted
# inline.  Every city referenced by ENRICHMENTS below must have an entry here
# (enrich_file() indexes this dict directly and would raise KeyError otherwise).
CITY_CODES = {
    'Leeuwarden': 'LEE',
    'West-Terschelling': 'TER',  # Use TER for Terschelling island
    'Leiden': 'LEI',
    'Eindhoven': 'EIN',
    'Den Helder': 'DHE',  # D + HE (skips the space in "Den Helder")
    'Breda': 'BRE',
    'Almere Stad': 'ALM',  # Use ALM for Almere
}
# Enrichment data for each placeholder file.  One dict per custodian YAML file
# whose GHCID still carries the XX/XXX region/city placeholders.  Required keys:
#   old_file      - current filename under the custodian data directory
#   emic_name     - institution name (informational; not read by enrich_file)
#   city          - GeoNames city name; must be a key of CITY_CODES
#   geonames_id   - GeoNames feature ID recorded in location_resolution
#   admin1_code   - GeoNames admin1 code; must be a key of ADMIN1_TO_ISO
#   province_name - human-readable province name
#   latitude / longitude - coordinates written into data['location']
#   abbrev        - institution abbreviation used in the GHCID
#   inst_type     - GHCID institution-type letter (e.g. 'M' museum, 'A' archive
#                   — inferred from usage here; confirm against GHCID spec)
# Optional:
#   name_suffix   - collision-disambiguation suffix appended to GHCID/filename
ENRICHMENTS = [
    {
        'old_file': 'NL-XX-XXX-M-FLMD.yaml',
        'emic_name': 'Frysk Letterkundich Museum en Dokumintaasjesintrum',
        'city': 'Leeuwarden',
        'geonames_id': 2751792,
        'admin1_code': '02',
        'province_name': 'Friesland',
        'latitude': 53.2012,
        'longitude': 5.7999,
        'abbrev': 'FLMD',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-BMT.yaml',
        'emic_name': 'Bunker Museum Terschelling',
        'city': 'West-Terschelling',
        'geonames_id': 2744608,
        'admin1_code': '02',
        'province_name': 'Friesland',
        'latitude': 53.3605,
        'longitude': 5.2192,
        'abbrev': 'BMT',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-MLW.yaml',
        'emic_name': 'Museum het Leids Wevershuis',
        'city': 'Leiden',
        'geonames_id': 2751773,
        'admin1_code': '11',
        'province_name': 'South Holland',
        'latitude': 52.1583,
        'longitude': 4.4931,
        'abbrev': 'MLW',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-EMM.yaml',
        'emic_name': 'Stichting Eindhovens Muziek Museum',
        'city': 'Eindhoven',
        'geonames_id': 2756253,
        'admin1_code': '06',
        'province_name': 'North Brabant',
        'latitude': 51.4416,
        'longitude': 5.4697,
        'abbrev': 'EMM',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-HV.yaml',
        'emic_name': 'De Helderse Vallei',
        'city': 'Den Helder',
        'geonames_id': 2757220,
        'admin1_code': '07',
        'province_name': 'North Holland',
        'latitude': 52.9564,
        'longitude': 4.7600,
        'abbrev': 'HV',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-A-ALS.yaml',
        'emic_name': 'Stichting Archief Leids Studentenleven',
        'city': 'Leiden',
        'geonames_id': 2751773,
        'admin1_code': '11',
        'province_name': 'South Holland',
        'latitude': 52.1583,
        'longitude': 4.4931,
        'abbrev': 'ALS',
        'inst_type': 'A',
    },
    {
        'old_file': 'NL-XX-XXX-M-PM-princenhaags_museum.yaml',
        'emic_name': 'Princenhaags Museum',
        'city': 'Breda',  # Princenhage is part of Breda
        'geonames_id': 2758401,
        'admin1_code': '06',
        'province_name': 'North Brabant',
        'latitude': 51.5719,
        'longitude': 4.7683,
        'abbrev': 'PM',
        'inst_type': 'M',
        'name_suffix': 'princenhaags_museum',  # Keep collision suffix
    },
    {
        'old_file': 'NL-XX-XXX-M-DDAMAD.yaml',
        'emic_name': 'Dutch Digital Art Museum Almere (DDAMA)',
        'city': 'Almere Stad',
        'geonames_id': 2759879,
        'admin1_code': '16',
        'province_name': 'Flevoland',
        'latitude': 52.3508,
        'longitude': 5.2647,
        'abbrev': 'DDAMAD',
        'inst_type': 'M',
    },
]
# Namespace UUID for deriving deterministic GHCID UUIDs.  Must stay in sync
# with create_custodians_from_linkedin.py.  (The literal is the RFC 4122 DNS
# namespace, i.e. it equals uuid.NAMESPACE_DNS.)
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return a deterministic name-based UUID (version 5) for a GHCID string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a custom version-8 UUID derived from the SHA-256 of *ghcid_string*.

    The first 16 bytes of the digest become the UUID, with the RFC 4122
    version nibble forced to 8 (custom) and the variant bits forced to 10x.
    """
    raw = bytearray(hashlib.sha256(ghcid_string.encode()).digest()[:16])
    raw[6] = 0x80 | (raw[6] & 0x0F)  # high nibble of byte 6 -> version 8
    raw[8] = 0x80 | (raw[8] & 0x3F)  # top two bits of byte 8 -> variant 10x
    return str(uuid.UUID(bytes=bytes(raw)))
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return a deterministic 64-bit integer ID for a GHCID string.

    Uses the first 8 bytes of the SHA-256 digest, interpreted big-endian.
    """
    digest_prefix = hashlib.sha256(ghcid_string.encode()).digest()[:8]
    return int.from_bytes(digest_prefix, byteorder='big')
def enrich_file(enrichment: dict, base_dir: Path) -> tuple[str, str]:
    """
    Enrich a single XXX placeholder custodian file with location data.

    Loads the YAML file named by enrichment['old_file'], rewrites its
    location / GHCID / provenance sections, writes the result under the new
    GHCID-derived filename, and deletes the old file if the name changed.

    Returns (old_path, new_path) as strings on success; returns (None, None)
    if the old file does not exist (despite the annotation — callers must
    check for None).

    Raises KeyError if enrichment['admin1_code'] or enrichment['city'] is
    missing from ADMIN1_TO_ISO / CITY_CODES, or if the loaded YAML lacks the
    expected ghcid/provenance structure.
    """
    old_path = base_dir / enrichment['old_file']
    if not old_path.exists():
        print(f" ⚠️ File not found: {old_path}")
        return None, None
    # Load existing YAML (safe_load: data files, no arbitrary object construction)
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # Derive codes; KeyError here means the mapping tables above are incomplete
    province_code = ADMIN1_TO_ISO[enrichment['admin1_code']]
    city_code = CITY_CODES[enrichment['city']]
    # Build new GHCID: NL-<province>-<city>-<type>-<abbrev>[-<suffix>]
    if enrichment.get('name_suffix'):
        new_ghcid = f"NL-{province_code}-{city_code}-{enrichment['inst_type']}-{enrichment['abbrev']}-{enrichment['name_suffix']}"
    else:
        new_ghcid = f"NL-{province_code}-{city_code}-{enrichment['inst_type']}-{enrichment['abbrev']}"
    old_ghcid = data['ghcid']['ghcid_current']
    # Generate new identifiers — all three are deterministic functions of the GHCID
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)
    # Single timestamp reused for history close/open, resolution, and provenance
    now = datetime.now(timezone.utc).isoformat()
    # Update location (replaces the whole section, dropping any prior keys)
    data['location'] = {
        'city': enrichment['city'],
        'region': province_code,
        'region_name': enrichment['province_name'],
        'country': 'NL',
        'latitude': enrichment['latitude'],
        'longitude': enrichment['longitude'],
    }
    # Update GHCID section.
    # Close every still-open history entry (valid_to is None) before appending.
    for history_entry in data['ghcid']['ghcid_history']:
        if history_entry['valid_to'] is None:
            history_entry['valid_to'] = now
    # Add new GHCID to history as the now-current open entry
    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': now,
        'valid_to': None,
        'reason': f'Location enrichment: {enrichment["city"]}, {enrichment["province_name"]} (GeoNames ID: {enrichment["geonames_id"]})'
    })
    # Update current GHCID and all derived identifier forms together
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['ghcid_uuid'] = new_uuid
    data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
    data['ghcid']['ghcid_numeric'] = new_numeric
    # Record how the location was resolved (city inferred from the name)
    data['ghcid']['location_resolution'] = {
        'method': 'NAME_INFERENCE',
        'geonames_id': enrichment['geonames_id'],
        'geonames_name': enrichment['city'],
        'city_code': city_code,
        'admin1_code': enrichment['admin1_code'],
        'region_code': province_code,
        'country_code': 'NL',
        'resolution_date': now,
        'notes': f"City inferred from institution name containing '{enrichment['city'].split()[0]}'"
    }
    # Add enrichment note to provenance (now[:10] is the ISO date part)
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    data['provenance']['notes'].append(f"Location enriched from institution name on {now[:10]}")
    # Determine new filename — same pattern as the GHCID, plus .yaml
    if enrichment.get('name_suffix'):
        new_filename = f"NL-{province_code}-{city_code}-{enrichment['inst_type']}-{enrichment['abbrev']}-{enrichment['name_suffix']}.yaml"
    else:
        new_filename = f"NL-{province_code}-{city_code}-{enrichment['inst_type']}-{enrichment['abbrev']}.yaml"
    new_path = base_dir / new_filename
    # Write to new file; key order preserved (sort_keys=False), unicode intact
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    # Remove old file only after the new one is written (and only if renamed)
    if old_path != new_path:
        old_path.unlink()
    # NOTE(review): these prints look like they lost separators (e.g. an arrow
    # between old and new GHCID) in a formatting mangle — verify against VCS.
    print(f"{enrichment['old_file']}")
    print(f"{new_filename}")
    print(f" GHCID: {old_ghcid}{new_ghcid}")
    return str(old_path), str(new_path)
def main():
    """Enrich all configured placeholder files, then report GHCID collisions.

    The custodian data directory defaults to the original maintainer's local
    path but can be overridden with the GLAM_CUSTODIAN_DIR environment
    variable, so the script is runnable on other machines without editing.
    """
    from collections import Counter  # hoisted from mid-function (stdlib)

    # Allow the hard-coded developer path to be overridden via the environment.
    base_dir = Path(os.environ.get(
        'GLAM_CUSTODIAN_DIR',
        '/Users/kempersc/apps/glam/data/custodian',
    ))
    print("=" * 60)
    print("Enriching XXX placeholder files with location data")
    print("=" * 60)
    print()
    # Collect (old_path, new_path) for every file actually enriched;
    # enrich_file returns (None, None) for missing files, which we skip.
    results = []
    for enrichment in ENRICHMENTS:
        old_path, new_path = enrich_file(enrichment, base_dir)
        if old_path and new_path:
            results.append((old_path, new_path))
    print()
    print("=" * 60)
    print(f"Summary: {len(results)} files enriched")
    print("=" * 60)
    # Check for collisions: the filename stem *is* the GHCID by construction,
    # so duplicate stems mean two institutions resolved to the same GHCID.
    print()
    print("Checking for GHCID collisions...")
    all_ghcids = [f.stem for f in base_dir.glob('NL-*.yaml')]
    duplicates = [item for item, count in Counter(all_ghcids).items() if count > 1]
    if duplicates:
        print(f" ⚠️ Found {len(duplicates)} duplicate GHCIDs:")
        for dup in duplicates:
            print(f" - {dup}")
    else:
        print(" ✅ No GHCID collisions found")


if __name__ == '__main__':
    main()