317 lines
9.9 KiB
Python
317 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich XXX placeholder custodian files with known location data.

This script updates custodian YAML files that have placeholder region/city
codes (XX/XXX) with correct GeoNames-derived location data. Each enriched
record is written to a file named after its new GHCID, and the placeholder
file is then removed.
|
|
"""
|
|
|
|
import os
|
|
import uuid
|
|
import hashlib
|
|
import shutil
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
import yaml
|
|
|
|
# Dutch provinces: GeoNames admin1_code -> ISO 3166-2:NL subdivision code
# (the part after "NL-"). Keys are the zero-padded strings GeoNames uses.
# Codes 08, 12, 13 and 14 have no entry here.
ADMIN1_TO_ISO = {
    '01': 'DR',  # Drenthe
    '02': 'FR',  # Friesland
    '03': 'GE',  # Gelderland
    '04': 'GR',  # Groningen
    '05': 'LI',  # Limburg
    '06': 'NB',  # North Brabant
    '07': 'NH',  # North Holland
    '09': 'UT',  # Utrecht
    '10': 'ZE',  # Zeeland
    '11': 'ZH',  # South Holland
    '15': 'OV',  # Overijssel
    '16': 'FL',  # Flevoland
}
|
|
|
|
# Three-letter city component of the GHCID, keyed by GeoNames place name.
# Default rule: first three letters of the ASCII name. Entries that deviate
# from that rule carry a comment explaining the choice.
CITY_CODES = {
    'Leeuwarden': 'LEE',
    'West-Terschelling': 'TER',  # Named after the island (Terschelling), not "WES"
    'Leiden': 'LEI',
    'Eindhoven': 'EIN',
    'Den Helder': 'DHE',  # "D" from Den + "HE" from Helder
    'Breda': 'BRE',
    'Almere Stad': 'ALM',  # Plain Almere code
}
|
|
|
|
# One record per placeholder file to enrich.
#
# Fields:
#   old_file      - current placeholder file name inside the custodian directory
#   emic_name     - institution name
#   city          - place name; must be a key of CITY_CODES
#   geonames_id   - GeoNames ID recorded for that place
#   admin1_code   - GeoNames admin1 code; must be a key of ADMIN1_TO_ISO
#   province_name - province name written to the location block
#   latitude / longitude - coordinates written to the location block
#   abbrev        - institution abbreviation used in the GHCID
#   inst_type     - institution type letter used in the GHCID
#   name_suffix   - optional; appended to the GHCID and file name when present
ENRICHMENTS = [
    # Friesland
    {
        'old_file': 'NL-XX-XXX-M-FLMD.yaml',
        'emic_name': 'Frysk Letterkundich Museum en Dokumintaasjesintrum',
        'city': 'Leeuwarden',
        'geonames_id': 2751792,
        'admin1_code': '02',
        'province_name': 'Friesland',
        'latitude': 53.2012,
        'longitude': 5.7999,
        'abbrev': 'FLMD',
        'inst_type': 'M',
    },
    {
        'old_file': 'NL-XX-XXX-M-BMT.yaml',
        'emic_name': 'Bunker Museum Terschelling',
        'city': 'West-Terschelling',
        'geonames_id': 2744608,
        'admin1_code': '02',
        'province_name': 'Friesland',
        'latitude': 53.3605,
        'longitude': 5.2192,
        'abbrev': 'BMT',
        'inst_type': 'M',
    },
    # South Holland
    {
        'old_file': 'NL-XX-XXX-M-MLW.yaml',
        'emic_name': 'Museum het Leids Wevershuis',
        'city': 'Leiden',
        'geonames_id': 2751773,
        'admin1_code': '11',
        'province_name': 'South Holland',
        'latitude': 52.1583,
        'longitude': 4.4931,
        'abbrev': 'MLW',
        'inst_type': 'M',
    },
    # North Brabant
    {
        'old_file': 'NL-XX-XXX-M-EMM.yaml',
        'emic_name': 'Stichting Eindhovens Muziek Museum',
        'city': 'Eindhoven',
        'geonames_id': 2756253,
        'admin1_code': '06',
        'province_name': 'North Brabant',
        'latitude': 51.4416,
        'longitude': 5.4697,
        'abbrev': 'EMM',
        'inst_type': 'M',
    },
    # North Holland
    {
        'old_file': 'NL-XX-XXX-M-HV.yaml',
        'emic_name': 'De Helderse Vallei',
        'city': 'Den Helder',
        'geonames_id': 2757220,
        'admin1_code': '07',
        'province_name': 'North Holland',
        'latitude': 52.9564,
        'longitude': 4.7600,
        'abbrev': 'HV',
        'inst_type': 'M',
    },
    # South Holland (archive)
    {
        'old_file': 'NL-XX-XXX-A-ALS.yaml',
        'emic_name': 'Stichting Archief Leids Studentenleven',
        'city': 'Leiden',
        'geonames_id': 2751773,
        'admin1_code': '11',
        'province_name': 'South Holland',
        'latitude': 52.1583,
        'longitude': 4.4931,
        'abbrev': 'ALS',
        'inst_type': 'A',
    },
    # North Brabant (file name carries a collision suffix)
    {
        'old_file': 'NL-XX-XXX-M-PM-princenhaags_museum.yaml',
        'emic_name': 'Princenhaags Museum',
        'city': 'Breda',  # Princenhage is part of Breda
        'geonames_id': 2758401,
        'admin1_code': '06',
        'province_name': 'North Brabant',
        'latitude': 51.5719,
        'longitude': 4.7683,
        'abbrev': 'PM',
        'inst_type': 'M',
        'name_suffix': 'princenhaags_museum',  # Keep collision suffix
    },
    # Flevoland
    {
        'old_file': 'NL-XX-XXX-M-DDAMAD.yaml',
        'emic_name': 'Dutch Digital Art Museum Almere (DDAMA)',
        'city': 'Almere Stad',
        'geonames_id': 2759879,
        'admin1_code': '16',
        'province_name': 'Flevoland',
        'latitude': 52.3508,
        'longitude': 5.2647,
        'abbrev': 'DDAMAD',
        'inst_type': 'M',
    },
]
|
|
|
|
# Namespace for name-based GHCID UUIDs (same as in
# create_custodians_from_linkedin.py).
# NOTE: this value is identical to the standard library's uuid.NAMESPACE_DNS.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the version-5 UUID of a GHCID string, in canonical text form.

    The UUID is name-based: it is computed from GHCID_NAMESPACE and
    ``ghcid_string`` only, so equal GHCIDs always map to equal UUIDs.
    """
    name_based = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(name_based)
|
|
|
|
|
|
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a version-8 (custom) UUID built from the SHA-256 of a GHCID.

    The first 16 bytes of the digest become the UUID bytes; the version
    nibble and the variant bits are then overwritten so the value is a
    well-formed UUID. The result is returned in canonical text form.
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    raw = bytearray(digest[:16])

    # High nibble of byte 6 carries the version: force it to 8.
    raw[6] = 0x80 | (raw[6] & 0x0F)
    # Top two bits of byte 8 carry the variant: force them to binary 10.
    raw[8] = 0x80 | (raw[8] & 0x3F)

    return str(uuid.UUID(bytes=bytes(raw)))
|
|
|
|
|
|
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return a 64-bit numeric ID derived from the SHA-256 of a GHCID.

    The first 8 bytes of the digest are read as an unsigned big-endian
    integer, so the result lies in the range 0 .. 2**64 - 1.
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    leading_eight = digest[:8]
    return int.from_bytes(leading_eight, byteorder='big')
|
|
|
|
|
|
def enrich_file(enrichment: dict, base_dir: Path) -> 'tuple[str | None, str | None]':
    """
    Enrich a single XXX placeholder file with location data.

    Loads ``base_dir / enrichment['old_file']``, replaces its ``location``
    block, closes the open GHCID history entries, appends a history entry for
    the new GHCID, refreshes the derived identifiers, records how the
    location was resolved, and writes the result to a file named after the
    new GHCID. The placeholder file is removed once the new file is written.

    Args:
        enrichment: One record of ENRICHMENTS (see the keys read below).
        base_dir: Directory that holds the custodian YAML files.

    Returns:
        ``(old_path, new_path)`` as strings on success, or ``(None, None)``
        when the file was skipped: the placeholder does not exist, it does
        not contain a YAML mapping, or the target file already exists.

    Raises:
        KeyError: if the admin1 code or city has no entry in the lookup
            tables, or the YAML has no ``ghcid`` / ``ghcid_current`` key.
    """
    old_path = base_dir / enrichment['old_file']

    if not old_path.exists():
        print(f" ⚠️ File not found: {old_path}")
        return None, None

    # Derive codes
    province_code = ADMIN1_TO_ISO[enrichment['admin1_code']]
    city_code = CITY_CODES[enrichment['city']]

    # Build new GHCID; the optional suffix is the last dash-separated part.
    ghcid_parts = [
        'NL',
        province_code,
        city_code,
        enrichment['inst_type'],
        enrichment['abbrev'],
    ]
    if enrichment.get('name_suffix'):
        ghcid_parts.append(enrichment['name_suffix'])
    new_ghcid = '-'.join(ghcid_parts)

    # The file is named after the GHCID, so derive it instead of rebuilding it.
    new_filename = f"{new_ghcid}.yaml"
    new_path = base_dir / new_filename

    # Never clobber a different, already existing record: opening the target
    # with 'w' would silently destroy it.
    if new_path != old_path and new_path.exists():
        print(f" ⚠️ Target already exists, not overwriting: {new_path}")
        return None, None

    # Load existing YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty document; anything that is not a
    # mapping cannot be enriched.
    if not isinstance(data, dict):
        print(f" ⚠️ Not a YAML mapping, skipped: {old_path}")
        return None, None

    ghcid_section = data['ghcid']
    old_ghcid = ghcid_section['ghcid_current']

    # Generate new UUIDs
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    # One timestamp for every field written in this run.
    now = datetime.now(timezone.utc).isoformat()

    # Update location
    data['location'] = {
        'city': enrichment['city'],
        'region': province_code,
        'region_name': enrichment['province_name'],
        'country': 'NL',
        'latitude': enrichment['latitude'],
        'longitude': enrichment['longitude'],
    }

    # Update GHCID section
    # Mark old GHCID as ended. A missing or null history list is treated as
    # empty, and an entry without a 'valid_to' key counts as still open.
    history = ghcid_section.get('ghcid_history') or []
    ghcid_section['ghcid_history'] = history
    for history_entry in history:
        if history_entry.get('valid_to') is None:
            history_entry['valid_to'] = now

    # Add new GHCID to history
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': now,
        'valid_to': None,
        'reason': f'Location enrichment: {enrichment["city"]}, {enrichment["province_name"]} (GeoNames ID: {enrichment["geonames_id"]})'
    })

    # Update current GHCID
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid_section['ghcid_numeric'] = new_numeric

    # Update location_resolution
    # NOTE(review): the note quotes the first word of the city, which is not
    # always part of the institution name (e.g. 'Den' for 'De Helderse
    # Vallei'); the text is kept as-is because it is written to the data.
    ghcid_section['location_resolution'] = {
        'method': 'NAME_INFERENCE',
        'geonames_id': enrichment['geonames_id'],
        'geonames_name': enrichment['city'],
        'city_code': city_code,
        'admin1_code': enrichment['admin1_code'],
        'region_code': province_code,
        'country_code': 'NL',
        'resolution_date': now,
        'notes': f"City inferred from institution name containing '{enrichment['city'].split()[0]}'"
    }

    # Add enrichment note to provenance (now[:10] is the YYYY-MM-DD part)
    provenance = data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Location enriched from institution name on {now[:10]}"
    )

    # Write to new file
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file (if different from new); done last so the placeholder
    # survives a failed write.
    if old_path != new_path:
        old_path.unlink()

    print(f" ✅ {enrichment['old_file']}")
    print(f" → {new_filename}")
    print(f" GHCID: {old_ghcid} → {new_ghcid}")

    return str(old_path), str(new_path)
|
|
|
|
|
|
def main(base_dir: 'Path | str | None' = None) -> None:
    """Main entry point.

    Runs enrich_file() for every record in ENRICHMENTS, prints a summary,
    and then reports GHCIDs that more than one enrichment resolved to.

    Args:
        base_dir: Directory holding the custodian YAML files. Defaults to
            the original hard-coded data directory when omitted.
    """
    from collections import Counter

    if base_dir is None:
        base_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    else:
        base_dir = Path(base_dir)

    print("=" * 60)
    print("Enriching XXX placeholder files with location data")
    print("=" * 60)
    print()

    results = []
    for enrichment in ENRICHMENTS:
        old_path, new_path = enrich_file(enrichment, base_dir)
        if old_path and new_path:
            results.append((old_path, new_path))

    print()
    print("=" * 60)
    print(f"Summary: {len(results)} files enriched")
    print("=" * 60)

    # Check for collisions
    print()
    print("Checking for GHCID collisions...")
    # The GHCID doubles as the file stem, and stems inside one directory are
    # unique, so counting stems from a directory listing can never find a
    # duplicate. What can collide is two enrichments resolving to the same
    # target file, so count the targets written in this run instead.
    target_counts = Counter(Path(new_path).stem for _, new_path in results)
    duplicates = [ghcid for ghcid, count in target_counts.items() if count > 1]

    if duplicates:
        print(f" ⚠️ Found {len(duplicates)} duplicate GHCIDs:")
        for dup in duplicates:
            print(f" - {dup}")
    else:
        print(" ✅ No GHCID collisions found")


if __name__ == '__main__':
    main()
|