glam/scripts/apply_verified_enrichments.py
2025-12-17 11:58:40 +01:00

291 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Apply verified location enrichments to XXX files and rename them.
This script:
1. Updates ghcid_current with the correct region/city codes
2. Updates location with city/region
3. Adds ghcid_history entry
4. Updates provenance notes
5. Renames file to match new GHCID
"""
import yaml
import os
import re
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
# Verified enrichments from Exa web search
# History:
# - Batch 1 (2025-12-17): 4 files - Crypto Museum, Allard Pierson, DPM Rotterdam, Cow Museum
# - Batch 2 (2025-12-17): 8 files - Bierreclame, Cacao, Edah, Flessenscheepjes, Eddie the Eagle, Jopie Huisman, Fortuna, Atlantikwall
# - Batch 3 (2025-12-17): 7 files - Ajax, C1000, Moslim Archief, CODART, Blik Trommel, Klompenmakerij, CultuurSchakel
# - Batch 4 (2025-12-17): 7 files - Autoriteit Persoonsgegevens, Raad voor Cultuur, IJV, Erotisch Museum, Hollands Kaas Museum, Kresse Museum, Van Gogh Museum Enterprises
# - Batch 5 (2025-12-17): 4 files - Huis73, Dutch Directors Guild, Het Kaas Museum (Bodegraven), Stichting Abrahamdag
# - Batch 6 (2025-12-17): 2 files - Museum 1939-1945, Brandkas van Henny
# - Batch 7 (2025-12-17): 5 files - Frans Maas Museum, Museum Buitenlust, Museum De Canonije, Museum Dijkmagazijn De Heul, Museumboerderij Erve Hofman
# - Batch 8 (2025-12-17): 7 files - Museum Janning, Museum Geelvinck Hinlopen Huis, Museumboerderij De Grote Glind, Museum Galerie RAT, Museum Averlo-Frieswijk-Schalkhaar, Museum Ceuclum, Museum van Brabantse Mutsen en Poffers
# Total enriched: 44 files
# Remaining: ~133 NL-XX-XXX files
#
# All previously processed entries have been removed from VERIFIED_ENRICHMENTS.
# Only add new entries that have not been processed yet.
# Queue of location enrichments still to be applied; main() iterates this list
# and passes each dict to apply_enrichment().
#
# Keys of each entry, as used by the code in this file:
#   old_filename  - name of the existing YAML file inside the custodian directory
#   city, region  - written to the record's location and location_resolution
#   region_code   - replaces the "XX" segment of the GHCID
#   city_code     - replaces the "XXX" segment of the GHCID
#   address       - optional (read with .get); stored in location when present
#   source_url    - optional (read with .get); stored in location_resolution and
#                   in the provenance web_search source
#   institution_name, source - not read by the code visible in this file;
#                   presumably kept for human reference (TODO confirm)
VERIFIED_ENRICHMENTS = [
# Batch 8 - 2025-12-17
{
'old_filename': 'NL-XX-XXX-M-MJ-museum_janning.yaml',
'institution_name': 'Museum Janning',
'city': 'Nieuw Schoonebeek',
'region': 'Drenthe',
'region_code': 'DR',
'city_code': 'NIS',
'address': 'Europaweg 143a, 7766 AE Nieuw Schoonebeek',
'source': 'exa_web_search',
'source_url': 'https://www.museumjanning.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MGHH.yaml',
'institution_name': 'Museum Geelvinck Hinlopen Huis',
'city': 'Heerde',
'region': 'Gelderland',
'region_code': 'GE',
'city_code': 'HEE',
'address': 'Kamperweg 23, 8181 CS Heerde',
'source': 'exa_web_search',
'source_url': 'https://geelvinck.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MGG.yaml',
'institution_name': 'Museumboerderij De Grote Glind',
'city': 'Barneveld',
'region': 'Gelderland',
'region_code': 'GE',
'city_code': 'BAR',
'address': 'Scherpenzeelseweg 158, 3772 MG Barneveld',
'source': 'exa_web_search',
'source_url': 'https://www.degroteglind.nl/',
},
# NOTE(review): unlike the other entries, the address below has a postcode
# without a two-letter suffix and the source_url is a generic MapQuest
# homepage rather than an institution page - confirm before relying on it.
{
'old_filename': 'NL-XX-XXX-M-MGR.yaml',
'institution_name': 'Museum Galerie RAT',
'city': 'Den Burg',
'region': 'Noord-Holland',
'region_code': 'NH',
'city_code': 'DEB',
'address': 'Burgwal 20, 1791 Den Burg, Texel',
'source': 'exa_web_search',
'source_url': 'https://www.mapquest.com/',
},
{
'old_filename': 'NL-XX-XXX-M-MAFS.yaml',
'institution_name': 'Museum Averlo-Frieswijk-Schalkhaar',
'city': 'Schalkhaar',
'region': 'Overijssel',
'region_code': 'OV',
'city_code': 'SCK',
'address': 'Frieswijkerweg 7, 7433 RB Schalkhaar',
'source': 'exa_web_search',
'source_url': 'https://www.museum-afs.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MC.yaml',
'institution_name': 'Museum Ceuclum',
'city': 'Cuijk',
'region': 'Noord-Brabant',
'region_code': 'NB',
'city_code': 'CUI',
'address': 'Castellum 1, 5431 EM Cuijk',
'source': 'exa_web_search',
'source_url': 'https://www.museumceuclum.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MBMP.yaml',
'institution_name': 'Museum van Brabantse Mutsen en Poffers',
'city': 'Sint-Oedenrode',
'region': 'Noord-Brabant',
'region_code': 'NB',
'city_code': 'SOR',
'address': 'Kerkstraat 20, 5492 AH Sint-Oedenrode',
'source': 'exa_web_search',
'source_url': 'https://mutsenmuseum.nl/',
},
]
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the name-based (version 5) UUID of a GHCID, as a string.

    The namespace is the fixed UUID below, which is the standard DNS
    namespace, so the result is deterministic for a given GHCID string.
    """
    dns_namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    derived = uuid.uuid5(dns_namespace, ghcid_string)
    return str(derived)
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return an unsigned 64-bit integer derived from a GHCID string.

    The value is the first 8 bytes of the SHA-256 digest of the encoded
    string, read as a big-endian integer.
    """
    hex_digest = hashlib.sha256(ghcid_string.encode()).hexdigest()
    # 16 hex characters == the leading 8 bytes, most significant byte first.
    return int(hex_digest[:16], 16)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a version-8 style UUID built from the SHA-256 of a GHCID.

    The first 16 digest bytes are used; the high nibble of byte 6 is forced
    to 8 (version) and the top two bits of byte 8 are forced to ``10``
    (variant). All other bits come straight from the digest.
    """
    head = hashlib.sha256(ghcid_string.encode()).digest()[:16]
    version_byte = 0x80 | (head[6] & 0x0F)  # Version 8
    variant_byte = 0x80 | (head[8] & 0x3F)  # Variant
    stamped = (
        head[:6]
        + bytes([version_byte])
        + head[7:8]
        + bytes([variant_byte])
        + head[9:]
    )
    return str(uuid.UUID(bytes=stamped))
def apply_enrichment(custodian_dir: Path, enrichment: dict) -> tuple[str | None, str | None]:
"""Apply enrichment to a file and return (old_path, new_path)."""
old_path = custodian_dir / enrichment['old_filename']
if not old_path.exists():
print(f" ❌ File not found: {old_path}")
return None, None
# Load YAML
with open(old_path, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
# Extract current GHCID components
old_ghcid = data['ghcid']['ghcid_current']
# Parse old GHCID to get type and abbreviation
# Format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
match = re.match(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?', old_ghcid)
if not match:
print(f" ❌ Could not parse GHCID: {old_ghcid}")
return None, None
inst_type = match.group(1)
abbrev = match.group(2)
name_suffix = match.group(3) # May be None
# Build new GHCID
new_ghcid = f"NL-{enrichment['region_code']}-{enrichment['city_code']}-{inst_type}-{abbrev}"
if name_suffix:
new_ghcid += f"-{name_suffix}"
# Generate new identifiers
new_uuid = generate_ghcid_uuid(new_ghcid)
new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
new_numeric = generate_ghcid_numeric(new_ghcid)
timestamp = datetime.now(timezone.utc).isoformat()
# Update location
data['location'] = {
'city': enrichment['city'],
'region': enrichment['region'],
'country': 'NL',
}
if enrichment.get('address'):
data['location']['address'] = enrichment['address']
# Update ghcid
old_numeric = data['ghcid'].get('ghcid_numeric', 0)
# Add to ghcid_history - mark old as ended
if 'ghcid_history' not in data['ghcid']:
data['ghcid_history'] = []
# Close out the old entry
for entry in data['ghcid']['ghcid_history']:
if entry.get('valid_to') is None:
entry['valid_to'] = timestamp
# Add new history entry
data['ghcid']['ghcid_history'].append({
'ghcid': new_ghcid,
'ghcid_numeric': new_numeric,
'valid_from': timestamp,
'valid_to': None,
'reason': f"Location enriched via Exa web search - {enrichment['city']}, {enrichment['region']}"
})
# Update current GHCID
data['ghcid']['ghcid_current'] = new_ghcid
data['ghcid']['ghcid_uuid'] = new_uuid
data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
data['ghcid']['ghcid_numeric'] = new_numeric
# Add location_resolution
data['ghcid']['location_resolution'] = {
'method': 'EXA_WEB_SEARCH',
'city_code': enrichment['city_code'],
'city_name': enrichment['city'],
'region_code': enrichment['region_code'],
'region_name': enrichment['region'],
'country_code': 'NL',
'resolution_date': timestamp,
'source_url': enrichment.get('source_url'),
}
# Update provenance notes
if 'provenance' not in data:
data['provenance'] = {}
if 'notes' not in data['provenance']:
data['provenance']['notes'] = []
data['provenance']['notes'].append(
f"Location enriched on {timestamp[:10]} via Exa web search: {enrichment['city']}, {enrichment['region']}"
)
# Add web search source to provenance
if 'sources' not in data['provenance']:
data['provenance']['sources'] = {}
if 'web_search' not in data['provenance']['sources']:
data['provenance']['sources']['web_search'] = []
data['provenance']['sources']['web_search'].append({
'source_type': 'exa_web_search',
'data_tier': 'TIER_3_CROWD_SOURCED',
'source_url': enrichment.get('source_url'),
'extraction_timestamp': timestamp,
'claims_extracted': ['city', 'region', 'address'],
})
# Write updated YAML to new filename
new_filename = new_ghcid.replace('/', '_') + '.yaml'
new_path = custodian_dir / new_filename
with open(new_path, 'w', encoding='utf-8') as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
# Remove old file
if old_path != new_path:
old_path.unlink()
return str(old_path), str(new_path)
def main():
    """Apply every queued enrichment and print a per-file and summary report.

    Each entry of VERIFIED_ENRICHMENTS is passed to apply_enrichment() against
    the hard-coded custodian directory; an entry counts as a success when both
    returned paths are truthy.
    """
    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    separator = "=" * 60

    print(separator)
    print("Applying Verified Location Enrichments")
    print(separator)

    if not VERIFIED_ENRICHMENTS:
        print("\nNo enrichments to process. Add entries to VERIFIED_ENRICHMENTS list.")
        return

    success_count = 0
    for item in VERIFIED_ENRICHMENTS:
        print(f"\nProcessing: {item['old_filename']}")
        print(f"{item['city']}, {item['region']} ({item['region_code']}-{item['city_code']})")

        source_path, target_path = apply_enrichment(custodian_dir, item)
        if not (source_path and target_path):
            # apply_enrichment already reported the failure.
            continue

        print(f" ✅ Renamed: {os.path.basename(source_path)}")
        print(f"{os.path.basename(target_path)}")
        success_count += 1

    print("\n" + separator)
    print(f"Summary: {success_count}/{len(VERIFIED_ENRICHMENTS)} files enriched and renamed")
    print(separator)


if __name__ == '__main__':
    main()