291 lines
11 KiB
Python
291 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Apply verified location enrichments to XXX files and rename them.
|
|
|
|
This script:
|
|
1. Updates ghcid_current with the correct region/city codes
|
|
2. Updates location with city/region
|
|
3. Adds ghcid_history entry
|
|
4. Updates provenance notes
|
|
5. Renames file to match new GHCID
|
|
"""
|
|
|
|
import yaml
|
|
import os
|
|
import re
|
|
import uuid
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Verified enrichments from Exa web search
|
|
# History:
|
|
# - Batch 1 (2025-12-17): 4 files - Crypto Museum, Allard Pierson, DPM Rotterdam, Cow Museum
|
|
# - Batch 2 (2025-12-17): 8 files - Bierreclame, Cacao, Edah, Flessenscheepjes, Eddie the Eagle, Jopie Huisman, Fortuna, Atlantikwall
|
|
# - Batch 3 (2025-12-17): 7 files - Ajax, C1000, Moslim Archief, CODART, Blik Trommel, Klompenmakerij, CultuurSchakel
|
|
# - Batch 4 (2025-12-17): 7 files - Autoriteit Persoonsgegevens, Raad voor Cultuur, IJV, Erotisch Museum, Hollands Kaas Museum, Kresse Museum, Van Gogh Museum Enterprises
|
|
# - Batch 5 (2025-12-17): 4 files - Huis73, Dutch Directors Guild, Het Kaas Museum (Bodegraven), Stichting Abrahamdag
|
|
# - Batch 6 (2025-12-17): 2 files - Museum 1939-1945, Brandkas van Henny
|
|
# - Batch 7 (2025-12-17): 5 files - Frans Maas Museum, Museum Buitenlust, Museum De Canonije, Museum Dijkmagazijn De Heul, Museumboerderij Erve Hofman
|
|
# - Batch 8 (2025-12-17): 7 files - Museum Janning, Museum Geelvinck Hinlopen Huis, Museumboerderij De Grote Glind, Museum Galerie RAT, Museum Averlo-Frieswijk-Schalkhaar, Museum Ceuclum, Museum van Brabantse Mutsen en Poffers
|
|
# Total enriched: 44 files
|
|
# Remaining: ~133 NL-XX-XXX files
|
|
#
|
|
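# Entries that have already been processed are meant to be removed from
# VERIFIED_ENRICHMENTS; only add new entries that have not been processed yet.
# NOTE(review): the Batch 8 entries counted in the history above are still
# present in the list below - confirm whether they have already been applied.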
|
|
|
|
# Pending enrichments. Each entry names the placeholder file to update and the
# verified location data to write into it:
#   old_filename     - current file name inside the custodian directory
#   institution_name - human-readable name (informational only)
#   city / region    - values stored under ``location``
#   region_code / city_code - codes substituted into the new GHCID
#   address          - optional street address
#   source / source_url - where the location was found
VERIFIED_ENRICHMENTS = [
    # Batch 8 - 2025-12-17
    {
        "old_filename": "NL-XX-XXX-M-MJ-museum_janning.yaml",
        "institution_name": "Museum Janning",
        "city": "Nieuw Schoonebeek",
        "region": "Drenthe",
        "region_code": "DR",
        "city_code": "NIS",
        "address": "Europaweg 143a, 7766 AE Nieuw Schoonebeek",
        "source": "exa_web_search",
        "source_url": "https://www.museumjanning.nl/",
    },
    {
        "old_filename": "NL-XX-XXX-M-MGHH.yaml",
        "institution_name": "Museum Geelvinck Hinlopen Huis",
        "city": "Heerde",
        "region": "Gelderland",
        "region_code": "GE",
        "city_code": "HEE",
        "address": "Kamperweg 23, 8181 CS Heerde",
        "source": "exa_web_search",
        "source_url": "https://geelvinck.nl/",
    },
    {
        "old_filename": "NL-XX-XXX-M-MGG.yaml",
        "institution_name": "Museumboerderij De Grote Glind",
        "city": "Barneveld",
        "region": "Gelderland",
        "region_code": "GE",
        "city_code": "BAR",
        "address": "Scherpenzeelseweg 158, 3772 MG Barneveld",
        "source": "exa_web_search",
        "source_url": "https://www.degroteglind.nl/",
    },
    {
        "old_filename": "NL-XX-XXX-M-MGR.yaml",
        "institution_name": "Museum Galerie RAT",
        "city": "Den Burg",
        "region": "Noord-Holland",
        "region_code": "NH",
        "city_code": "DEB",
        "address": "Burgwal 20, 1791 Den Burg, Texel",
        "source": "exa_web_search",
        "source_url": "https://www.mapquest.com/",
    },
    {
        "old_filename": "NL-XX-XXX-M-MAFS.yaml",
        "institution_name": "Museum Averlo-Frieswijk-Schalkhaar",
        "city": "Schalkhaar",
        "region": "Overijssel",
        "region_code": "OV",
        "city_code": "SCK",
        "address": "Frieswijkerweg 7, 7433 RB Schalkhaar",
        "source": "exa_web_search",
        "source_url": "https://www.museum-afs.nl/",
    },
    {
        "old_filename": "NL-XX-XXX-M-MC.yaml",
        "institution_name": "Museum Ceuclum",
        "city": "Cuijk",
        "region": "Noord-Brabant",
        "region_code": "NB",
        "city_code": "CUI",
        "address": "Castellum 1, 5431 EM Cuijk",
        "source": "exa_web_search",
        "source_url": "https://www.museumceuclum.nl/",
    },
    {
        "old_filename": "NL-XX-XXX-M-MBMP.yaml",
        "institution_name": "Museum van Brabantse Mutsen en Poffers",
        "city": "Sint-Oedenrode",
        "region": "Noord-Brabant",
        "region_code": "NB",
        "city_code": "SOR",
        "address": "Kerkstraat 20, 5492 AH Sint-Oedenrode",
        "source": "exa_web_search",
        "source_url": "https://mutsenmuseum.nl/",
    },
]
|
|
|
|
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the name-based (version 5) UUID of a GHCID as a string.

    The UUID is derived with ``uuid.uuid5`` inside the standard DNS
    namespace, so the same GHCID always yields the same UUID.
    """
    # uuid.NAMESPACE_DNS is 6ba7b810-9dad-11d1-80b4-00c04fd430c8.
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, ghcid_string)
    return str(derived)
|
|
|
|
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return an unsigned 64-bit integer derived from a GHCID.

    The value is the first 8 bytes of the SHA-256 digest of the encoded
    string, read as a big-endian integer.
    """
    hex_digest = hashlib.sha256(ghcid_string.encode()).hexdigest()
    # 16 hex characters == the leading 8 bytes, most significant first.
    return int(hex_digest[:16], 16)
|
|
|
|
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a SHA-256 based UUID (version nibble 8) for a GHCID.

    The first 16 bytes of the SHA-256 digest form the UUID; the version
    nibble is forced to 8 and the two variant bits to ``10``.
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    value = int.from_bytes(digest[:16], byteorder='big')

    # Version: high nibble of byte 6, i.e. bits 76-79 of the 128-bit value.
    value = (value & ~(0xF << 76)) | (0x8 << 76)
    # Variant: top two bits of byte 8, i.e. bits 62-63.
    value = (value & ~(0x3 << 62)) | (0x2 << 62)

    return str(uuid.UUID(int=value))
|
|
|
|
def apply_enrichment(custodian_dir: Path, enrichment: dict) -> tuple[str | None, str | None]:
    """Apply one location enrichment to a custodian YAML file and rename it.

    Loads ``custodian_dir / enrichment['old_filename']``, replaces the
    ``NL-XX-XXX`` placeholder prefix of ``ghcid.ghcid_current`` with the
    enrichment's region and city codes, regenerates the derived identifiers,
    updates ``location``, ``ghcid.ghcid_history``,
    ``ghcid.location_resolution`` and ``provenance``, writes the record to a
    file named after the new GHCID and deletes the old file.

    Args:
        custodian_dir: Directory that contains the custodian YAML files.
        enrichment: Mapping with the keys ``old_filename``, ``city``,
            ``region``, ``region_code`` and ``city_code``. ``address`` and
            ``source_url`` are optional.

    Returns:
        ``(old_path, new_path)`` as strings on success. ``(None, None)`` when
        the source file is missing, the record has no ``ghcid.ghcid_current``
        string, the GHCID does not start with the ``NL-XX-XXX`` placeholder
        pattern, or a different file already exists at the target path.
    """
    old_path = custodian_dir / enrichment['old_filename']

    if not old_path.exists():
        print(f" ❌ File not found: {old_path}")
        return None, None

    # Load YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # A record without a usable ghcid section is reported and skipped, like
    # the other failure modes, instead of aborting the whole batch.
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    old_ghcid = ghcid_block.get('ghcid_current') if isinstance(ghcid_block, dict) else None
    if not isinstance(old_ghcid, str):
        print(f" ❌ No ghcid.ghcid_current in: {old_path}")
        return None, None

    # Parse old GHCID to get type and abbreviation
    # Format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
    match = re.match(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?', old_ghcid)
    if not match:
        print(f" ❌ Could not parse GHCID: {old_ghcid}")
        return None, None

    inst_type, abbrev, name_suffix = match.groups()  # name_suffix may be None

    # Build new GHCID
    new_ghcid = f"NL-{enrichment['region_code']}-{enrichment['city_code']}-{inst_type}-{abbrev}"
    if name_suffix:
        new_ghcid += f"-{name_suffix}"

    new_filename = new_ghcid.replace('/', '_') + '.yaml'
    new_path = custodian_dir / new_filename

    # Refuse to clobber another record that already owns the target name.
    if new_path != old_path and new_path.exists():
        print(f" ❌ Target already exists: {new_path}")
        return None, None

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update location
    data['location'] = {
        'city': enrichment['city'],
        'region': enrichment['region'],
        'country': 'NL',
    }
    if enrichment.get('address'):
        data['location']['address'] = enrichment['address']

    # The history list lives inside the ghcid section; create it there when
    # it is missing (or empty/None) so the close-out loop and append work.
    history = ghcid_block.get('ghcid_history')
    if history is None:
        history = ghcid_block['ghcid_history'] = []

    # Close out any entry that is still open
    for entry in history:
        if entry.get('valid_to') is None:
            entry['valid_to'] = timestamp

    # Add new history entry
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Location enriched via Exa web search - {enrichment['city']}, {enrichment['region']}"
    })

    # Update current GHCID and its derived identifiers
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['ghcid_uuid'] = new_uuid
    ghcid_block['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid_block['ghcid_numeric'] = new_numeric

    # Add location_resolution
    ghcid_block['location_resolution'] = {
        'method': 'EXA_WEB_SEARCH',
        'city_code': enrichment['city_code'],
        'city_name': enrichment['city'],
        'region_code': enrichment['region_code'],
        'region_name': enrichment['region'],
        'country_code': 'NL',
        'resolution_date': timestamp,
        'source_url': enrichment.get('source_url'),
    }

    # Update provenance notes (timestamp[:10] is the YYYY-MM-DD date part)
    provenance = data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Location enriched on {timestamp[:10]} via Exa web search: {enrichment['city']}, {enrichment['region']}"
    )

    # Add web search source to provenance
    sources = provenance.setdefault('sources', {})
    sources.setdefault('web_search', []).append({
        'source_type': 'exa_web_search',
        'data_tier': 'TIER_3_CROWD_SOURCED',
        'source_url': enrichment.get('source_url'),
        'extraction_timestamp': timestamp,
        'claims_extracted': ['city', 'region', 'address'],
    })

    # Write updated YAML to new filename
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file only after the new one has been written
    if old_path != new_path:
        old_path.unlink()

    return str(old_path), str(new_path)
|
|
|
|
|
|
def main(custodian_dir: Path | str | None = None):
    """Apply every entry of ``VERIFIED_ENRICHMENTS`` and print a summary.

    Args:
        custodian_dir: Directory holding the custodian YAML files. When
            omitted, the script's original hard-coded location is used, so
            calling ``main()`` with no arguments behaves as before.
    """
    if custodian_dir is None:
        custodian_dir = '/Users/kempersc/apps/glam/data/custodian'
    custodian_dir = Path(custodian_dir)

    banner = "=" * 60
    print(banner)
    print("Applying Verified Location Enrichments")
    print(banner)

    if not VERIFIED_ENRICHMENTS:
        print("\nNo enrichments to process. Add entries to VERIFIED_ENRICHMENTS list.")
        return

    success_count = 0

    for enrichment in VERIFIED_ENRICHMENTS:
        print(f"\nProcessing: {enrichment['old_filename']}")
        print(f" → {enrichment['city']}, {enrichment['region']} ({enrichment['region_code']}-{enrichment['city_code']})")

        old_path, new_path = apply_enrichment(custodian_dir, enrichment)

        # apply_enrichment returns (None, None) on failure and has already
        # printed the reason, so failures are simply not counted here.
        if old_path and new_path:
            print(f" ✅ Renamed: {Path(old_path).name}")
            print(f" → {Path(new_path).name}")
            success_count += 1

    print("\n" + banner)
    print(f"Summary: {success_count}/{len(VERIFIED_ENRICHMENTS)} files enriched and renamed")
    print(banner)
|
|
|
|
|
|
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|