301 lines
12 KiB
Python
301 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Apply verified location enrichments to XXX files and rename them.
|
|
|
|
This script:
|
|
1. Updates ghcid_current with the correct region/city codes
|
|
2. Updates location with city/region
|
|
3. Adds ghcid_history entry
|
|
4. Updates provenance notes
|
|
5. Renames file to match new GHCID
|
|
"""
|
|
|
|
import yaml
|
|
import os
|
|
import re
|
|
import uuid
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Verified enrichments from Exa web search
|
|
# History:
|
|
# - Batch 1 (2025-12-17): 4 files - Crypto Museum, Allard Pierson, DPM Rotterdam, Cow Museum
|
|
# - Batch 2 (2025-12-17): 8 files - Bierreclame, Cacao, Edah, Flessenscheepjes, Eddie the Eagle, Jopie Huisman, Fortuna, Atlantikwall
|
|
# - Batch 3 (2025-12-17): 7 files - Ajax, C1000, Moslim Archief, CODART, Blik Trommel, Klompenmakerij, CultuurSchakel
|
|
# - Batch 4 (2025-12-17): 7 files - Autoriteit Persoonsgegevens, Raad voor Cultuur, IJV, Erotisch Museum, Hollands Kaas Museum, Kresse Museum, Van Gogh Museum Enterprises
|
|
# - Batch 5 (2025-12-17): 4 files - Huis73, Dutch Directors Guild, Het Kaas Museum (Bodegraven), Stichting Abrahamdag
|
|
# - Batch 6 (2025-12-17): 2 files - Museum 1939-1945, Brandkas van Henny
|
|
# - Batch 7 (2025-12-17): 5 files - Frans Maas Museum, Museum Buitenlust, Museum De Canonije, Museum Dijkmagazijn De Heul, Museumboerderij Erve Hofman
|
|
# - Batch 8 (2025-12-17): 7 files - Museum Janning, Museum Geelvinck Hinlopen Huis, Museumboerderij De Grote Glind, Museum Galerie RAT, Museum Averlo-Frieswijk-Schalkhaar, Museum Ceuclum, Museum van Brabantse Mutsen en Poffers
|
|
# - Batch 9 (2025-12-17): 3 files - Museum Collectie ter Borg, Museum Dansant, Museum van alles wa
|
|
# - Batch 10 (2025-12-17): 6 files - Jocus Museum (Venlo), Museum Meermanno (Den Haag), Museum of the Mind (Haarlem), Museum Mariënkroon (Nieuwkuijk), het MAG museum (Ruurlo), Mineralogisch Museum (Grou)
|
|
# - Archived: 3 files - Museum Kempenland (closed 2012), Miniature Museum (UK), Museum Keris (Indonesia)
|
|
# Total enriched: 53 files
|
|
# Remaining: ~118 NL-XX-XXX files
|
|
#
|
|
# All previously processed entries have been removed from VERIFIED_ENRICHMENTS.
|
|
# Only add new entries that have not been processed yet.
|
|
|
|
# Pending enrichments, one dict per custodian file still carrying the
# placeholder "NL-XX-XXX" location codes.
#
# Keys read by apply_enrichment() / main() in this script:
#   old_filename  - current file name inside the custodian directory
#   city, region  - human-readable names written to `location`
#   region_code   - region segment of the new GHCID
#   city_code     - city segment of the new GHCID
#   address       - optional; copied to `location.address` when truthy
#   source_url    - optional; recorded in location_resolution and provenance
# `institution_name` and `source` are not read by the code visible in this
# script; they appear to be kept for human reference only.
VERIFIED_ENRICHMENTS = [
    # Batch 10 - 2025-12-17
    # Jocus Museum (carnival museum) - Venlo, Limburg
    # Note: File incorrectly named "museum_jocas" - correct name is "Jocus Museum"
    # Address: Dominicanenplein 25, 5911 JG Venlo
    # Oldest carnival society in the Netherlands (founded 1842)
    {
        'old_filename': 'NL-XX-XXX-M-MJ-museum_jocas.yaml',
        'institution_name': 'Jocus Museum (Carnival Museum)',
        'city': 'Venlo',
        'region': 'Limburg',
        'region_code': 'LI',
        'city_code': 'VEN',
        'address': 'Dominicanenplein 25, 5911 JG Venlo',
        'source': 'exa_web_search',
        'source_url': 'https://www.jocusvenlo.nl/',
    },
    # Museum Meermanno (Huis van het boek) - The Hague, Zuid-Holland
    # Address: Prinsessegracht 30, 2514 AP Den Haag
    # National museum of the book
    {
        'old_filename': 'NL-XX-XXX-M-MM-museum_meermanno.yaml',
        'institution_name': 'Museum Meermanno | Huis van het boek',
        'city': 'Den Haag',
        'region': 'Zuid-Holland',
        'region_code': 'ZH',
        'city_code': 'DHA',
        'address': 'Prinsessegracht 30, 2514 AP Den Haag',
        'source': 'exa_web_search',
        'source_url': 'https://www.huisvanhetboek.nl/',
    },
    # Museum of the Mind (Dolhuys) - Haarlem, Noord-Holland
    # Address: Schotersingel 2, 2023 EM Haarlem
    # Museum of the Year 2022
    {
        'old_filename': 'NL-XX-XXX-M-MM-museum_of_the_mind.yaml',
        'institution_name': 'Museum of the Mind (Dolhuys)',
        'city': 'Haarlem',
        'region': 'Noord-Holland',
        'region_code': 'NH',
        'city_code': 'HAA',
        'address': 'Schotersingel 2, 2023 EM Haarlem',
        'source': 'exa_web_search',
        'source_url': 'https://museumvandegeest.nl/',
    },
    # Museum Mariënkroon (Abbey Museum) - Nieuwkuijk, Noord-Brabant
    # Address: Abdijlaan 8, 5253 VP Nieuwkuijk
    # Former Norbertine abbey
    {
        'old_filename': 'NL-XX-XXX-M-MM-museum_marienkroon.yaml',
        'institution_name': 'Museum Mariënkroon',
        'city': 'Nieuwkuijk',
        'region': 'Noord-Brabant',
        'region_code': 'NB',
        'city_code': 'NIE',
        'address': 'Abdijlaan 8, 5253 VP Nieuwkuijk',
        'source': 'exa_web_search',
        'source_url': 'https://marienkroon.nl/',
    },
    # het MAG museum (Maastricht Aardewerk & Glas) - Ruurlo, Gelderland
    # Address: Borculoseweg 2, 7261 BJ Ruurlo
    # Note: Located at Kasteel Ruurlo (formerly Museum MORE annex)
    {
        'old_filename': 'NL-XX-XXX-M-MM-het_mag_museum.yaml',
        'institution_name': 'het MAG museum (Maastricht Aardewerk & Glas)',
        'city': 'Ruurlo',
        'region': 'Gelderland',
        'region_code': 'GE',
        'city_code': 'RUU',
        'address': 'Borculoseweg 2, 7261 BJ Ruurlo',
        'source': 'exa_web_search',
        'source_url': 'http://www.hetmagmuseum.nl/',
    },
    # Mineralogisch Museum - Grou, Friesland
    # Address: Leechlân 22, 9001 ZH Grou
    {
        'old_filename': 'NL-XX-XXX-M-MM-mineralogisch_museum.yaml',
        'institution_name': 'Mineralogisch Museum Grou',
        'city': 'Grou',
        'region': 'Friesland',
        'region_code': 'FR',
        'city_code': 'GRO',
        'address': 'Leechlân 22, 9001 ZH Grou',
        'source': 'exa_web_search',
        'source_url': 'https://mineralengrou.nl/',
    },
]
|
|
|
|
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the name-based (version 5) UUID of a GHCID, as a string.

    The namespace is the fixed UUID below (the standard DNS namespace
    value), so the same GHCID string always yields the same UUID.
    """
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    derived = uuid.uuid5(namespace, ghcid_string)
    return str(derived)
|
|
|
|
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return an unsigned 64-bit integer derived from a GHCID string.

    The value is the first 8 bytes of the SHA-256 digest of the encoded
    string, read as a big-endian integer.
    """
    hex_digest = hashlib.sha256(ghcid_string.encode()).hexdigest()
    # 16 hex characters == the leading 8 bytes, most significant first.
    leading_hex = hex_digest[:16]
    return int(leading_hex, 16)
|
|
|
|
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a SHA-256 based UUID (version 8 layout) for a GHCID string.

    The first 16 bytes of the SHA-256 digest are used as the UUID payload,
    with the version nibble forced to 8 and the variant bits forced to
    binary 10.
    """
    head = hashlib.sha256(ghcid_string.encode()).digest()[:16]

    # Byte 6: keep the low nibble, set the high nibble to the version (8).
    version_byte = 0x80 | (head[6] & 0x0F)
    # Byte 8: keep the low six bits, set the top two bits to 10 (variant).
    variant_byte = 0x80 | (head[8] & 0x3F)

    payload = (
        head[:6]
        + bytes([version_byte])
        + head[7:8]
        + bytes([variant_byte])
        + head[9:]
    )
    return str(uuid.UUID(bytes=payload))
|
|
|
|
def apply_enrichment(custodian_dir: Path, enrichment: dict) -> tuple[str | None, str | None]:
    """Apply one verified location enrichment to a custodian file and rename it.

    The file named by ``enrichment['old_filename']`` is loaded, its
    placeholder GHCID (``NL-XX-XXX-...``) is rebuilt with the enrichment's
    region and city codes, the derived identifiers, location, GHCID history
    and provenance are updated, and the result is written under a file name
    matching the new GHCID. The old file is removed afterwards.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
        enrichment: Mapping with ``old_filename``, ``city``, ``region``,
            ``region_code`` and ``city_code``; ``address`` and ``source_url``
            are optional.

    Returns:
        ``(old_path, new_path)`` as strings on success, or ``(None, None)``
        when the file is missing, has no usable ``ghcid.ghcid_current``, the
        GHCID does not match the placeholder pattern, or the target file
        name is already taken by a different file. A message is printed in
        each failure case.
    """
    old_path = custodian_dir / enrichment['old_filename']

    if not old_path.exists():
        print(f" ❌ File not found: {old_path}")
        return None, None

    # Load YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty file, and the ghcid section may be
    # absent; report that like the other failure paths instead of crashing.
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    old_ghcid = ghcid_block.get('ghcid_current') if isinstance(ghcid_block, dict) else None
    if not isinstance(old_ghcid, str):
        print(f" ❌ Missing ghcid.ghcid_current in: {old_path}")
        return None, None

    # Parse old GHCID to get type and abbreviation
    # Format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
    match = re.match(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?', old_ghcid)
    if not match:
        print(f" ❌ Could not parse GHCID: {old_ghcid}")
        return None, None

    inst_type, abbrev, name_suffix = match.groups()  # name_suffix may be None

    # Build new GHCID
    new_ghcid = f"NL-{enrichment['region_code']}-{enrichment['city_code']}-{inst_type}-{abbrev}"
    if name_suffix:
        new_ghcid += f"-{name_suffix}"

    # Resolve the target path up front so a name collision is detected
    # before anything is written: overwriting would destroy another record.
    new_filename = new_ghcid.replace('/', '_') + '.yaml'
    new_path = custodian_dir / new_filename
    if new_path != old_path and new_path.exists():
        print(f" ❌ Target file already exists: {new_path}")
        return None, None

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update location (replaces any existing location mapping)
    data['location'] = {
        'city': enrichment['city'],
        'region': enrichment['region'],
        'country': 'NL',
    }
    if enrichment.get('address'):
        data['location']['address'] = enrichment['address']

    # The history list lives under data['ghcid']; create it there when it is
    # missing or null so the close-out loop and append below always work.
    history = ghcid_block.get('ghcid_history')
    if history is None:
        history = ghcid_block['ghcid_history'] = []

    # Close out every entry that is still open
    for entry in history:
        if entry.get('valid_to') is None:
            entry['valid_to'] = timestamp

    # Add new history entry
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Location enriched via Exa web search - {enrichment['city']}, {enrichment['region']}"
    })

    # Update current GHCID and its derived identifiers
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['ghcid_uuid'] = new_uuid
    ghcid_block['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid_block['ghcid_numeric'] = new_numeric

    # Add location_resolution
    ghcid_block['location_resolution'] = {
        'method': 'EXA_WEB_SEARCH',
        'city_code': enrichment['city_code'],
        'city_name': enrichment['city'],
        'region_code': enrichment['region_code'],
        'region_name': enrichment['region'],
        'country_code': 'NL',
        'resolution_date': timestamp,
        'source_url': enrichment.get('source_url'),
    }

    # Update provenance notes (timestamp[:10] is the YYYY-MM-DD date part)
    provenance = data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Location enriched on {timestamp[:10]} via Exa web search: {enrichment['city']}, {enrichment['region']}"
    )

    # Add web search source to provenance
    sources = provenance.setdefault('sources', {})
    sources.setdefault('web_search', []).append({
        'source_type': 'exa_web_search',
        'data_tier': 'TIER_3_CROWD_SOURCED',
        'source_url': enrichment.get('source_url'),
        'extraction_timestamp': timestamp,
        'claims_extracted': ['city', 'region', 'address'],
    })

    # Write updated YAML to new filename
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file (only after the new one has been written)
    if old_path != new_path:
        old_path.unlink()

    return str(old_path), str(new_path)
|
|
|
|
|
|
def main(custodian_dir: Path | None = None) -> None:
    """Apply every entry in VERIFIED_ENRICHMENTS and print a summary.

    Args:
        custodian_dir: Directory holding the custodian YAML files. When
            omitted, the original hard-coded project path is used, so
            calling ``main()`` with no arguments behaves as before.
    """
    if custodian_dir is None:
        custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    print("=" * 60)
    print("Applying Verified Location Enrichments")
    print("=" * 60)

    if not VERIFIED_ENRICHMENTS:
        print("\nNo enrichments to process. Add entries to VERIFIED_ENRICHMENTS list.")
        return

    success_count = 0

    for enrichment in VERIFIED_ENRICHMENTS:
        print(f"\nProcessing: {enrichment['old_filename']}")
        print(f" → {enrichment['city']}, {enrichment['region']} ({enrichment['region_code']}-{enrichment['city_code']})")

        old_path, new_path = apply_enrichment(custodian_dir, enrichment)

        # apply_enrichment returns (None, None) on failure and has already
        # printed the reason, so failed entries are simply not counted.
        if old_path and new_path:
            old_name = os.path.basename(old_path)
            new_name = os.path.basename(new_path)
            print(f" ✅ Renamed: {old_name}")
            print(f" → {new_name}")
            success_count += 1

    print("\n" + "=" * 60)
    print(f"Summary: {success_count}/{len(VERIFIED_ENRICHMENTS)} files enriched and renamed")
    print("=" * 60)
|
|
|
|
|
|
# Run the enrichment batch only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|