glam/scripts/fix_non_dutch_institutions.py
2025-12-17 11:58:40 +01:00

299 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Fix institutions incorrectly assigned to NL (Netherlands) that are actually in other countries.
These institutions were imported from LinkedIn batch import but have wrong country codes.
"""
import yaml
import os
import re
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
# Records that were filed under NL but belong to another country.
# Locations were verified via Exa web search on 2025-12-17.
#
# Keys read by fix_institution():
#   old_filename          current file name inside the custodian directory
#   country / country_name
#   region / region_code
#   city / city_code
#   address               optional; None when no address is recorded
#   source_url / notes    copied into the location-resolution metadata
#   new_institution_type  optional; replaces the type letter in the GHCID
NON_DUTCH_INSTITUTIONS = [
    {
        'old_filename': 'NL-XX-XXX-A-HAEU.yaml',
        'institution_name': 'Historical Archives of the European Union',
        'country': 'IT',
        'country_name': 'Italy',
        'region': 'Tuscany',
        # Italian region code
        'region_code': '52',
        'city': 'Firenze',
        'city_code': 'FIR',
        'address': 'Via Bolognese 156, 50139 Firenze, Villa Salviati',
        'source_url': 'https://archives.eui.eu/en/repositories/1',
        'notes': 'Part of European University Institute, Florence',
    },
    {
        'old_filename': 'NL-XX-XXX-A-VZWADEB.yaml',
        'institution_name': 'v.z.w. Archief- en Documentatiecentrum Erfgoed Binnenvaart',
        'country': 'BE',
        'country_name': 'Belgium',
        'region': 'West-Vlaanderen',
        'region_code': 'VWV',
        'city': 'Oudenburg',
        'city_code': 'OUD',
        'address': 'Vaartdijk zuid 11, 8460 Oudenburg (aboard Museumschip Tordino)',
        'source_url': 'http://binnenvaarterfgoed.be/',
        'notes': 'Belgian v.z.w. (vzw = Belgian non-profit), located aboard museum ship',
    },
    {
        'old_filename': 'NL-XX-XXX-M-FM-ford_museum.yaml',
        'institution_name': 'Gerald R. Ford Presidential Museum',
        'country': 'US',
        'country_name': 'United States',
        'region': 'Michigan',
        'region_code': 'MI',
        'city': 'Grand Rapids',
        'city_code': 'GRA',
        'address': '303 Pearl Street NW, Grand Rapids, MI 49504',
        'source_url': 'https://www.fordlibrarymuseum.gov/visit/museum',
        'notes': 'Part of National Archives system, commemorates 38th US President',
        # Type changes from M to O (Official Institution - Presidential Library)
        'new_institution_type': 'O',
    },
    {
        'old_filename': 'NL-XX-XXX-M-DAJ.yaml',
        'institution_name': 'Diorama Arsip Jogja',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Daerah Istimewa Yogyakarta',
        'region_code': 'YO',
        'city': 'Bantul',
        'city_code': 'BAN',
        'address': 'LT 1 Gedung DEPO ARSIP, Jl. Janti, Banguntapan, Kabupaten Bantul, Yogyakarta 55198',
        'source_url': 'https://dioramaarsip.jogjaprov.go.id/home',
        'notes': 'Digital archive diorama of Yogyakarta history, opened February 2022',
        # Actually an Archive (A), not a Museum (M)
        'new_institution_type': 'A',
    },

    # Batch 2 (added 2025-12-17): more Indonesian and Palestinian institutions
    {
        'old_filename': 'NL-XX-XXX-M-MBV.yaml',
        'institution_name': 'Museum Benteng Vredeburg',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Daerah Istimewa Yogyakarta',
        'region_code': 'YO',
        'city': 'Yogyakarta',
        'city_code': 'YOG',
        'address': 'Jl. Margo Mulyo No.6, Ngupasan, Kec. Gondomanan, Kota Yogyakarta 55122',
        'source_url': 'https://forevervacation.com/yogyakarta/museum-benteng-vredeburg',
        'notes': 'Dutch colonial fortress converted to museum in 1992, documents Indonesian independence struggle',
    },
    {
        'old_filename': 'NL-XX-XXX-M-MBP.yaml',
        'institution_name': 'Museum Batik Pekalongan',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Jawa Tengah',
        # Central Java
        'region_code': 'JT',
        'city': 'Pekalongan',
        'city_code': 'PEK',
        'address': 'Jl. Jetayu No.1, Pekalongan 51152',
        'source_url': 'https://id.wikipedia.org/wiki/Museum_Batik_Pekalongan',
        'notes': 'UNESCO recognized museum for batik conservation, opened 12 July 2006 by President SBY',
    },
    {
        'old_filename': 'NL-XX-XXX-M-MG.yaml',
        'institution_name': 'Municipality of Gaza',
        'country': 'PS',
        'country_name': 'Palestine',
        'region': 'Gaza Strip',
        'region_code': 'GZ',
        'city': 'Gaza City',
        'city_code': 'GAZ',
        # Address not verifiable due to current situation
        'address': None,
        'source_url': 'https://www.gaza-city.org',
        'notes': 'Municipal government, founded 1898. Type corrected from M (Museum) to O (Official Institution)',
        # A municipality (government), not a museum
        'new_institution_type': 'O',
    },
]
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the version-5 (name-based) UUID of a GHCID string, as text.

    The namespace is fixed, so a given GHCID always yields the same UUID.
    """
    # This literal is the same value as the standard library's
    # uuid.NAMESPACE_DNS.
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    derived = uuid.uuid5(namespace, ghcid_string)
    return str(derived)
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return an unsigned 64-bit integer derived from a GHCID string.

    The value is the first eight bytes of the string's SHA-256 digest,
    interpreted as a big-endian integer.
    """
    hex_digest = hashlib.sha256(ghcid_string.encode()).hexdigest()
    # Sixteen hex digits are exactly the leading eight digest bytes.
    return int(hex_digest[:16], 16)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a version-8 style UUID built from a GHCID's SHA-256 digest.

    The first 16 digest bytes become the UUID, with the version nibble forced
    to 8 and the two variant bits forced to ``10``.
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()[:16]

    # Byte 6 carries the version in its high nibble; byte 8 carries the
    # variant in its two highest bits.
    version_byte = (digest[6] & 0x0F) | 0x80
    variant_byte = (digest[8] & 0x3F) | 0x80

    raw = (
        digest[:6]
        + bytes([version_byte])
        + digest[7:8]
        + bytes([variant_byte])
        + digest[9:]
    )
    return str(uuid.UUID(bytes=raw))
def fix_institution(custodian_dir: Path, inst: dict) -> tuple[str | None, str | None]:
    """Move one mis-filed ``NL-XX-XXX-*`` record to a GHCID under its real country.

    Loads ``custodian_dir / inst['old_filename']``, rebuilds the GHCID from
    the corrected country / region / city codes (keeping the abbreviation and
    optional name suffix of the old GHCID), regenerates the derived
    identifiers, rewrites the location, GHCID history and provenance, writes
    the result to ``<new GHCID>.yaml`` in the same directory and deletes the
    old file.

    Args:
        custodian_dir: Directory holding the custodian YAML files.
        inst: One correction entry. Reads ``old_filename``, ``country``,
            ``country_name``, ``region``, ``region_code``, ``city`` and
            ``city_code``; optionally ``address``, ``source_url``, ``notes``
            and ``new_institution_type``.

    Returns:
        ``(old_path, new_path)`` as strings on success. ``(None, None)`` when
        the record is skipped, which happens (with a printed message) when
        the old file is missing, the file has no ``ghcid.ghcid_current``
        string, that GHCID does not have the expected ``NL-XX-XXX-...``
        shape, or the target file already exists.
    """
    old_path = custodian_dir / inst['old_filename']
    if not old_path.exists():
        print(f" File not found: {old_path}")
        return None, None

    # Load YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load() gives None for an empty file; a file may also lack the
    # ghcid section. Skip such records instead of aborting the whole batch.
    ghcid = data.get('ghcid') if isinstance(data, dict) else None
    old_ghcid = ghcid.get('ghcid_current') if isinstance(ghcid, dict) else None
    if not isinstance(old_ghcid, str):
        print(f" No ghcid.ghcid_current found in: {old_path}")
        return None, None

    # Parse old GHCID to get type and abbreviation.
    # Format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
    # fullmatch() so that unexpected trailing characters are reported rather
    # than silently dropped from the new identifier.
    match = re.fullmatch(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?', old_ghcid)
    if not match:
        print(f" Could not parse GHCID: {old_ghcid}")
        return None, None
    inst_type, abbrev, name_suffix = match.groups()  # name_suffix may be None

    # An entry may also correct the institution type letter.
    type_override = inst.get('new_institution_type')
    if type_override:
        inst_type = type_override

    # Build new GHCID with correct country
    new_ghcid = f"{inst['country']}-{inst['region_code']}-{inst['city_code']}-{inst_type}-{abbrev}"
    if name_suffix:
        new_ghcid += f"-{name_suffix}"

    # Refuse to clobber a different record that already owns the target name.
    new_filename = new_ghcid.replace('/', '_') + '.yaml'
    new_path = custodian_dir / new_filename
    if new_path != old_path and new_path.exists():
        print(f" Target already exists, not overwriting: {new_path}")
        return None, None

    if type_override:
        # Keep the institution_type field in step with the GHCID type letter.
        data['institution_type'] = [inst_type]

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)
    timestamp = datetime.now(timezone.utc).isoformat()
    has_address = bool(inst.get('address'))

    # Update location (replaces any previous location mapping)
    data['location'] = {
        'city': inst['city'],
        'region': inst['region'],
        'country': inst['country'],
    }
    if has_address:
        data['location']['address'] = inst['address']

    # Close out old ghcid_history entries; a missing or null history is
    # treated as empty.
    history = ghcid.get('ghcid_history') or []
    ghcid['ghcid_history'] = history
    for entry in history:
        if entry.get('valid_to') is None:
            entry['valid_to'] = timestamp

    # Add new history entry
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Country code corrected: NL -> {inst['country']} ({inst['country_name']}). "
                  f"Location: {inst['city']}, {inst['region']}"
    })

    # Update current GHCID
    ghcid['ghcid_current'] = new_ghcid
    ghcid['ghcid_original'] = new_ghcid  # Also update original since NL was wrong
    ghcid['ghcid_uuid'] = new_uuid
    ghcid['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid['ghcid_numeric'] = new_numeric

    # Add location_resolution
    ghcid['location_resolution'] = {
        'method': 'EXA_WEB_SEARCH',
        'city_code': inst['city_code'],
        'city_name': inst['city'],
        'region_code': inst['region_code'],
        'region_name': inst['region'],
        'country_code': inst['country'],
        'resolution_date': timestamp,
        'source_url': inst.get('source_url'),
        'notes': inst.get('notes'),
    }

    # Update provenance; missing or null containers are created on demand.
    provenance = data.get('provenance') or {}
    data['provenance'] = provenance
    notes = provenance.get('notes') or []
    provenance['notes'] = notes
    notes.append(
        f"Country code corrected on {timestamp[:10]}: NL was incorrect, "
        f"institution is in {inst['country_name']} ({inst['country']})"
    )

    # Add web search source to provenance
    sources = provenance.get('sources') or {}
    provenance['sources'] = sources
    web_search = sources.get('web_search') or []
    sources['web_search'] = web_search

    # Only claim an address when the entry actually supplies one.
    claims = ['country', 'region', 'city']
    if has_address:
        claims.append('address')
    web_search.append({
        'source_type': 'exa_web_search',
        'data_tier': 'TIER_2_VERIFIED',  # Higher tier since we verified country
        'source_url': inst.get('source_url'),
        'extraction_timestamp': timestamp,
        'claims_extracted': claims,
    })

    # Write updated YAML to new filename
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file (only after the new one has been written)
    if old_path != new_path:
        old_path.unlink()

    return str(old_path), str(new_path)
def main(custodian_dir: Path | str = '/Users/kempersc/apps/glam/data/custodian'):
    """Apply every correction in NON_DUTCH_INSTITUTIONS and print a report.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
            Defaults to the location the script was originally written for,
            so calling ``main()`` with no arguments behaves as before; pass
            another path to run against a different checkout or a test copy.
    """
    custodian_dir = Path(custodian_dir)
    banner = "=" * 70

    print(banner)
    print("Fixing Non-Dutch Institutions (Country Code Corrections)")
    print(banner)

    success_count = 0
    for inst in NON_DUTCH_INSTITUTIONS:
        print(f"\nProcessing: {inst['old_filename']}")
        print(f" Institution: {inst['institution_name']}")
        print(f" Correction: NL -> {inst['country']} ({inst['country_name']})")
        print(f" Location: {inst['city']}, {inst['region']}")

        old_path, new_path = fix_institution(custodian_dir, inst)
        # fix_institution() returns (None, None) for records it skipped.
        if old_path and new_path:
            old_name = os.path.basename(old_path)
            new_name = os.path.basename(new_path)
            print(f" Renamed: {old_name}")
            print(f" -> {new_name}")
            success_count += 1

    print("\n" + banner)
    print(f"Summary: {success_count}/{len(NON_DUTCH_INSTITUTIONS)} institutions corrected")
    print(banner)


if __name__ == '__main__':
    main()