299 lines
11 KiB
Python
299 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix institutions incorrectly assigned to NL (Netherlands) that are actually in other countries.

These institutions came from the LinkedIn batch import but carry the wrong country code.
|
|
"""
|
|
|
|
import yaml
|
|
import os
|
|
import re
|
|
import uuid
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Non-Dutch institutions to fix
# Verified via Exa web search 2025-12-17
#
# Each entry describes one record that was filed under the NL placeholder
# GHCID prefix and must be moved to its real country. Keys:
#   old_filename          current file name inside the custodian directory
#   institution_name      display name (only used for console output)
#   country/country_name  corrected country code and its English name
#   region/region_code    corrected region name and the code used in the GHCID
#   city/city_code        corrected city name and the code used in the GHCID
#   address               street address, or None when it could not be verified
#   source_url            page that was used to verify the location
#   notes                 free-text remark stored with the location resolution
#   new_institution_type  optional; one-letter type that replaces the type
#                         letter embedded in the old GHCID
NON_DUTCH_INSTITUTIONS = [
    {
        'old_filename': 'NL-XX-XXX-A-HAEU.yaml',
        'institution_name': 'Historical Archives of the European Union',
        'country': 'IT',
        'country_name': 'Italy',
        'region': 'Tuscany',
        'region_code': '52',  # Italian region code
        'city': 'Firenze',
        'city_code': 'FIR',
        'address': 'Via Bolognese 156, 50139 Firenze, Villa Salviati',
        'source_url': 'https://archives.eui.eu/en/repositories/1',
        'notes': 'Part of European University Institute, Florence'
    },
    {
        'old_filename': 'NL-XX-XXX-A-VZWADEB.yaml',
        'institution_name': 'v.z.w. Archief- en Documentatiecentrum Erfgoed Binnenvaart',
        'country': 'BE',
        'country_name': 'Belgium',
        'region': 'West-Vlaanderen',
        'region_code': 'VWV',
        'city': 'Oudenburg',
        'city_code': 'OUD',
        'address': 'Vaartdijk zuid 11, 8460 Oudenburg (aboard Museumschip Tordino)',
        'source_url': 'http://binnenvaarterfgoed.be/',
        'notes': 'Belgian v.z.w. (vzw = Belgian non-profit), located aboard museum ship'
    },
    {
        'old_filename': 'NL-XX-XXX-M-FM-ford_museum.yaml',
        'institution_name': 'Gerald R. Ford Presidential Museum',
        'country': 'US',
        'country_name': 'United States',
        'region': 'Michigan',
        'region_code': 'MI',
        'city': 'Grand Rapids',
        'city_code': 'GRA',
        'address': '303 Pearl Street NW, Grand Rapids, MI 49504',
        'source_url': 'https://www.fordlibrarymuseum.gov/visit/museum',
        'notes': 'Part of National Archives system, commemorates 38th US President',
        # Update institution_type from M to O (Official Institution - Presidential Library)
        'new_institution_type': 'O',
    },
    {
        'old_filename': 'NL-XX-XXX-M-DAJ.yaml',
        'institution_name': 'Diorama Arsip Jogja',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Daerah Istimewa Yogyakarta',
        'region_code': 'YO',
        'city': 'Bantul',
        'city_code': 'BAN',
        'address': 'LT 1 Gedung DEPO ARSIP, Jl. Janti, Banguntapan, Kabupaten Bantul, Yogyakarta 55198',
        'source_url': 'https://dioramaarsip.jogjaprov.go.id/home',
        'notes': 'Digital archive diorama of Yogyakarta history, opened February 2022',
        # It's actually an Archive (A), not Museum (M)
        'new_institution_type': 'A',
    },
    # Batch 2: Added 2025-12-17 - More Indonesian and Palestinian institutions
    {
        'old_filename': 'NL-XX-XXX-M-MBV.yaml',
        'institution_name': 'Museum Benteng Vredeburg',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Daerah Istimewa Yogyakarta',
        'region_code': 'YO',
        'city': 'Yogyakarta',
        'city_code': 'YOG',
        'address': 'Jl. Margo Mulyo No.6, Ngupasan, Kec. Gondomanan, Kota Yogyakarta 55122',
        'source_url': 'https://forevervacation.com/yogyakarta/museum-benteng-vredeburg',
        'notes': 'Dutch colonial fortress converted to museum in 1992, documents Indonesian independence struggle',
    },
    {
        'old_filename': 'NL-XX-XXX-M-MBP.yaml',
        'institution_name': 'Museum Batik Pekalongan',
        'country': 'ID',
        'country_name': 'Indonesia',
        'region': 'Jawa Tengah',
        'region_code': 'JT',  # Central Java
        'city': 'Pekalongan',
        'city_code': 'PEK',
        'address': 'Jl. Jetayu No.1, Pekalongan 51152',
        'source_url': 'https://id.wikipedia.org/wiki/Museum_Batik_Pekalongan',
        'notes': 'UNESCO recognized museum for batik conservation, opened 12 July 2006 by President SBY',
    },
    {
        'old_filename': 'NL-XX-XXX-M-MG.yaml',
        'institution_name': 'Municipality of Gaza',
        'country': 'PS',
        'country_name': 'Palestine',
        'region': 'Gaza Strip',
        'region_code': 'GZ',
        'city': 'Gaza City',
        'city_code': 'GAZ',
        'address': None,  # Address not verifiable due to current situation
        'source_url': 'https://www.gaza-city.org',
        'notes': 'Municipal government, founded 1898. Type corrected from M (Museum) to O (Official Institution)',
        # It's a municipality (government), not a museum
        'new_institution_type': 'O',
    },
]
|
|
|
|
|
|
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate a deterministic UUID v5 from a GHCID string.

    Args:
        ghcid_string: The full GHCID, used as the UUID v5 "name".

    Returns:
        The canonical hyphenated string form of the UUID. The same input
        always yields the same UUID.
    """
    # This literal is the same value as the standard ``uuid.NAMESPACE_DNS``.
    # It is kept spelled out so the namespace in use is visible at the call
    # site and cannot drift.
    glam_namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(glam_namespace, ghcid_string))
|
|
|
|
|
|
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID from a GHCID string.

    Args:
        ghcid_string: The full GHCID to hash.

    Returns:
        The first 8 bytes of the SHA-256 digest of the UTF-8 encoded GHCID,
        interpreted as an unsigned big-endian integer (0 <= n < 2**64).
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    # Only the leading 8 bytes (64 bits) of the 32-byte digest are kept.
    return int.from_bytes(digest[:8], byteorder='big')
|
|
|
|
|
|
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8 (SHA-256 based) from a GHCID string.

    Args:
        ghcid_string: The full GHCID to hash.

    Returns:
        The canonical hyphenated string form of a UUID built from the first
        16 bytes of the SHA-256 digest of the UTF-8 encoded GHCID, with the
        version and variant bits overwritten.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    uuid_bytes = bytearray(digest[:16])
    # High nibble of byte 6 carries the UUID version: force it to 8.
    uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80
    # Top two bits of byte 8 carry the variant: force them to binary 10.
    uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80
    return str(uuid.UUID(bytes=bytes(uuid_bytes)))
|
|
|
|
|
|
def fix_institution(custodian_dir: Path, inst: dict) -> tuple[str | None, str | None]:
    """Move one mis-filed NL record to its real country and rewrite its identifiers.

    The record is loaded from ``custodian_dir / inst['old_filename']``, its
    GHCID is rebuilt with the corrected country, region and city codes, the
    derived identifiers are regenerated, location and provenance are updated,
    and the result is written to a file named after the new GHCID. The old
    file is then removed.

    Args:
        custodian_dir: Directory that holds the custodian YAML files.
        inst: Correction entry; reads ``old_filename``, ``country``,
            ``country_name``, ``region``, ``region_code``, ``city`` and
            ``city_code``, plus the optional ``address``, ``source_url``,
            ``notes`` and ``new_institution_type``.

    Returns:
        ``(old_path, new_path)`` as strings on success, or ``(None, None)``
        when the record was skipped. A record is skipped when the source file
        is missing, it has no usable ``ghcid.ghcid_current``, the GHCID does
        not match the ``NL-XX-XXX-...`` placeholder pattern, or the target
        file already exists.
    """
    old_path = custodian_dir / inst['old_filename']

    if not old_path.exists():
        print(f" File not found: {old_path}")
        return None, None

    # Load YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty file, and the ghcid mapping may be
    # missing or null. Skip such records instead of raising, so one bad file
    # does not abort the rest of the batch.
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    old_ghcid = ghcid_block.get('ghcid_current') if isinstance(ghcid_block, dict) else None
    if not isinstance(old_ghcid, str):
        print(f" No usable ghcid.ghcid_current in: {old_path}")
        return None, None

    # Parse old GHCID to get type and abbreviation
    # Format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
    match = re.match(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?', old_ghcid)
    if not match:
        print(f" Could not parse GHCID: {old_ghcid}")
        return None, None

    inst_type = match.group(1)
    abbrev = match.group(2)
    name_suffix = match.group(3)  # May be None

    # An explicit type override replaces the letter taken from the old GHCID.
    new_type = inst.get('new_institution_type')
    if new_type:
        inst_type = new_type

    # Build new GHCID with correct country
    new_ghcid = f"{inst['country']}-{inst['region_code']}-{inst['city_code']}-{inst_type}-{abbrev}"
    if name_suffix:
        new_ghcid += f"-{name_suffix}"

    new_filename = new_ghcid.replace('/', '_') + '.yaml'
    new_path = custodian_dir / new_filename

    # Refuse to clobber a different record that already owns the target name:
    # writing over it and then deleting the source would lose data silently.
    if new_path != old_path and new_path.exists():
        print(f" Target already exists, not overwriting: {new_path}")
        return None, None

    if new_type:
        # Also update the institution_type field
        data['institution_type'] = [inst_type]

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update location (overwrites any existing location mapping)
    data['location'] = {
        'city': inst['city'],
        'region': inst['region'],
        'country': inst['country'],
    }
    has_address = bool(inst.get('address'))
    if has_address:
        data['location']['address'] = inst['address']

    # Close out old ghcid_history entries (a missing or null history becomes [])
    history = ghcid_block.get('ghcid_history') or []
    ghcid_block['ghcid_history'] = history
    for entry in history:
        if entry.get('valid_to') is None:
            entry['valid_to'] = timestamp

    # Add new history entry
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Country code corrected: NL -> {inst['country']} ({inst['country_name']}). "
                  f"Location: {inst['city']}, {inst['region']}"
    })

    # Update current GHCID
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['ghcid_original'] = new_ghcid  # Also update original since NL was wrong
    ghcid_block['ghcid_uuid'] = new_uuid
    ghcid_block['ghcid_uuid_sha256'] = new_uuid_sha256
    ghcid_block['ghcid_numeric'] = new_numeric

    # Add location_resolution
    ghcid_block['location_resolution'] = {
        'method': 'EXA_WEB_SEARCH',
        'city_code': inst['city_code'],
        'city_name': inst['city'],
        'region_code': inst['region_code'],
        'region_name': inst['region'],
        'country_code': inst['country'],
        'resolution_date': timestamp,
        'source_url': inst.get('source_url'),
        'notes': inst.get('notes'),
    }

    # Update provenance (missing or null containers are created on demand)
    provenance = data.get('provenance') or {}
    data['provenance'] = provenance
    prov_notes = provenance.get('notes') or []
    if isinstance(prov_notes, str):
        # Keep a pre-existing single free-text note instead of failing on append.
        prov_notes = [prov_notes]
    provenance['notes'] = prov_notes
    prov_notes.append(
        f"Country code corrected on {timestamp[:10]}: NL was incorrect, "
        f"institution is in {inst['country_name']} ({inst['country']})"
    )

    # Add web search source to provenance
    sources = provenance.get('sources') or {}
    provenance['sources'] = sources
    web_search = sources.get('web_search') or []
    sources['web_search'] = web_search

    # Only claim an address when one was actually supplied for this entry.
    claims_extracted = ['country', 'region', 'city']
    if has_address:
        claims_extracted.append('address')

    web_search.append({
        'source_type': 'exa_web_search',
        'data_tier': 'TIER_2_VERIFIED',  # Higher tier since we verified country
        'source_url': inst.get('source_url'),
        'extraction_timestamp': timestamp,
        'claims_extracted': claims_extracted,
    })

    # Write updated YAML to new filename
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Remove old file
    if old_path != new_path:
        old_path.unlink()

    return str(old_path), str(new_path)
|
|
|
|
|
|
def main(custodian_dir: Path | str | None = None) -> None:
    """Apply every correction in NON_DUTCH_INSTITUTIONS and print a report.

    Args:
        custodian_dir: Directory containing the custodian YAML files. When
            omitted, the original hard-coded project path is used, so calling
            ``main()`` with no arguments behaves as before.
    """
    if custodian_dir is None:
        custodian_dir = '/Users/kempersc/apps/glam/data/custodian'
    custodian_dir = Path(custodian_dir)

    print("=" * 70)
    print("Fixing Non-Dutch Institutions (Country Code Corrections)")
    print("=" * 70)

    success_count = 0

    for inst in NON_DUTCH_INSTITUTIONS:
        print(f"\nProcessing: {inst['old_filename']}")
        print(f" Institution: {inst['institution_name']}")
        print(f" Correction: NL -> {inst['country']} ({inst['country_name']})")
        print(f" Location: {inst['city']}, {inst['region']}")

        old_path, new_path = fix_institution(custodian_dir, inst)

        # fix_institution returns (None, None) for skipped records; it has
        # already printed the reason, so only successes are reported here.
        if old_path and new_path:
            old_name = os.path.basename(old_path)
            new_name = os.path.basename(new_path)
            print(f" Renamed: {old_name}")
            print(f" -> {new_name}")
            success_count += 1

    print("\n" + "=" * 70)
    print(f"Summary: {success_count}/{len(NON_DUTCH_INSTITUTIONS)} institutions corrected")
    print("=" * 70)
|
|
|
|
|
|
# Run the corrections only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|