glam/scripts/fix_xxx_country_codes.py
2025-12-17 10:11:56 +01:00

246 lines
8.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""Fix XXX files with incorrect country codes based on city location."""
import os
import re
import yaml
import uuid
import hashlib
from pathlib import Path
from datetime import datetime, timezone
# Lookup key -> (country code, region code, city name).
# Keys are lowercase and are not always cities: a few are regions, states or
# counties, in which case the tuple names a representative city (see the
# inline notes). Spelling variants of one place share the same tuple.
CITY_LOCATION_MAP = {
    # --- Australia ---
    'canberra': ('AU', 'ACT', 'Canberra'),
    'sydney': ('AU', 'NSW', 'Sydney'),

    # --- Bangladesh ---
    'dhaka': ('BD', '13', 'Dhaka'),

    # --- Belgium ---
    'gent': ('BE', 'VOV', 'Ghent'),
    'ghent': ('BE', 'VOV', 'Ghent'),
    'maasmechelen': ('BE', 'VLI', 'Maasmechelen'),

    # --- Czech Republic ---
    'prague': ('CZ', '10', 'Prague'),

    # --- France ---
    'normandy': ('FR', 'NOR', 'Caen'),  # key is the Normandy region; city is Caen

    # --- Germany ---
    'bonn': ('DE', 'NW', 'Bonn'),
    'münchen': ('DE', 'BY', 'Munich'),
    'munich': ('DE', 'BY', 'Munich'),
    'rhineland-palatinate': ('DE', 'RP', 'Mainz'),  # key is a state; capital used

    # --- Indonesia ---
    # NOTE(review): 'PB' is the ISO 3166-2 code for Papua Barat; confirm that
    # it is the intended province code for Agats.
    'agats': ('ID', 'PB', 'Agats'),
    'ubud': ('ID', 'BA', 'Ubud'),
    'denpasar': ('ID', 'BA', 'Denpasar'),
    'makassar': ('ID', 'SN', 'Makassar'),
    'kediri': ('ID', 'JI', 'Kediri'),
    'depok': ('ID', 'JB', 'Depok'),
    'bantul': ('ID', 'YO', 'Bantul'),
    'sidoarjo': ('ID', 'JI', 'Sidoarjo'),
    'bali': ('ID', 'BA', 'Denpasar'),  # Denpasar used as the capital
    'karangasem': ('ID', 'BA', 'Karangasem'),
    'gianyar': ('ID', 'BA', 'Gianyar'),
    'tangerang': ('ID', 'BT', 'Tangerang'),
    'aceh': ('ID', 'AC', 'Banda Aceh'),
    'surakarta': ('ID', 'JT', 'Surakarta'),
    'pererenan': ('ID', 'BA', 'Pererenan'),
    'medan': ('ID', 'SU', 'Medan'),

    # --- South Africa ---
    'pretoria': ('ZA', 'GT', 'Pretoria'),

    # --- Turkey ---
    'istanbul': ('TR', '34', 'Istanbul'),

    # --- UK ---
    'essex': ('GB', 'ENG', 'Chelmsford'),  # key is a county; county town used
    'pulborough': ('GB', 'ENG', 'Pulborough'),
    'glasgow': ('GB', 'SCT', 'Glasgow'),
    'windsor': ('GB', 'ENG', 'Windsor'),

    # --- USA ---
    'amelia': ('US', 'VA', 'Amelia'),
}
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter city code from a city name.

    Diacritics are removed (``München`` -> ``MUN``) and every character that
    is not a letter (whitespace, punctuation, digits) is skipped before the
    first three letters are taken. Without that filter a name such as
    ``La Paz`` produced ``'LA '`` and ``St. Louis`` produced ``'ST.'``, and
    the stray character ended up in the identifier and the output filename.

    Args:
        city_name: Human-readable city name; may be empty.

    Returns:
        The first three letters of the name, upper-cased. ``'XXX'`` is
        returned when the name is empty or contains no letters at all. A
        name with fewer than three letters yields a correspondingly shorter
        code.
    """
    if not city_name:
        return 'XXX'

    # Kept as a function-local import so the block does not depend on the
    # module's import list.
    import unicodedata

    # NFD splits accented characters into base letter + combining mark;
    # dropping the marks (category 'Mn') leaves the bare base letters.
    decomposed = unicodedata.normalize('NFD', city_name)
    letters = ''.join(
        ch for ch in decomposed
        if unicodedata.category(ch) != 'Mn' and ch.isalpha()
    )
    if not letters:
        # e.g. whitespace-only or purely numeric input
        return 'XXX'

    return letters[:3].upper()
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the name-based (version 5) UUID of a GHCID string.

    The namespace is a fixed UUID, so the result is deterministic: the same
    GHCID string always maps to the same UUID.

    Args:
        ghcid_string: The GHCID to derive the UUID from.

    Returns:
        The UUID in its canonical hyphenated string form.
    """
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')  # DNS namespace
    derived = uuid.uuid5(namespace, ghcid_string)
    return str(derived)
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return a 64-bit unsigned integer derived from a GHCID string.

    The value is the leading 8 bytes of the SHA-256 digest of the UTF-8
    encoded string, interpreted as a big-endian integer.

    Args:
        ghcid_string: The GHCID to hash.

    Returns:
        An integer in the range ``0 .. 2**64 - 1``.
    """
    hex_digest = hashlib.sha256(ghcid_string.encode('utf-8')).hexdigest()
    # 16 hex digits are exactly the first 8 digest bytes, most significant first.
    return int(hex_digest[:16], 16)
def fix_xxx_file(filepath: Path, city_key: str, dry_run: bool = False) -> bool:
    """Fix a single ``NL-XX-XXX-*`` file so it carries its real location.

    The file's current GHCID (``NL-XX-XXX-{TYPE}-{ABBREV}``) is rewritten to
    ``{country}-{region}-{city_code}-{TYPE}-{ABBREV}`` using the entry for
    ``city_key`` in ``CITY_LOCATION_MAP``. The ``location`` and ``ghcid``
    sections are replaced, a history entry is prepended, the result is
    written to ``<new GHCID>.yaml`` next to the original, and the original
    file is removed.

    Args:
        filepath: Path of the YAML file to fix.
        city_key: Key into ``CITY_LOCATION_MAP``.
        dry_run: When true, only print the old and new GHCID; nothing is
            written or deleted.

    Returns:
        ``True`` if the file was fixed (or, in a dry run, could be fixed).
        ``False`` if the city key is unknown, the file does not contain a
        YAML mapping, the current GHCID does not match the expected
        ``NL-XX-XXX-...`` pattern, or the target file already exists.

    Raises:
        Errors from opening/reading/writing the file and from the YAML
        parser are not caught here and propagate to the caller.
    """
    if city_key not in CITY_LOCATION_MAP:
        print(f" ⚠ Unknown city: {city_key}")
        return False
    country, region, city_name = CITY_LOCATION_MAP[city_key]
    city_code = generate_city_code(city_name)

    # Read file
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty document loads as None and a scalar/list document has no
    # .get(); report it as unfixable and let the caller continue.
    if not isinstance(data, dict):
        print(f" ⚠ Invalid YAML content: {filepath.name}")
        return False

    # `ghcid:` may be missing or explicitly null; treat both as "no data".
    ghcid_block = data.get('ghcid')
    if not isinstance(ghcid_block, dict):
        ghcid_block = {}
    old_ghcid = ghcid_block.get('ghcid_current') or ''

    # Extract type and abbreviation from old GHCID
    # Format: NL-XX-XXX-{TYPE}-{ABBREV}
    match = re.match(r'NL-XX-XXX-([A-Z])-(.+)', str(old_ghcid))
    if not match:
        print(f" ⚠ Invalid GHCID format: {old_ghcid}")
        return False
    inst_type = match.group(1)
    abbreviation = match.group(2)

    # Generate new GHCID
    new_ghcid = f"{country}-{region}-{city_code}-{inst_type}-{abbreviation}"
    print(f" Old: {old_ghcid}")
    print(f" New: {new_ghcid}")

    # Never clobber another record: the old file is deleted after the write,
    # so overwriting an existing target would lose one of the two records.
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename
    if new_filepath != filepath and new_filepath.exists():
        print(f" ⚠ Target already exists, not overwriting: {new_filename}")
        return False

    if dry_run:
        return True

    # One timestamp is shared by every field touched in this run.
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update location (the previous location section is replaced entirely)
    data['location'] = {
        'city': city_name,
        'region': region,
        'country': country
    }

    # Derived identifiers for the new GHCID
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    # History is kept newest-first; a missing or null list starts empty.
    history = ghcid_block.get('ghcid_history') or []
    # Mark old entry as ended
    if history and history[0].get('valid_to') is None:
        history[0]['valid_to'] = timestamp
    # Add new entry
    history.insert(0, {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Corrected country from NL to {country} based on city location ({city_name})"
    })

    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_original': ghcid_block.get('ghcid_original', old_ghcid),
        'ghcid_uuid': new_uuid,
        # NOTE(review): carried over unchanged from the old record, so it may
        # no longer correspond to the new GHCID — confirm whether it should
        # be regenerated.
        'ghcid_uuid_sha256': ghcid_block.get('ghcid_uuid_sha256', ''),
        'ghcid_numeric': new_numeric,
        'record_id': ghcid_block.get('record_id', str(uuid.uuid4())),
        'generation_timestamp': timestamp,
        'ghcid_history': history,
        'location_resolution': {
            'method': 'CITY_LOOKUP',
            'city_code': city_code,
            'city_name': city_name,
            'region_code': region,
            'country_code': country,
            'resolution_date': timestamp
        }
    }

    # Write updated file with new name
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Delete old file if different name (only after the new file is written)
    if new_filepath != filepath:
        filepath.unlink()
        print(f" Created: {new_filename}")
        print(f" Deleted: {filepath.name}")
    return True
def main(argv=None):
    """Fix the known list of mis-filed ``NL-XX-XXX-*`` custodian files.

    Each listed file is looked up in the custodian directory and passed to
    ``fix_xxx_file`` together with its city key. Missing files are reported
    and skipped. A file that cannot be read, parsed or written is reported
    and skipped as well, so one bad file does not abort the rest of the
    batch. A summary line is printed at the end; its denominator is the
    number of listed files, including any that were not found.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read them from ``sys.argv``.
            Supported options are ``--custodian-dir PATH`` and ``--dry-run``;
            with neither given the behaviour matches the original script.
    """
    # Local import: only the CLI entry point needs argparse.
    import argparse

    parser = argparse.ArgumentParser(
        description="Fix XXX files with incorrect country codes based on city location."
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help="directory containing the custodian YAML files",
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="print the old and new GHCIDs without writing or deleting files",
    )
    args = parser.parse_args(argv)
    custodian_dir = args.custodian_dir

    # Files to fix with their city keys
    files_to_fix = [
        ('NL-XX-XXX-L-NLA.yaml', 'canberra'),
        ('NL-XX-XXX-M-IFDA.yaml', 'sydney'),
        ('NL-XX-XXX-M-OOBM.yaml', 'dhaka'),
        ('NL-XX-XXX-M-MV-maasmechelen_village.yaml', 'maasmechelen'),
        ('NL-XX-XXX-A-MFIFA.yaml', 'gent'),
        ('NL-XX-XXX-R-EAAE.yaml', 'prague'),
        ('NL-XX-XXX-M-N.yaml', 'rhineland-palatinate'),
        ('NL-XX-XXX-M-DSDGFMP.yaml', 'bonn'),
        ('NL-XX-XXX-M-ZFK.yaml', 'münchen'),
        ('NL-XX-XXX-M-SGILUKM.yaml', 'münchen'),
        ('NL-XX-XXX-M-CM-caen_memorial.yaml', 'normandy'),
        ('NL-XX-XXX-M-AH.yaml', 'essex'),
        ('NL-XX-XXX-M-AHLU.yaml', 'pulborough'),
        ('NL-XX-XXX-M-CA.yaml', 'windsor'),
        ('NL-XX-XXX-L-DPC.yaml', 'glasgow'),
        ('NL-XX-XXX-M-RZWMNS.yaml', 'medan'),
        ('NL-XX-XXX-M-PM.yaml', 'aceh'),
        ('NL-XX-XXX-M-MT-museum_of_toys.yaml', 'tangerang'),
        ('NL-XX-XXX-M-MP-museum_pendet.yaml', 'bali'),
        ('NL-XX-XXX-M-TPM.yaml', 'surakarta'),
        ('NL-XX-XXX-M-AMCP.yaml', 'agats'),
        ('NL-XX-XXX-M-MA-museum_airlangga.yaml', 'kediri'),
        ('NL-XX-XXX-M-MM-museum_muhammadiyah.yaml', 'bantul'),
        ('NL-XX-XXX-M-SNSM.yaml', 'bali'),
        ('NL-XX-XXX-M-WSM.yaml', 'pererenan'),
        ('NL-XX-XXX-M-BFYHB.yaml', 'makassar'),
        ('NL-XX-XXX-M-MR-museum_rudana.yaml', 'gianyar'),
        ('NL-XX-XXX-M-APAMR.yaml', 'denpasar'),
        ('NL-XX-XXX-M-YG.yaml', 'aceh'),
        ('NL-XX-XXX-M-AMRUB.yaml', 'ubud'),
        ('NL-XX-XXX-M-MTA.yaml', 'aceh'),
        ('NL-XX-XXX-M-MB.yaml', 'depok'),
        ('NL-XX-XXX-M-MPL.yaml', 'karangasem'),
        ('NL-XX-XXX-M-MMT.yaml', 'sidoarjo'),
        ('NL-XX-XXX-M-AIACFPAAF.yaml', 'istanbul'),
        ('NL-XX-XXX-R-AFRICAA.yaml', 'amelia'),
        ('NL-XX-XXX-M-DEH.yaml', 'pretoria'),
    ]

    print("=" * 80)
    print("FIXING NON-DUTCH INSTITUTION COUNTRY CODES")
    print("=" * 80)

    success_count = 0
    for filename, city_key in files_to_fix:
        filepath = custodian_dir / filename
        if not filepath.exists():
            print(f"\n⚠ File not found: {filename}")
            continue
        print(f"\n=== Processing: {filename} ===")
        try:
            fixed = fix_xxx_file(filepath, city_key, dry_run=args.dry_run)
        except (OSError, yaml.YAMLError) as exc:
            # Report and move on so the remaining files are still processed.
            print(f" ⚠ Failed to process {filename}: {exc}")
            continue
        if fixed:
            success_count += 1

    # In a dry run nothing was changed, so do not claim files were fixed.
    verb = "Would fix" if args.dry_run else "Fixed"
    print("\n" + "=" * 80)
    print(f"SUMMARY: {verb} {success_count}/{len(files_to_fix)} files")
    print("=" * 80)


if __name__ == '__main__':
    main()