#!/usr/bin/env python3
"""Fix XXX files with incorrect country codes based on city location.

Each listed custodian YAML file carries a placeholder GHCID of the form
``NL-XX-XXX-{TYPE}-{ABBREV}``.  For every file the script looks up the
institution's real location in ``CITY_LOCATION_MAP``, builds a corrected
GHCID, rewrites the record and renames the file after the new GHCID.

Usage::

    fix_script.py [--dry-run] [--custodian-dir PATH]
"""

import argparse
import hashlib
import os
import re
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Sequence

try:
    import yaml
except ImportError:  # pragma: no cover - depends on the environment
    # PyYAML is only needed once a file is actually read or written; keeping
    # the import soft lets the pure ID helpers be imported without it.
    yaml = None

# Directory that holds the custodian YAML files when none is given on the CLI.
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# Placeholder GHCID layout: NL-XX-XXX-{TYPE}-{ABBREV}
_OLD_GHCID_RE = re.compile(r'NL-XX-XXX-([A-Z])-(.+)')

# Lookup key -> (country code, region code, city name used for the city code)
CITY_LOCATION_MAP = {
    # Australia
    'canberra': ('AU', 'ACT', 'Canberra'),
    'sydney': ('AU', 'NSW', 'Sydney'),
    # Bangladesh
    'dhaka': ('BD', '13', 'Dhaka'),
    # Belgium
    'gent': ('BE', 'VOV', 'Ghent'),
    'ghent': ('BE', 'VOV', 'Ghent'),
    'maasmechelen': ('BE', 'VLI', 'Maasmechelen'),
    # Czech Republic
    'prague': ('CZ', '10', 'Prague'),
    # France
    'normandy': ('FR', 'NOR', 'Caen'),  # Normandy region, Caen city
    # Germany
    'bonn': ('DE', 'NW', 'Bonn'),
    'münchen': ('DE', 'BY', 'Munich'),
    'munich': ('DE', 'BY', 'Munich'),
    'rhineland-palatinate': ('DE', 'RP', 'Mainz'),  # State-level, use capital
    # Indonesia
    'agats': ('ID', 'PB', 'Agats'),
    'ubud': ('ID', 'BA', 'Ubud'),
    'denpasar': ('ID', 'BA', 'Denpasar'),
    'makassar': ('ID', 'SN', 'Makassar'),
    'kediri': ('ID', 'JI', 'Kediri'),
    'depok': ('ID', 'JB', 'Depok'),
    'bantul': ('ID', 'YO', 'Bantul'),
    'sidoarjo': ('ID', 'JI', 'Sidoarjo'),
    'bali': ('ID', 'BA', 'Denpasar'),  # Use Denpasar as capital
    'karangasem': ('ID', 'BA', 'Karangasem'),
    'gianyar': ('ID', 'BA', 'Gianyar'),
    'tangerang': ('ID', 'BT', 'Tangerang'),
    'aceh': ('ID', 'AC', 'Banda Aceh'),
    'surakarta': ('ID', 'JT', 'Surakarta'),
    'pererenan': ('ID', 'BA', 'Pererenan'),
    'medan': ('ID', 'SU', 'Medan'),
    # South Africa
    'pretoria': ('ZA', 'GT', 'Pretoria'),
    # Turkey
    'istanbul': ('TR', '34', 'Istanbul'),
    # UK
    'essex': ('GB', 'ENG', 'Chelmsford'),  # County, use county town
    'pulborough': ('GB', 'ENG', 'Pulborough'),
    'glasgow': ('GB', 'SCT', 'Glasgow'),
    'windsor': ('GB', 'ENG', 'Windsor'),
    # USA
    'amelia': ('US', 'VA', 'Amelia'),
}

# (file name inside the custodian directory, key into CITY_LOCATION_MAP)
FILES_TO_FIX = [
    ('NL-XX-XXX-L-NLA.yaml', 'canberra'),
    ('NL-XX-XXX-M-IFDA.yaml', 'sydney'),
    ('NL-XX-XXX-M-OOBM.yaml', 'dhaka'),
    ('NL-XX-XXX-M-MV-maasmechelen_village.yaml', 'maasmechelen'),
    ('NL-XX-XXX-A-MFIFA.yaml', 'gent'),
    ('NL-XX-XXX-R-EAAE.yaml', 'prague'),
    ('NL-XX-XXX-M-N.yaml', 'rhineland-palatinate'),
    ('NL-XX-XXX-M-DSDGFMP.yaml', 'bonn'),
    ('NL-XX-XXX-M-ZFK.yaml', 'münchen'),
    ('NL-XX-XXX-M-SGILUKM.yaml', 'münchen'),
    ('NL-XX-XXX-M-CM-caen_memorial.yaml', 'normandy'),
    ('NL-XX-XXX-M-AH.yaml', 'essex'),
    ('NL-XX-XXX-M-AHLU.yaml', 'pulborough'),
    ('NL-XX-XXX-M-CA.yaml', 'windsor'),
    ('NL-XX-XXX-L-DPC.yaml', 'glasgow'),
    ('NL-XX-XXX-M-RZWMNS.yaml', 'medan'),
    ('NL-XX-XXX-M-PM.yaml', 'aceh'),
    ('NL-XX-XXX-M-MT-museum_of_toys.yaml', 'tangerang'),
    ('NL-XX-XXX-M-MP-museum_pendet.yaml', 'bali'),
    ('NL-XX-XXX-M-TPM.yaml', 'surakarta'),
    ('NL-XX-XXX-M-AMCP.yaml', 'agats'),
    ('NL-XX-XXX-M-MA-museum_airlangga.yaml', 'kediri'),
    ('NL-XX-XXX-M-MM-museum_muhammadiyah.yaml', 'bantul'),
    ('NL-XX-XXX-M-SNSM.yaml', 'bali'),
    ('NL-XX-XXX-M-WSM.yaml', 'pererenan'),
    ('NL-XX-XXX-M-BFYHB.yaml', 'makassar'),
    ('NL-XX-XXX-M-MR-museum_rudana.yaml', 'gianyar'),
    ('NL-XX-XXX-M-APAMR.yaml', 'denpasar'),
    ('NL-XX-XXX-M-YG.yaml', 'aceh'),
    ('NL-XX-XXX-M-AMRUB.yaml', 'ubud'),
    ('NL-XX-XXX-M-MTA.yaml', 'aceh'),
    ('NL-XX-XXX-M-MB.yaml', 'depok'),
    ('NL-XX-XXX-M-MPL.yaml', 'karangasem'),
    ('NL-XX-XXX-M-MMT.yaml', 'sidoarjo'),
    ('NL-XX-XXX-M-AIACFPAAF.yaml', 'istanbul'),
    ('NL-XX-XXX-R-AFRICAA.yaml', 'amelia'),
    ('NL-XX-XXX-M-DEH.yaml', 'pretoria'),
]


def generate_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name.

    Diacritics are stripped (``München`` -> ``MUN``) and the first three
    characters are upper-cased.  An empty name yields the placeholder
    ``'XXX'``; a name shorter than three characters yields a shorter code.
    """
    if not city_name:
        return 'XXX'

    # NFD splits accented letters into base letter + combining mark ('Mn'),
    # so dropping the marks leaves the plain base letters.
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    return ascii_name[:3].upper()


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate UUID v5 from GHCID string.

    The same GHCID always maps to the same UUID because UUID v5 is a
    name-based hash within a fixed namespace.
    """
    GLAM_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')  # DNS namespace
    return str(uuid.uuid5(GLAM_NAMESPACE, ghcid_string))


def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from GHCID string.

    Returns the first 8 bytes of the SHA-256 digest of the UTF-8 encoded
    GHCID, interpreted as an unsigned big-endian integer.
    """
    sha256_hash = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(sha256_hash[:8], byteorder='big')


def fix_xxx_file(filepath: Path, city_key: str, dry_run: bool = False) -> bool:
    """Fix a single XXX file with correct country code.

    Args:
        filepath: Path of the YAML record carrying a ``NL-XX-XXX-...`` GHCID.
        city_key: Key into ``CITY_LOCATION_MAP`` giving the real location.
        dry_run: When true, only print the old and new GHCID; nothing is
            written, renamed or deleted.

    Returns:
        True if the file was fixed (or would be, in a dry run).  False if the
        city is unknown, the file does not contain a mapping with a
        placeholder GHCID, or the target file name is already taken.

    Raises:
        RuntimeError: If PyYAML is not installed.
        OSError: If the file cannot be read, written or removed.
    """
    if city_key not in CITY_LOCATION_MAP:
        print(f" ⚠ Unknown city: {city_key}")
        return False

    if yaml is None:
        raise RuntimeError("PyYAML is required to read and rewrite custodian files")

    country, region, city_name = CITY_LOCATION_MAP[city_key]
    city_code = generate_city_code(city_name)

    # Read file
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None and a scalar/list document has no keys;
    # neither can hold a GHCID, so report it instead of crashing.
    if not isinstance(data, dict):
        print(f" ⚠ Invalid YAML content (expected a mapping): {filepath.name}")
        return False

    # A missing or null ``ghcid`` entry is treated as an empty mapping.
    old_block = data.get('ghcid')
    if not isinstance(old_block, dict):
        old_block = {}

    old_ghcid = old_block.get('ghcid_current') or ''

    # Extract type and abbreviation from old GHCID
    # Format: NL-XX-XXX-{TYPE}-{ABBREV}
    match = _OLD_GHCID_RE.match(old_ghcid) if isinstance(old_ghcid, str) else None
    if not match:
        print(f" ⚠ Invalid GHCID format: {old_ghcid}")
        return False

    inst_type, abbreviation = match.groups()

    # Generate new GHCID; the file is renamed after it.
    new_ghcid = f"{country}-{region}-{city_code}-{inst_type}-{abbreviation}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    print(f" Old: {old_ghcid}")
    print(f" New: {new_ghcid}")

    # Never clobber a different record that already owns the target name.
    if new_filepath != filepath and new_filepath.exists():
        print(f" ⚠ Target already exists, not overwriting: {new_filename}")
        return False

    if dry_run:
        return True

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update location (replaces any previous location mapping)
    data['location'] = {
        'city': city_name,
        'region': region,
        'country': country
    }

    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    # A null ``ghcid_history`` is treated like a missing one.
    history = old_block.get('ghcid_history') or []

    # Mark old entry as ended (newest entry is kept at index 0)
    if history and history[0].get('valid_to') is None:
        history[0]['valid_to'] = timestamp

    # Add new entry
    history.insert(0, {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Corrected country from NL to {country} based on city location ({city_name})"
    })

    # NOTE(review): keys under ``ghcid`` other than the ones listed here are
    # dropped, and ``ghcid_uuid_sha256`` is carried over from the old record
    # rather than recomputed for the new GHCID - confirm both are intended.
    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_original': old_block.get('ghcid_original', old_ghcid),
        'ghcid_uuid': new_uuid,
        'ghcid_uuid_sha256': old_block.get('ghcid_uuid_sha256', ''),
        'ghcid_numeric': new_numeric,
        'record_id': old_block.get('record_id', str(uuid.uuid4())),
        'generation_timestamp': timestamp,
        'ghcid_history': history,
        'location_resolution': {
            'method': 'CITY_LOOKUP',
            'city_code': city_code,
            'city_name': city_name,
            'region_code': region,
            'country_code': country,
            'resolution_date': timestamp
        }
    }

    # Write updated file with new name
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Delete old file if different name (only after the new one is written)
    if new_filepath != filepath:
        os.remove(filepath)
        print(f" Created: {new_filename}")
        print(f" Deleted: {filepath.name}")

    return True


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Process every entry of ``FILES_TO_FIX`` and print a summary.

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Fix XXX files with incorrect country codes based on city location."
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=DEFAULT_CUSTODIAN_DIR,
        help="directory containing the custodian YAML files (default: %(default)s)",
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="print the old and new GHCID for each file without changing anything",
    )
    args = parser.parse_args(argv)

    print("=" * 80)
    print("FIXING NON-DUTCH INSTITUTION COUNTRY CODES")
    print("=" * 80)

    success_count = 0
    for filename, city_key in FILES_TO_FIX:
        filepath = args.custodian_dir / filename

        if not filepath.exists():
            print(f"\n⚠ File not found: {filename}")
            continue

        print(f"\n=== Processing: {filename} ===")
        if fix_xxx_file(filepath, city_key, dry_run=args.dry_run):
            success_count += 1

    verb = "Would fix" if args.dry_run else "Fixed"
    print("\n" + "=" * 80)
    print(f"SUMMARY: {verb} {success_count}/{len(FILES_TO_FIX)} files")
    print("=" * 80)


if __name__ == '__main__':
    main()