246 lines
8.4 KiB
Python
Executable file
246 lines
8.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Fix XXX files with incorrect country codes based on city location."""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
import uuid
|
|
import hashlib
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Lookup table keyed by lower-case place name. Most keys are cities; a few are
# regions or counties that stand in for a city (see the per-entry comments).
# Each value is a 3-tuple, unpacked by fix_xxx_file() as:
#     (country code, region code, canonical city name)
# The canonical city name is the one that gets abbreviated into the city part
# of the new GHCID, so spelling variants of one city must share it.
CITY_LOCATION_MAP = {
    # Australia
    'canberra': ('AU', 'ACT', 'Canberra'),
    'sydney': ('AU', 'NSW', 'Sydney'),
    # Bangladesh
    'dhaka': ('BD', '13', 'Dhaka'),
    # Belgium
    'gent': ('BE', 'VOV', 'Ghent'),
    'ghent': ('BE', 'VOV', 'Ghent'),
    'maasmechelen': ('BE', 'VLI', 'Maasmechelen'),
    # Czech Republic
    'prague': ('CZ', '10', 'Prague'),
    # France
    'normandy': ('FR', 'NOR', 'Caen'),  # Normandy region, Caen city
    # Germany
    'bonn': ('DE', 'NW', 'Bonn'),
    'münchen': ('DE', 'BY', 'Munich'),
    'munich': ('DE', 'BY', 'Munich'),
    'rhineland-palatinate': ('DE', 'RP', 'Mainz'),  # State-level, use capital
    # Indonesia
    # Agats is the seat of Asmat Regency in Papua, not West Papua ('PB').
    # NOTE(review): switch to 'PS' if the project adopts the 2022 South Papua
    # split of the province codes.
    'agats': ('ID', 'PA', 'Agats'),
    'ubud': ('ID', 'BA', 'Ubud'),
    'denpasar': ('ID', 'BA', 'Denpasar'),
    'makassar': ('ID', 'SN', 'Makassar'),
    'kediri': ('ID', 'JI', 'Kediri'),
    'depok': ('ID', 'JB', 'Depok'),
    'bantul': ('ID', 'YO', 'Bantul'),
    'sidoarjo': ('ID', 'JI', 'Sidoarjo'),
    'bali': ('ID', 'BA', 'Denpasar'),  # Use Denpasar as capital
    'karangasem': ('ID', 'BA', 'Karangasem'),
    'gianyar': ('ID', 'BA', 'Gianyar'),
    'tangerang': ('ID', 'BT', 'Tangerang'),
    'aceh': ('ID', 'AC', 'Banda Aceh'),
    'surakarta': ('ID', 'JT', 'Surakarta'),
    'pererenan': ('ID', 'BA', 'Pererenan'),
    'medan': ('ID', 'SU', 'Medan'),
    # South Africa
    'pretoria': ('ZA', 'GT', 'Pretoria'),
    # Turkey
    'istanbul': ('TR', '34', 'Istanbul'),
    # UK
    'essex': ('GB', 'ENG', 'Chelmsford'),  # County, use county town
    'pulborough': ('GB', 'ENG', 'Pulborough'),
    'glasgow': ('GB', 'SCT', 'Glasgow'),
    'windsor': ('GB', 'ENG', 'Windsor'),
    # USA
    'amelia': ('US', 'VA', 'Amelia'),
}
|
|
|
|
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter city code from a city name.

    Diacritics are stripped (``München`` -> ``MUN``). Every character that is
    not an ASCII letter -- spaces, apostrophes, hyphens, dots, digits, and
    letters that have no ASCII decomposition -- is skipped, so the code only
    ever contains ``A``-``Z`` and is safe to embed in an identifier or a
    file name.

    Args:
        city_name: Human-readable city name; may be empty.

    Returns:
        The first three ASCII letters of the name, upper-cased. A name with
        fewer than three usable letters yields a correspondingly shorter
        code; a name with none (including the empty string) yields the
        placeholder ``'XXX'``.
    """
    if not city_name:
        return 'XXX'

    # Imported locally because the module-level imports do not include it.
    import unicodedata

    # NFD splits an accented letter into its base letter followed by combining
    # marks; the marks are non-ASCII and are dropped by the filter below.
    decomposed = unicodedata.normalize('NFD', city_name)
    letters = [ch for ch in decomposed if ch.isascii() and ch.isalpha()]

    if not letters:
        # Nothing usable (e.g. digits or punctuation only): same placeholder
        # as for an empty name rather than an empty code.
        return 'XXX'

    return ''.join(letters[:3]).upper()
|
|
|
|
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the name-based (version 5) UUID of a GHCID, as a string.

    The UUID is derived from the GHCID text under the standard DNS namespace
    (6ba7b810-9dad-11d1-80b4-00c04fd430c8), so the same GHCID always maps to
    the same UUID.
    """
    # uuid.NAMESPACE_DNS is the stdlib constant for 6ba7b810-9dad-11d1-80b4-00c04fd430c8.
    ghcid_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, ghcid_string)
    return str(ghcid_uuid)
|
|
|
|
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return an unsigned 64-bit integer ID derived from a GHCID string.

    The value is the leading 8 bytes of the SHA-256 digest of the UTF-8
    encoded GHCID, interpreted as a big-endian integer.
    """
    digest_hex = hashlib.sha256(ghcid_string.encode('utf-8')).hexdigest()
    # 16 hex digits are exactly the first 8 digest bytes, most significant first.
    leading_64_bits = digest_hex[:16]
    return int(leading_64_bits, 16)
|
|
|
|
def fix_xxx_file(filepath: Path, city_key: str, dry_run: bool = False) -> bool:
    """Re-key one ``NL-XX-XXX-*`` custodian file to its real location.

    The file's current GHCID must look like ``NL-XX-XXX-{TYPE}-{ABBREV}``.
    Its country, region and city parts are replaced with the values that
    ``CITY_LOCATION_MAP`` holds for ``city_key``; the type letter and the
    abbreviation are kept. The record's ``location`` and ``ghcid`` sections
    are rewritten, the result is saved as ``{new GHCID}.yaml`` next to the
    original, and the original file is removed.

    Args:
        filepath: Path of the YAML file to fix.
        city_key: Key into ``CITY_LOCATION_MAP``.
        dry_run: When true, only print the old and new GHCID; nothing on disk
            is touched.

    Returns:
        ``True`` if the file was fixed (or, in a dry run, could be fixed).
        ``False`` if the city is unknown, the file is not a YAML mapping, the
        current GHCID does not match the expected pattern, or a different
        file already occupies the target name.
    """
    if city_key not in CITY_LOCATION_MAP:
        print(f" ⚠ Unknown city: {city_key}")
        return False

    country, region, city_name = CITY_LOCATION_MAP[city_key]
    city_code = generate_city_code(city_name)

    # Read file
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None and a scalar/list document has no keys;
    # report it like the other unusable inputs instead of raising.
    if not isinstance(data, dict):
        print(f" ⚠ Not a YAML mapping: {filepath.name}")
        return False

    # 'ghcid' may be absent or explicitly null; treat both as "no GHCID data".
    ghcid_block = data.get('ghcid')
    if not isinstance(ghcid_block, dict):
        ghcid_block = {}

    old_ghcid = str(ghcid_block.get('ghcid_current') or '')

    # Extract type and abbreviation from old GHCID
    # Format: NL-XX-XXX-{TYPE}-{ABBREV}
    match = re.match(r'NL-XX-XXX-([A-Z])-(.+)', old_ghcid)
    if not match:
        print(f" ⚠ Invalid GHCID format: {old_ghcid}")
        return False

    inst_type = match.group(1)
    abbreviation = match.group(2)

    # Generate new GHCID
    new_ghcid = f"{country}-{region}-{city_code}-{inst_type}-{abbreviation}"

    print(f" Old: {old_ghcid}")
    print(f" New: {new_ghcid}")

    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Writing over another record and then deleting the source would lose
    # data. Checked before the dry-run return so a dry run predicts the
    # outcome of the real run.
    if new_filepath != filepath and new_filepath.exists():
        print(f" ⚠ Target already exists, not overwriting: {new_filename}")
        return False

    if dry_run:
        return True

    # One timestamp for every field written below so they all agree.
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update location
    data['location'] = {
        'city': city_name,
        'region': region,
        'country': country
    }

    # Update GHCID
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    # History is kept newest-first; a missing or null history starts empty.
    history = ghcid_block.get('ghcid_history')
    if not isinstance(history, list):
        history = []

    # Mark old entry as ended
    if history and isinstance(history[0], dict) and history[0].get('valid_to') is None:
        history[0]['valid_to'] = timestamp

    # Add new entry
    history.insert(0, {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Corrected country from NL to {country} based on city location ({city_name})"
    })

    # The section is rebuilt from exactly these keys; any other key that was
    # present under 'ghcid' is not carried over.
    # NOTE(review): 'ghcid_uuid_sha256' is copied unchanged although the GHCID
    # it presumably derives from has changed -- confirm whether it must be
    # regenerated.
    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_original': ghcid_block.get('ghcid_original', old_ghcid),
        'ghcid_uuid': new_uuid,
        'ghcid_uuid_sha256': ghcid_block.get('ghcid_uuid_sha256', ''),
        'ghcid_numeric': new_numeric,
        'record_id': ghcid_block.get('record_id', str(uuid.uuid4())),
        'generation_timestamp': timestamp,
        'ghcid_history': history,
        'location_resolution': {
            'method': 'CITY_LOOKUP',
            'city_code': city_code,
            'city_name': city_name,
            'region_code': region,
            'country_code': country,
            'resolution_date': timestamp
        }
    }

    # Write updated file with new name
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Delete old file if different name
    if new_filepath != filepath:
        os.remove(filepath)
        print(f" Created: {new_filename}")
        print(f" Deleted: {filepath.name}")

    return True
|
|
|
|
def main(custodian_dir=None, dry_run: bool = False) -> None:
    """Fix the known list of mis-keyed ``NL-XX-XXX-*`` custodian files.

    Each listed file is looked up in ``custodian_dir`` and passed to
    ``fix_xxx_file`` together with its city key. Missing files are reported
    and skipped. A summary line with the number of successful fixes out of
    the number of listed files is printed at the end.

    Args:
        custodian_dir: Directory holding the custodian YAML files (``str`` or
            ``Path``). Defaults to the original hard-coded location when
            omitted.
        dry_run: Forwarded to ``fix_xxx_file``; when true the planned renames
            are printed but no file is written or deleted.
    """
    if custodian_dir is None:
        custodian_dir = '/Users/kempersc/apps/glam/data/custodian'
    custodian_dir = Path(custodian_dir)

    # Files to fix with their city keys
    files_to_fix = [
        ('NL-XX-XXX-L-NLA.yaml', 'canberra'),
        ('NL-XX-XXX-M-IFDA.yaml', 'sydney'),
        ('NL-XX-XXX-M-OOBM.yaml', 'dhaka'),
        ('NL-XX-XXX-M-MV-maasmechelen_village.yaml', 'maasmechelen'),
        ('NL-XX-XXX-A-MFIFA.yaml', 'gent'),
        ('NL-XX-XXX-R-EAAE.yaml', 'prague'),
        ('NL-XX-XXX-M-N.yaml', 'rhineland-palatinate'),
        ('NL-XX-XXX-M-DSDGFMP.yaml', 'bonn'),
        ('NL-XX-XXX-M-ZFK.yaml', 'münchen'),
        ('NL-XX-XXX-M-SGILUKM.yaml', 'münchen'),
        ('NL-XX-XXX-M-CM-caen_memorial.yaml', 'normandy'),
        ('NL-XX-XXX-M-AH.yaml', 'essex'),
        ('NL-XX-XXX-M-AHLU.yaml', 'pulborough'),
        ('NL-XX-XXX-M-CA.yaml', 'windsor'),
        ('NL-XX-XXX-L-DPC.yaml', 'glasgow'),
        ('NL-XX-XXX-M-RZWMNS.yaml', 'medan'),
        ('NL-XX-XXX-M-PM.yaml', 'aceh'),
        ('NL-XX-XXX-M-MT-museum_of_toys.yaml', 'tangerang'),
        ('NL-XX-XXX-M-MP-museum_pendet.yaml', 'bali'),
        ('NL-XX-XXX-M-TPM.yaml', 'surakarta'),
        ('NL-XX-XXX-M-AMCP.yaml', 'agats'),
        ('NL-XX-XXX-M-MA-museum_airlangga.yaml', 'kediri'),
        ('NL-XX-XXX-M-MM-museum_muhammadiyah.yaml', 'bantul'),
        ('NL-XX-XXX-M-SNSM.yaml', 'bali'),
        ('NL-XX-XXX-M-WSM.yaml', 'pererenan'),
        ('NL-XX-XXX-M-BFYHB.yaml', 'makassar'),
        ('NL-XX-XXX-M-MR-museum_rudana.yaml', 'gianyar'),
        ('NL-XX-XXX-M-APAMR.yaml', 'denpasar'),
        ('NL-XX-XXX-M-YG.yaml', 'aceh'),
        ('NL-XX-XXX-M-AMRUB.yaml', 'ubud'),
        ('NL-XX-XXX-M-MTA.yaml', 'aceh'),
        ('NL-XX-XXX-M-MB.yaml', 'depok'),
        ('NL-XX-XXX-M-MPL.yaml', 'karangasem'),
        ('NL-XX-XXX-M-MMT.yaml', 'sidoarjo'),
        ('NL-XX-XXX-M-AIACFPAAF.yaml', 'istanbul'),
        ('NL-XX-XXX-R-AFRICAA.yaml', 'amelia'),
        ('NL-XX-XXX-M-DEH.yaml', 'pretoria'),
    ]

    print("=" * 80)
    print("FIXING NON-DUTCH INSTITUTION COUNTRY CODES")
    print("=" * 80)

    success_count = 0
    for filename, city_key in files_to_fix:
        filepath = custodian_dir / filename
        if not filepath.exists():
            # Skipped files still count towards the total in the summary.
            print(f"\n⚠ File not found: {filename}")
            continue

        print(f"\n=== Processing: {filename} ===")
        if fix_xxx_file(filepath, city_key, dry_run=dry_run):
            success_count += 1

    print("\n" + "=" * 80)
    print(f"SUMMARY: Fixed {success_count}/{len(files_to_fix)} files")
    print("=" * 80)


if __name__ == '__main__':
    main()
|