#!/usr/bin/env python3
"""
Fix GHCID location mismatches for heritage custodian files.

This script:
1. Identifies files where GHCID location component doesn't match actual location
   in locations[] array
2. Looks up correct GeoNames data for the actual city
3. Generates proper GHCID with all identifier formats (UUID v5, UUID v8, numeric)
4. Updates all relevant fields in the YAML file
5. Renames files to match new GHCID

Usage:
    python scripts/fix_ghcid_location_mismatches.py --dry-run            # Preview Type I changes
    python scripts/fix_ghcid_location_mismatches.py --type M --dry-run   # Preview Museum changes
    python scripts/fix_ghcid_location_mismatches.py --type A             # Fix Archive files
    python scripts/fix_ghcid_location_mismatches.py --type ALL --dry-run # Preview ALL types

Supported types: A (Archive), G (Gallery), H (Holy Sites), I (Intangible),
L (Library), M (Museum), N (NGO), O (Official), R (Research), S (Society),
T (Taste/Smell), U (Unknown), X (Mixed), ALL
"""

import argparse
import hashlib
import sqlite3
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

import yaml

# GHCID namespace UUID (RFC 4122 DNS namespace) — used as the uuid5 namespace
# so GHCID strings hash to stable, reproducible UUIDs.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Netherlands admin1 code to ISO 3166-2 province code mapping
ADMIN1_TO_PROVINCE = {
    '01': 'DR',  # Drenthe
    '02': 'FR',  # Friesland
    '03': 'GE',  # Gelderland
    '04': 'GR',  # Groningen
    '05': 'LI',  # Limburg
    '06': 'NB',  # Noord-Brabant
    '07': 'NH',  # Noord-Holland
    '09': 'UT',  # Utrecht
    '10': 'ZE',  # Zeeland
    '11': 'ZH',  # Zuid-Holland
    '15': 'OV',  # Overijssel
    '16': 'FL',  # Flevoland
}

# Special city name mappings for 3-letter codes.  These override the generic
# algorithm in generate_city_code() for cities whose derived code would be
# ambiguous or wrong.
SPECIAL_CITY_CODES = {
    "'s-Hertogenbosch": "SHE",
    "s-Hertogenbosch": "SHE",
    "'s-Gravenhage": "SGR",
    "Den Haag": "DHA",
    "The Hague": "DHA",
    "Den Burg": "DBU",
    "Den Helder": "DHE",
    "De Kwakel": "DKW",
    "Sint Nicolaasga": "SNI",
    "Sint Jansklooster": "SJK",
    "Sint-Oedenrode": "SOR",
    "Wijk bij Duurstede": "WBD",
    "Alphen aan den Rijn": "AAR",
    "Bergen op Zoom": "BOZ",
    "Tweede Exloërmond": "TEX",
    "Budel-Schoot": "BUS",
    "Vierlingsbeek": "VIE",
    "Leenderstrijp": "LEE",
    "Sinoutskerke": "SIN",
    "Espelo": "ESP",
    "Denekamp": "DEN",
    "Haarzuilens": "HAA",
    "Nootdorp": "NOO",
    "Ameland": "AME",
    "Essen": "ESS",
    "Didam": "DID",
    "Venhuizen": "VEN",
    "Bleskensgraaf": "BLE",
    "Noordwijk": "NOO",
    "Ootmarsum": "OOT",
    "Zwaag": "ZWA",
    "Diepenheim": "DIE",
    "Wierden": "WIE",
    "Zierikzee": "ZIE",
    "Heemskerk": "HEE",
    "Zundert": "ZUN",
}

# Valid feature codes for settlements (not neighborhoods)
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')


def generate_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name.

    Resolution order: explicit SPECIAL_CITY_CODES mapping, then a generic
    scheme (single word -> first 3 letters; article + word -> article initial
    plus 2 letters; multi-word -> up to 3 initials).  Always uppercase.
    """
    # Check special mappings first
    if city_name in SPECIAL_CITY_CODES:
        return SPECIAL_CITY_CODES[city_name]

    # Normalize: drop apostrophes, treat hyphens as word separators
    name = city_name
    name = name.replace("'", "").replace("-", " ")

    words = name.split()
    if len(words) == 1:
        # Single word: first 3 letters
        return words[0][:3].upper()
    elif len(words) >= 2:
        # Check for Dutch articles at start.  NOTE: apostrophes were already
        # stripped above, so a leading "'s" arrives here as "s" and is not
        # treated as an article ("'s-..." cities are covered by the special
        # mapping table instead).
        dutch_articles = ['de', 'het', 'den']
        if words[0].lower() in dutch_articles:
            # Article + main word: take article initial + 2 from main word
            return (words[0][0] + words[1][:2]).upper()
        else:
            # Multi-word: take initials (up to 3)
            initials = ''.join(w[0] for w in words[:3])
            return initials.upper()
    # Fallback for empty/whitespace-only names
    return name[:3].upper()


def generate_ghcid_identifiers(ghcid_string: str) -> dict:
    """Generate all GHCID identifier formats for a GHCID string.

    Returns a dict of strings with keys:
      ghcid_uuid        -- UUID v5 (SHA-1, namespaced) -- PRIMARY identifier
      ghcid_uuid_sha256 -- UUID with v8 version/variant bits set over the
                           first 16 bytes of the SHA-256 digest
      ghcid_numeric     -- 64-bit integer from the first 8 SHA-256 bytes
    """
    # UUID v5 (SHA-1) - PRIMARY
    uuid_v5 = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    # UUID v8 (SHA-256) - Secondary
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()[:16]
    sha256_hash = bytearray(sha256_hash)
    sha256_hash[6] = (sha256_hash[6] & 0x0F) | 0x80  # Version 8
    sha256_hash[8] = (sha256_hash[8] & 0x3F) | 0x80  # Variant
    uuid_sha256 = uuid.UUID(bytes=bytes(sha256_hash))

    # Numeric (64-bit from SHA-256)
    full_hash = hashlib.sha256(ghcid_string.encode()).digest()
    numeric = int.from_bytes(full_hash[:8], 'big')

    return {
        'ghcid_uuid': str(uuid_v5),
        'ghcid_uuid_sha256': str(uuid_sha256),
        'ghcid_numeric': str(numeric),
    }


# City name aliases for GeoNames lookup
CITY_NAME_ALIASES = {
    "Den Haag": ["The Hague", "'s-Gravenhage", "s-Gravenhage"],
    "The Hague": ["Den Haag", "'s-Gravenhage", "s-Gravenhage"],
    "'s-Gravenhage": ["The Hague", "Den Haag", "s-Gravenhage"],
    "'s-Hertogenbosch": ["s-Hertogenbosch", "Hertogenbosch", "Den Bosch"],
    "Ameland": ["Hollum", "Nes"],  # Main villages on Ameland
}


def lookup_city_geonames(db_path: str, city_name: str, country_code: str = 'NL') -> Optional[dict]:
    """Look up city in GeoNames database.

    Tries exact (case-insensitive) name/ascii_name matches for the city and
    its known aliases, then falls back to a LIKE substring match.  Among
    candidates the highest-population settlement wins.  Returns a dict with
    geonames/admin/coordinate fields, or None if nothing matched.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Normalize city name for search
        search_name = city_name.replace("'s-", "s-").replace("'", "")

        # Build list of names to try
        names_to_try = [city_name, search_name]
        if city_name in CITY_NAME_ALIASES:
            names_to_try.extend(CITY_NAME_ALIASES[city_name])

        # Try each name variant
        row = None
        for name_variant in names_to_try:
            # Try exact match first
            cursor.execute("""
                SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
                       latitude, longitude, population, feature_code
                FROM cities
                WHERE country_code = ?
                AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
                AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
                ORDER BY population DESC
                LIMIT 1
            """, (country_code, name_variant, name_variant) + VALID_FEATURE_CODES)
            row = cursor.fetchone()
            if row:
                break

        if not row:
            # Try fuzzy match with LIKE as last resort
            for name_variant in names_to_try:
                cursor.execute("""
                    SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
                           latitude, longitude, population, feature_code
                    FROM cities
                    WHERE country_code = ?
                    AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
                    AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
                    ORDER BY population DESC
                    LIMIT 1
                """, (country_code, f"%{name_variant}%", f"%{name_variant}%") + VALID_FEATURE_CODES)
                row = cursor.fetchone()
                if row:
                    break
    finally:
        # Always release the connection, even if a query raises.
        conn.close()

    if row:
        admin1_code = row[3] or ''
        province_code = ADMIN1_TO_PROVINCE.get(admin1_code, 'XX')
        return {
            'geonames_id': row[0],
            'name': row[1],
            'ascii_name': row[2],
            'admin1_code': admin1_code,
            'admin1_name': row[4],
            'province_code': province_code,
            'latitude': row[5],
            'longitude': row[6],
            'population': row[7],
            'feature_code': row[8],
        }
    return None


def extract_locations_city(data: dict) -> Optional[str]:
    """Extract city from the first entry of the locations array in YAML data."""
    locations = data.get('locations', [])
    if locations and isinstance(locations, list) and len(locations) > 0:
        return locations[0].get('city')
    return None


def extract_abbreviation(ghcid: str) -> str:
    """Extract abbreviation from GHCID (everything after the 4th component)."""
    parts = ghcid.split('-')
    if len(parts) >= 5:
        return '-'.join(parts[4:])
    return ''


def parse_ghcid(ghcid: str) -> Tuple[str, str, str, str, str]:
    """Parse GHCID into (country, region, city, inst_type, abbrev).

    The abbreviation may itself contain hyphens, so it is everything after
    the fourth component.  Returns five empty strings on malformed input.
    """
    parts = ghcid.split('-')
    if len(parts) >= 5:
        country = parts[0]
        region = parts[1]
        city = parts[2]
        inst_type = parts[3]
        abbrev = '-'.join(parts[4:])
        return country, region, city, inst_type, abbrev
    return '', '', '', '', ''


def build_ghcid(country: str, region: str, city_code: str, inst_type: str, abbrev: str) -> str:
    """Build GHCID from components."""
    return f"{country}-{region}-{city_code}-{inst_type}-{abbrev}"


def update_yaml_ghcid(data: dict, new_ghcid: str, old_ghcid: str,
                      geonames_data: dict, timestamp: str) -> dict:
    """Update all GHCID-related fields in YAML data (in place; also returned).

    Rewrites the ghcid section (current value, derived identifiers, GeoNames
    location resolution), closes the old ghcid_history entry and appends a
    new one, updates the identifiers array, syncs the location section to the
    GeoNames record, and appends a provenance note.
    """
    identifiers = generate_ghcid_identifiers(new_ghcid)

    # Update ghcid section
    if 'ghcid' not in data:
        data['ghcid'] = {}
    ghcid_section = data['ghcid']
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = identifiers['ghcid_uuid']
    ghcid_section['ghcid_uuid_sha256'] = identifiers['ghcid_uuid_sha256']
    ghcid_section['ghcid_numeric'] = int(identifiers['ghcid_numeric'])
    ghcid_section['generation_timestamp'] = timestamp
    # Preserve record_id if it exists
    # record_id should NOT change - it's the database primary key

    # Update location_resolution
    ghcid_section['location_resolution'] = {
        'method': 'GEONAMES_LOOKUP',
        'geonames_id': geonames_data['geonames_id'],
        'geonames_name': geonames_data['name'],
        'feature_code': geonames_data['feature_code'],
        'population': geonames_data['population'],
        'admin1_code': geonames_data['admin1_code'],
        'region_code': geonames_data['province_code'],
        'country_code': 'NL',
    }
    ghcid_section['geonames_id'] = geonames_data['geonames_id']

    # Update ghcid_history
    if 'ghcid_history' not in ghcid_section:
        ghcid_section['ghcid_history'] = []

    # Mark old GHCID as ended
    for entry in ghcid_section['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and entry.get('valid_to') is None:
            entry['valid_to'] = timestamp

    # Add new GHCID entry
    ghcid_section['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': int(identifiers['ghcid_numeric']),
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"GHCID corrected: location mismatch fix from {old_ghcid} to {new_ghcid}",
    })

    # Update identifiers array
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid
            elif identifier.get('identifier_scheme') == 'GHCID_UUID':
                identifier['identifier_value'] = identifiers['ghcid_uuid']
                identifier['identifier_url'] = f"urn:uuid:{identifiers['ghcid_uuid']}"
            elif identifier.get('identifier_scheme') == 'GHCID_UUID_SHA256':
                identifier['identifier_value'] = identifiers['ghcid_uuid_sha256']
                identifier['identifier_url'] = f"urn:uuid:{identifiers['ghcid_uuid_sha256']}"
            elif identifier.get('identifier_scheme') == 'GHCID_NUMERIC':
                # NOTE: stored as a string here but as int in the ghcid
                # section above — this mirrors the existing file layout.
                identifier['identifier_value'] = identifiers['ghcid_numeric']

    # Update location section to match locations array
    if 'location' in data:
        data['location']['city'] = geonames_data['name']
        data['location']['region_code'] = geonames_data['province_code']
        data['location']['geonames_id'] = geonames_data['geonames_id']
        data['location']['geonames_name'] = geonames_data['name']
        data['location']['feature_code'] = geonames_data['feature_code']
        if geonames_data.get('latitude'):
            data['location']['latitude'] = geonames_data['latitude']
            data['location']['longitude'] = geonames_data['longitude']
        data['location']['normalization_timestamp'] = timestamp
        # Remove old coordinate provenance notes
        if 'note' in data['location']:
            del data['location']['note']
        if 'coordinate_provenance_removed' in data['location']:
            del data['location']['coordinate_provenance_removed']

    # Add provenance note
    if 'provenance' in data:
        if 'notes' not in data['provenance']:
            data['provenance']['notes'] = []
        if isinstance(data['provenance']['notes'], list):
            data['provenance']['notes'].append(
                f"GHCID location corrected via fix_ghcid_location_mismatches.py on {timestamp}: "
                f"{old_ghcid} -> {new_ghcid}"
            )

    return data


def find_mismatched_files(custodian_dir: Path, db_path: str, inst_type: str = 'I') -> list:
    """Find all files of given type with GHCID location mismatches.

    Args:
        custodian_dir: Path to custodian directory
        db_path: Path to GeoNames database
        inst_type: Institution type code (I, M, A, L, etc.) or 'ALL' for all types

    Returns:
        List of dicts describing each mismatch (filepath, old/new GHCID,
        actual city, GeoNames record, and the loaded YAML data).
    """
    mismatches = []

    # Build glob pattern based on institution type
    if inst_type == 'ALL':
        pattern = 'NL-*-*.yaml'
    else:
        pattern = f'NL-*-{inst_type}-*.yaml'

    for filepath in sorted(custodian_dir.glob(pattern)):
        current_ghcid = filepath.stem

        # Skip PENDING files (no location data)
        if 'PENDING' in current_ghcid:
            continue

        # Parse current GHCID
        country, region, city_code, file_inst_type, abbrev = parse_ghcid(current_ghcid)
        if not abbrev:
            continue

        # Load YAML
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
            continue

        # Get actual city from locations array
        actual_city = extract_locations_city(data)
        if not actual_city:
            continue

        # Skip "Nederland" as it's for national organizations
        if actual_city.lower() == 'nederland':
            continue

        # Generate expected city code
        expected_city_code = generate_city_code(actual_city)

        # Check for a city-code mismatch.  NOTE(review): files whose city code
        # matches but whose region component is wrong are NOT detected here —
        # confirm whether region-only mismatches need handling.
        if city_code != expected_city_code:
            # Look up correct GeoNames data
            geonames_data = lookup_city_geonames(db_path, actual_city)
            if geonames_data:
                new_ghcid = build_ghcid(
                    country,
                    geonames_data['province_code'],
                    expected_city_code,
                    file_inst_type,
                    abbrev
                )
                # Only add if the GHCID actually changes
                if new_ghcid != current_ghcid:
                    mismatches.append({
                        'filepath': filepath,
                        'old_ghcid': current_ghcid,
                        'new_ghcid': new_ghcid,
                        'actual_city': actual_city,
                        'geonames_data': geonames_data,
                        'data': data,
                    })
            else:
                print(f"WARNING: Could not find GeoNames data for '{actual_city}' in {filepath}")

    return mismatches


# Valid institution type codes
VALID_INST_TYPES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'L', 'M',
                    'N', 'O', 'P', 'R', 'S', 'T', 'U', 'X', 'ALL']


def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) -> bool:
    """Check if the new GHCID would collide with an existing (different) file."""
    new_filepath = custodian_dir / f"{new_ghcid}.yaml"
    return new_filepath.exists() and new_filepath != old_filepath


def main():
    """CLI entry point: scan, report, and (unless --dry-run) fix mismatches.

    Returns 0 on success, 1 if paths are missing or any file failed to update.
    """
    parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for heritage custodian files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without applying')
    parser.add_argument('--type', '-t', default='I', choices=VALID_INST_TYPES,
                        help='Institution type code: A (Archive), G (Gallery), H (Holy Sites), '
                             'I (Intangible, default), L (Library), M (Museum), N (NGO), O (Official), '
                             'R (Research), S (Society), T (Taste/Smell), U (Unknown), X (Mixed), '
                             'ALL (all types)')
    parser.add_argument('--custodian-dir', default='data/custodian',
                        help='Path to custodian directory')
    parser.add_argument('--geonames-db', default='data/reference/geonames.db',
                        help='Path to GeoNames database')
    args = parser.parse_args()

    inst_type = args.type

    # Resolve paths relative to the repository root (parent of scripts/)
    script_dir = Path(__file__).parent.parent
    custodian_dir = script_dir / args.custodian_dir
    db_path = script_dir / args.geonames_db

    if not custodian_dir.exists():
        print(f"ERROR: Custodian directory not found: {custodian_dir}")
        return 1
    if not db_path.exists():
        print(f"ERROR: GeoNames database not found: {db_path}")
        return 1

    print("=" * 80)
    type_name = 'ALL types' if inst_type == 'ALL' else f'Type {inst_type}'
    print(f"GHCID Location Mismatch Fixer for {type_name} Heritage Custodians")
    print("=" * 80)
    print(f"Custodian directory: {custodian_dir}")
    print(f"GeoNames database: {db_path}")
    print(f"Institution type: {inst_type}")
    print(f"Mode: {'DRY RUN (preview only)' if args.dry_run else 'LIVE (applying changes)'}")
    print()

    # Find mismatches
    print("Scanning for location mismatches...")
    mismatches = find_mismatched_files(custodian_dir, str(db_path), inst_type)
    print(f"Found {len(mismatches)} files with GHCID location mismatches")
    print()

    if not mismatches:
        print("No mismatches found. Exiting.")
        return 0

    # Generate timestamp for all updates
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Process each mismatch
    fixed_count = 0
    skipped_count = 0
    errors = []

    for mismatch in mismatches:
        old_ghcid = mismatch['old_ghcid']
        new_ghcid = mismatch['new_ghcid']
        actual_city = mismatch['actual_city']
        filepath = mismatch['filepath']
        geonames_data = mismatch['geonames_data']
        data = mismatch['data']

        print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing: {old_ghcid}")
        print(f"  Actual city: {actual_city}")
        print(f"  Province: {geonames_data['province_code']} ({geonames_data.get('admin1_name', 'Unknown')})")
        print(f"  New GHCID: {new_ghcid}")

        # Check for collision
        if check_collision(custodian_dir, new_ghcid, filepath):
            print(f"  SKIPPED: Collision - {new_ghcid}.yaml already exists")
            skipped_count += 1
            print()
            continue

        # Generate new identifiers for display
        identifiers = generate_ghcid_identifiers(new_ghcid)
        print(f"  UUID v5: {identifiers['ghcid_uuid']}")
        print(f"  UUID v8: {identifiers['ghcid_uuid_sha256']}")
        print(f"  Numeric: {identifiers['ghcid_numeric']}")

        if args.dry_run:
            print(f"  Would rename: {filepath.name} -> {new_ghcid}.yaml")
            print()
            fixed_count += 1
            continue

        try:
            # Update YAML data
            updated_data = update_yaml_ghcid(
                data, new_ghcid, old_ghcid, geonames_data, timestamp
            )

            # Write updated YAML to new file
            new_filepath = filepath.parent / f"{new_ghcid}.yaml"
            with open(new_filepath, 'w', encoding='utf-8') as f:
                yaml.dump(updated_data, f, default_flow_style=False,
                          allow_unicode=True, sort_keys=False)

            # Remove old file if different from new
            if filepath != new_filepath:
                filepath.unlink()
                print(f"  Renamed: {filepath.name} -> {new_filepath.name}")
            else:
                print(f"  Updated: {filepath.name}")

            fixed_count += 1
        except Exception as e:
            error_msg = f"Error processing {filepath}: {e}"
            print(f"  ERROR: {e}")
            errors.append(error_msg)

        print()

    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total mismatches found: {len(mismatches)}")
    print(f"Successfully {'would fix' if args.dry_run else 'fixed'}: {fixed_count}")
    print(f"Skipped (collisions): {skipped_count}")
    print(f"Errors: {len(errors)}")
    if errors:
        print("\nErrors:")
        for error in errors:
            print(f"  - {error}")
    if args.dry_run:
        print("\nThis was a dry run. Run without --dry-run to apply changes.")

    return 0 if not errors else 1


if __name__ == '__main__':
    # sys.exit (not the site-provided exit()) so the return code propagates
    # reliably in all run modes.
    sys.exit(main())