glam/scripts/fix_ghcid_location_mismatches.py
kempersc 933deb337c refactor(scripts): generalize GHCID location fixer for all institution types
- Add --type/-t flag to specify institution type (A, G, H, I, L, M, N, O, R, S, T, U, X, ALL)
- Default still Type I (Intangible Heritage) for backward compatibility
- Skip PENDING files that have no location data
- Update help text with all supported types
2026-01-09 11:54:28 +01:00

591 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Fix GHCID location mismatches for heritage custodian files.
This script:
1. Identifies files where GHCID location component doesn't match actual location in locations[] array
2. Looks up correct GeoNames data for the actual city
3. Generates proper GHCID with all identifier formats (UUID v5, UUID v8, numeric)
4. Updates all relevant fields in the YAML file
5. Renames files to match new GHCID
Usage:
python scripts/fix_ghcid_location_mismatches.py --dry-run # Preview Type I changes
python scripts/fix_ghcid_location_mismatches.py --type M --dry-run # Preview Museum changes
python scripts/fix_ghcid_location_mismatches.py --type A # Fix Archive files
python scripts/fix_ghcid_location_mismatches.py --type ALL --dry-run # Preview ALL types
Supported types: A (Archive), G (Gallery), H (Holy Sites), I (Intangible), L (Library),
M (Museum), N (NGO), O (Official), R (Research), S (Society),
T (Taste/Smell), U (Unknown), X (Mixed), ALL
"""
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

import yaml
# GHCID namespace UUID used for all UUID v5 derivations.
# This is the RFC 4122 DNS namespace (the same value as uuid.NAMESPACE_DNS).
# It must never change once GHCIDs have been issued, or every derived UUID
# would change with it.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Netherlands GeoNames admin1 code -> ISO 3166-2:NL province code mapping.
# NOTE(review): the gaps ('08', '12'-'14') appear to be codes GeoNames does
# not assign to current NL provinces - verify against the GeoNames admin1 table.
ADMIN1_TO_PROVINCE = {
    '01': 'DR',  # Drenthe
    '02': 'FR',  # Friesland
    '03': 'GE',  # Gelderland
    '04': 'GR',  # Groningen
    '05': 'LI',  # Limburg
    '06': 'NB',  # Noord-Brabant
    '07': 'NH',  # Noord-Holland
    '09': 'UT',  # Utrecht
    '10': 'ZE',  # Zeeland
    '11': 'ZH',  # Zuid-Holland
    '15': 'OV',  # Overijssel
    '16': 'FL',  # Flevoland
}
# Hand-picked 3-letter codes for city names whose automatically derived code
# would be wrong or ambiguous (article prefixes, saint-names, multi-word names).
SPECIAL_CITY_CODES = {
    "'s-Hertogenbosch": "SHE",
    "s-Hertogenbosch": "SHE",
    "'s-Gravenhage": "SGR",
    "Den Haag": "DHA",
    "The Hague": "DHA",
    "Den Burg": "DBU",
    "Den Helder": "DHE",
    "De Kwakel": "DKW",
    "Sint Nicolaasga": "SNI",
    "Sint Jansklooster": "SJK",
    "Sint-Oedenrode": "SOR",
    "Wijk bij Duurstede": "WBD",
    "Alphen aan den Rijn": "AAR",
    "Bergen op Zoom": "BOZ",
    "Tweede Exloërmond": "TEX",
    "Budel-Schoot": "BUS",
    "Vierlingsbeek": "VIE",
    "Leenderstrijp": "LEE",
    "Sinoutskerke": "SIN",
    "Espelo": "ESP",
    "Denekamp": "DEN",
    "Haarzuilens": "HAA",
    "Nootdorp": "NOO",
    "Ameland": "AME",
    "Essen": "ESS",
    "Didam": "DID",
    "Venhuizen": "VEN",
    "Bleskensgraaf": "BLE",
    "Noordwijk": "NOO",
    "Ootmarsum": "OOT",
    "Zwaag": "ZWA",
    "Diepenheim": "DIE",
    "Wierden": "WIE",
    "Zierikzee": "ZIE",
    "Heemskerk": "HEE",
    "Zundert": "ZUN",
}

# GeoNames feature codes accepted as real settlements (excludes neighborhoods).
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')


def generate_city_code(city_name: str) -> str:
    """Derive the 3-letter GHCID city code for *city_name*.

    Resolution order:
      1. Explicit override in SPECIAL_CITY_CODES.
      2. Single word: its first three letters, uppercased.
      3. Dutch article ("de"/"het"/"den") + word: article initial plus the
         first two letters of the following word.
      4. Other multi-word names: initials of the first three words.

    NOTE(review): two-word names without an article yield only two letters
    (e.g. "Etten-Leur" -> "EL"); confirm whether that is intended.
    """
    override = SPECIAL_CITY_CODES.get(city_name)
    if override is not None:
        return override

    # Drop apostrophes and treat dashes as word separators.
    cleaned = city_name.replace("'", "").replace("-", " ")
    tokens = cleaned.split()

    if not tokens:
        # Empty / whitespace-only input degrades to an empty code.
        return cleaned[:3].upper()
    if len(tokens) == 1:
        return tokens[0][:3].upper()

    # Article + main word. ("'s" can never match here because apostrophes
    # were stripped above - kept for parity with the original list.)
    if tokens[0].lower() in ('de', 'het', 'den', "'s"):
        return (tokens[0][0] + tokens[1][:2]).upper()

    # General multi-word name: initials of up to the first three words.
    return ''.join(token[0] for token in tokens[:3]).upper()
def generate_ghcid_identifiers(ghcid_string: str) -> dict:
    """Generate all derived identifier formats for a GHCID string.

    Args:
        ghcid_string: Canonical GHCID, e.g. "NL-ZH-DEL-I-FOO".

    Returns:
        dict with string values:
            'ghcid_uuid':        UUID v5 (SHA-1 over GHCID_NAMESPACE) - PRIMARY.
            'ghcid_uuid_sha256': UUID built from SHA-256 with version nibble 8.
            'ghcid_numeric':     unsigned 64-bit int from the first 8 SHA-256 bytes.
    """
    # Compute the SHA-256 digest once; both the "v8" UUID and the numeric id
    # derive from it (the original computed it twice).
    digest = hashlib.sha256(ghcid_string.encode()).digest()

    # UUID v5 (SHA-1, RFC 4122) - PRIMARY identifier.
    uuid_v5 = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    # UUID "v8": first 16 digest bytes with version/variant bits forced.
    v8_bytes = bytearray(digest[:16])
    v8_bytes[6] = (v8_bytes[6] & 0x0F) | 0x80  # version nibble -> 8
    v8_bytes[8] = (v8_bytes[8] & 0x3F) | 0x80  # RFC 4122 variant
    uuid_sha256 = uuid.UUID(bytes=bytes(v8_bytes))

    # Numeric identifier: big-endian 64-bit integer from the first 8 bytes.
    numeric = int.from_bytes(digest[:8], 'big')

    return {
        'ghcid_uuid': str(uuid_v5),
        'ghcid_uuid_sha256': str(uuid_sha256),
        'ghcid_numeric': str(numeric),
    }
# Additional candidate names tried by lookup_city_geonames() after the literal
# name and its normalized variant. Keys are names as they appear in the YAML
# locations[] array; values are fallback names tried in the listed order.
CITY_NAME_ALIASES = {
    "Den Haag": ["The Hague", "'s-Gravenhage", "s-Gravenhage"],
    "The Hague": ["Den Haag", "'s-Gravenhage", "s-Gravenhage"],
    "'s-Gravenhage": ["The Hague", "Den Haag", "s-Gravenhage"],
    "'s-Hertogenbosch": ["s-Hertogenbosch", "Hertogenbosch", "Den Bosch"],
    "Ameland": ["Hollum", "Nes"],  # Main villages on Ameland
}
def lookup_city_geonames(db_path: str, city_name: str, country_code: str = 'NL') -> Optional[dict]:
    """Look up a settlement in the GeoNames SQLite database.

    Tries exact (case-insensitive) matches for the literal name, a normalized
    variant, and any CITY_NAME_ALIASES entries; falls back to substring (LIKE)
    matching as a last resort. Only feature codes in VALID_FEATURE_CODES are
    considered, and ties are broken by population (largest wins).

    Args:
        db_path: Path to the GeoNames SQLite database (table ``cities``).
        city_name: City name as found in the YAML ``locations`` array.
        country_code: ISO country code filter (default 'NL').

    Returns:
        dict with geonames_id, name, ascii_name, admin1_code, admin1_name,
        province_code (ISO 3166-2, 'XX' when unmapped), latitude, longitude,
        population and feature_code; or None when nothing matched.
    """
    # Normalized variant: "'s-Gravenhage" -> "s-Gravenhage", drop other apostrophes.
    search_name = city_name.replace("'s-", "s-").replace("'", "")
    # Candidate names in priority order, de-duplicated (city_name often equals
    # search_name, which previously caused redundant identical queries).
    names_to_try = list(dict.fromkeys(
        [city_name, search_name] + CITY_NAME_ALIASES.get(city_name, [])
    ))

    placeholders = ', '.join(['?'] * len(VALID_FEATURE_CODES))

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        def run_query(operator: str, pattern: str):
            """Run one lookup using '=' (exact) or 'LIKE' (fuzzy) matching."""
            cursor.execute(f"""
                SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
                       latitude, longitude, population, feature_code
                FROM cities
                WHERE country_code = ?
                  AND (LOWER(name) {operator} LOWER(?) OR LOWER(ascii_name) {operator} LOWER(?))
                  AND feature_code IN ({placeholders})
                ORDER BY population DESC
                LIMIT 1
            """, (country_code, pattern, pattern) + VALID_FEATURE_CODES)
            return cursor.fetchone()

        row = None
        # Pass 1: exact matches, in candidate priority order.
        for name_variant in names_to_try:
            row = run_query('=', name_variant)
            if row:
                break
        if not row:
            # Pass 2: substring matches as a last resort.
            # NOTE(review): '%name%' can hit unrelated places that merely
            # contain the name - acceptable as a fallback, but verify results.
            for name_variant in names_to_try:
                row = run_query('LIKE', f"%{name_variant}%")
                if row:
                    break
    finally:
        # Always release the connection, even if a query raises
        # (the original leaked the handle on error).
        conn.close()

    if not row:
        return None
    admin1_code = row[3] or ''
    return {
        'geonames_id': row[0],
        'name': row[1],
        'ascii_name': row[2],
        'admin1_code': admin1_code,
        'admin1_name': row[4],
        'province_code': ADMIN1_TO_PROVINCE.get(admin1_code, 'XX'),
        'latitude': row[5],
        'longitude': row[6],
        'population': row[7],
        'feature_code': row[8],
    }
def extract_locations_city(data: dict) -> Optional[str]:
    """Return the city of the first ``locations[]`` entry, or None.

    None is returned when the key is missing, the value is not a non-empty
    list, or the first entry carries no ``city`` key.
    """
    entries = data.get('locations', [])
    if not isinstance(entries, list) or not entries:
        return None
    return entries[0].get('city')
def extract_abbreviation(ghcid: str) -> str:
    """Return the abbreviation part of a GHCID (everything after the 4th dash).

    The abbreviation may itself contain dashes; an empty string is returned
    when the GHCID has fewer than five dash-separated components.
    """
    components = ghcid.split('-')
    return '-'.join(components[4:]) if len(components) >= 5 else ''
def parse_ghcid(ghcid: str) -> Tuple[str, str, str, str, str]:
    """Split a GHCID into (country, region, city, institution type, abbreviation).

    The abbreviation keeps any internal dashes. Five empty strings are
    returned when the GHCID has fewer than five components.
    """
    pieces = ghcid.split('-')
    if len(pieces) < 5:
        return '', '', '', '', ''
    return pieces[0], pieces[1], pieces[2], pieces[3], '-'.join(pieces[4:])
def build_ghcid(country: str, region: str, city_code: str, inst_type: str, abbrev: str) -> str:
    """Assemble a GHCID string from its five components, dash-separated."""
    return '-'.join((country, region, city_code, inst_type, abbrev))
def update_yaml_ghcid(data: dict, new_ghcid: str, old_ghcid: str, geonames_data: dict,
                      timestamp: str) -> dict:
    """Update all GHCID-related fields in YAML data.

    Mutates ``data`` in place (and also returns it): rewrites the ``ghcid``
    section, closes out and appends ``ghcid_history``, updates GHCID entries
    in the ``identifiers`` array, syncs the ``location`` section with the
    GeoNames result, and appends a provenance note.

    Args:
        data: Parsed YAML document for one custodian file.
        new_ghcid: Corrected GHCID string.
        old_ghcid: GHCID being replaced (its open history entry is closed).
        geonames_data: Result dict from lookup_city_geonames().
        timestamp: UTC timestamp string applied to all updated fields.

    Returns:
        The same (mutated) ``data`` dict.
    """
    identifiers = generate_ghcid_identifiers(new_ghcid)
    # Update ghcid section
    if 'ghcid' not in data:
        data['ghcid'] = {}
    ghcid_section = data['ghcid']
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = identifiers['ghcid_uuid']
    ghcid_section['ghcid_uuid_sha256'] = identifiers['ghcid_uuid_sha256']
    # Stored as an int here, but as a string in the identifiers array below.
    ghcid_section['ghcid_numeric'] = int(identifiers['ghcid_numeric'])
    ghcid_section['generation_timestamp'] = timestamp
    # Preserve record_id if it exists
    # record_id should NOT change - it's the database primary key
    # Update location_resolution
    ghcid_section['location_resolution'] = {
        'method': 'GEONAMES_LOOKUP',
        'geonames_id': geonames_data['geonames_id'],
        'geonames_name': geonames_data['name'],
        'feature_code': geonames_data['feature_code'],
        'population': geonames_data['population'],
        'admin1_code': geonames_data['admin1_code'],
        'region_code': geonames_data['province_code'],
        # NOTE(review): hardcoded - the script currently targets NL files only.
        'country_code': 'NL',
    }
    ghcid_section['geonames_id'] = geonames_data['geonames_id']
    # Update ghcid_history
    if 'ghcid_history' not in ghcid_section:
        ghcid_section['ghcid_history'] = []
    # Close out the old GHCID: mark any still-open matching entry as ended now.
    for entry in ghcid_section['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and entry.get('valid_to') is None:
            entry['valid_to'] = timestamp
    # Add new GHCID entry
    ghcid_section['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': int(identifiers['ghcid_numeric']),
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"GHCID corrected: location mismatch fix from {old_ghcid} to {new_ghcid}",
    })
    # Update identifiers array (only entries whose scheme is GHCID-derived).
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid
            elif identifier.get('identifier_scheme') == 'GHCID_UUID':
                identifier['identifier_value'] = identifiers['ghcid_uuid']
                identifier['identifier_url'] = f"urn:uuid:{identifiers['ghcid_uuid']}"
            elif identifier.get('identifier_scheme') == 'GHCID_UUID_SHA256':
                identifier['identifier_value'] = identifiers['ghcid_uuid_sha256']
                identifier['identifier_url'] = f"urn:uuid:{identifiers['ghcid_uuid_sha256']}"
            elif identifier.get('identifier_scheme') == 'GHCID_NUMERIC':
                identifier['identifier_value'] = identifiers['ghcid_numeric']
    # Update location section to match locations array
    if 'location' in data:
        data['location']['city'] = geonames_data['name']
        data['location']['region_code'] = geonames_data['province_code']
        data['location']['geonames_id'] = geonames_data['geonames_id']
        data['location']['geonames_name'] = geonames_data['name']
        data['location']['feature_code'] = geonames_data['feature_code']
        # NOTE(review): a falsy check also skips latitude 0.0, not just None/missing.
        if geonames_data.get('latitude'):
            data['location']['latitude'] = geonames_data['latitude']
            data['location']['longitude'] = geonames_data['longitude']
        data['location']['normalization_timestamp'] = timestamp
        # Remove old coordinate provenance notes
        if 'note' in data['location']:
            del data['location']['note']
        if 'coordinate_provenance_removed' in data['location']:
            del data['location']['coordinate_provenance_removed']
    # Add provenance note
    if 'provenance' in data:
        if 'notes' not in data['provenance']:
            data['provenance']['notes'] = []
        if isinstance(data['provenance']['notes'], list):
            data['provenance']['notes'].append(
                f"GHCID location corrected via fix_ghcid_location_mismatches.py on {timestamp}: "
                f"{old_ghcid} -> {new_ghcid}"
            )
    return data
def find_mismatched_files(custodian_dir: Path, db_path: str, inst_type: str = 'I') -> list:
    """Find all files of given type with GHCID location mismatches.

    Args:
        custodian_dir: Path to custodian directory
        db_path: Path to GeoNames database
        inst_type: Institution type code (I, M, A, L, etc.) or 'ALL' for all types

    Returns:
        List of dicts with keys: filepath, old_ghcid, new_ghcid, actual_city,
        geonames_data (lookup_city_geonames result) and data (parsed YAML).
    """
    mismatches = []
    # Build glob pattern based on institution type
    if inst_type == 'ALL':
        pattern = 'NL-*-*.yaml'
    else:
        pattern = f'NL-*-{inst_type}-*.yaml'
    for filepath in sorted(custodian_dir.glob(pattern)):
        # The filename (without extension) IS the current GHCID.
        filename = filepath.stem
        current_ghcid = filename
        # Skip PENDING files (no location data)
        if 'PENDING' in current_ghcid:
            continue
        # Parse current GHCID
        country, region, city_code, file_inst_type, abbrev = parse_ghcid(current_ghcid)
        if not abbrev:
            # Malformed GHCID (fewer than five components) - nothing to fix.
            continue
        # Load YAML
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
            continue
        # Get actual city from locations array
        actual_city = extract_locations_city(data)
        if not actual_city:
            continue
        # Skip "Nederland" as it's for national organizations
        if actual_city.lower() == 'nederland':
            continue
        # Generate expected city code
        expected_city_code = generate_city_code(actual_city)
        # NOTE(review): only the CITY code is compared; a wrong region with a
        # correct city code is NOT detected (the original comment claimed
        # "city code or region is wrong") - confirm whether that is intended.
        if city_code != expected_city_code:
            # Look up correct GeoNames data
            geonames_data = lookup_city_geonames(db_path, actual_city)
            if geonames_data:
                # Region comes from GeoNames; country and abbreviation are kept.
                new_ghcid = build_ghcid(
                    country,
                    geonames_data['province_code'],
                    expected_city_code,
                    file_inst_type,
                    abbrev
                )
                # Only add if the GHCID actually changes
                if new_ghcid != current_ghcid:
                    mismatches.append({
                        'filepath': filepath,
                        'old_ghcid': current_ghcid,
                        'new_ghcid': new_ghcid,
                        'actual_city': actual_city,
                        'geonames_data': geonames_data,
                        'data': data,
                    })
            else:
                print(f"WARNING: Could not find GeoNames data for '{actual_city}' in {filepath}")
    return mismatches
# Institution type codes accepted by --type (plus 'ALL' to scan every type).
# NOTE(review): B, C, D, E, F and P are accepted here but not described in the
# module docstring or --type help text - confirm they are intentional.
VALID_INST_TYPES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'X', 'ALL']
def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) -> bool:
    """Return True if renaming to ``new_ghcid`` would clobber a DIFFERENT file.

    Renaming a file onto itself (same path) is not a collision.
    """
    candidate = custodian_dir / f"{new_ghcid}.yaml"
    if not candidate.exists():
        return False
    return candidate != old_filepath
def main():
    """CLI entry point: scan for mismatches, preview/apply fixes, print a summary.

    Returns:
        Process exit code: 1 when required paths are missing or any file
        failed to update; 0 otherwise (including "nothing to do").
    """
    parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for heritage custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without applying')
    parser.add_argument('--type', '-t', default='I', choices=VALID_INST_TYPES,
                        help='Institution type code: A (Archive), G (Gallery), H (Holy Sites), '
                             'I (Intangible, default), L (Library), M (Museum), N (NGO), O (Official), '
                             'R (Research), S (Society), T (Taste/Smell), U (Unknown), X (Mixed), '
                             'ALL (all types)')
    parser.add_argument('--custodian-dir', default='data/custodian', help='Path to custodian directory')
    parser.add_argument('--geonames-db', default='data/reference/geonames.db', help='Path to GeoNames database')
    args = parser.parse_args()
    inst_type = args.type
    # Resolve paths relative to the repository root (this script is expected
    # to live one directory below it, e.g. <root>/scripts/).
    script_dir = Path(__file__).parent.parent
    custodian_dir = script_dir / args.custodian_dir
    db_path = script_dir / args.geonames_db
    if not custodian_dir.exists():
        print(f"ERROR: Custodian directory not found: {custodian_dir}")
        return 1
    if not db_path.exists():
        print(f"ERROR: GeoNames database not found: {db_path}")
        return 1
    print("=" * 80)
    type_name = 'ALL types' if inst_type == 'ALL' else f'Type {inst_type}'
    print(f"GHCID Location Mismatch Fixer for {type_name} Heritage Custodians")
    print("=" * 80)
    print(f"Custodian directory: {custodian_dir}")
    print(f"GeoNames database: {db_path}")
    print(f"Institution type: {inst_type}")
    print(f"Mode: {'DRY RUN (preview only)' if args.dry_run else 'LIVE (applying changes)'}")
    print()
    # Find mismatches
    print("Scanning for location mismatches...")
    mismatches = find_mismatched_files(custodian_dir, str(db_path), inst_type)
    print(f"Found {len(mismatches)} files with GHCID location mismatches")
    print()
    if not mismatches:
        print("No mismatches found. Exiting.")
        return 0
    # One shared UTC timestamp so every file touched in this run agrees.
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    # Process each mismatch
    fixed_count = 0
    skipped_count = 0
    errors = []
    for mismatch in mismatches:
        old_ghcid = mismatch['old_ghcid']
        new_ghcid = mismatch['new_ghcid']
        actual_city = mismatch['actual_city']
        filepath = mismatch['filepath']
        geonames_data = mismatch['geonames_data']
        data = mismatch['data']
        print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing: {old_ghcid}")
        print(f" Actual city: {actual_city}")
        print(f" Province: {geonames_data['province_code']} ({geonames_data.get('admin1_name', 'Unknown')})")
        print(f" New GHCID: {new_ghcid}")
        # Never overwrite a different existing file with the new name.
        if check_collision(custodian_dir, new_ghcid, filepath):
            print(f" SKIPPED: Collision - {new_ghcid}.yaml already exists")
            skipped_count += 1
            print()
            continue
        # Generate new identifiers for display
        identifiers = generate_ghcid_identifiers(new_ghcid)
        print(f" UUID v5: {identifiers['ghcid_uuid']}")
        print(f" UUID v8: {identifiers['ghcid_uuid_sha256']}")
        print(f" Numeric: {identifiers['ghcid_numeric']}")
        if args.dry_run:
            # Dry run: count the file as "would fix" and touch nothing.
            print(f" Would rename: {filepath.name} -> {new_ghcid}.yaml")
            print()
            fixed_count += 1
            continue
        try:
            # Update YAML data (mutates and returns the same dict).
            updated_data = update_yaml_ghcid(
                data, new_ghcid, old_ghcid, geonames_data, timestamp
            )
            # Write the new file first, then remove the old one, so a failure
            # between the two never loses data.
            new_filepath = filepath.parent / f"{new_ghcid}.yaml"
            with open(new_filepath, 'w', encoding='utf-8') as f:
                yaml.dump(updated_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            # Remove old file if different from new
            if filepath != new_filepath:
                filepath.unlink()
                print(f" Renamed: {filepath.name} -> {new_filepath.name}")
            else:
                print(f" Updated: {filepath.name}")
            fixed_count += 1
        except Exception as e:
            error_msg = f"Error processing {filepath}: {e}"
            print(f" ERROR: {e}")
            errors.append(error_msg)
        print()
    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total mismatches found: {len(mismatches)}")
    print(f"Successfully {'would fix' if args.dry_run else 'fixed'}: {fixed_count}")
    print(f"Skipped (collisions): {skipped_count}")
    print(f"Errors: {len(errors)}")
    if errors:
        print("\nErrors:")
        for error in errors:
            print(f" - {error}")
    if args.dry_run:
        print("\nThis was a dry run. Run without --dry-run to apply changes.")
    return 0 if not errors else 1
if __name__ == '__main__':
    # Use sys.exit rather than the builtin exit(): the builtin is installed by
    # site.py as an interactive convenience and is absent under `python -S`
    # and in frozen builds; sys.exit is the supported API for exit codes.
    sys.exit(main())