# glam/scripts/enrich_nde_entries_ghcid.py
# Snapshot metadata (file-browser export): 2025-12-03 17:38:46 +01:00, 1219 lines, 46 KiB, Python

#!/usr/bin/env python3
"""
Enrich NDE Heritage Institution Entries with GHCID Persistent Identifiers.
This script:
1. Loads all YAML files from data/nde/enriched/entries/
2. Extracts location data (city, region, coordinates)
3. Generates base GHCIDs using NL-REGION-CITY-TYPE-ABBREV format
4. Detects collisions and applies First Batch rule (all get name suffixes)
5. Generates all 4 identifier formats:
- Human-readable GHCID string
- UUID v5 (SHA-1, RFC 4122 compliant) - PRIMARY
- UUID v8 (SHA-256, SOTA cryptographic strength) - Future-proof
- Numeric (64-bit integer for database PKs)
6. Adds GHCID fields to each entry
7. Generates collision statistics report
## GHCID Format
Base: NL-{Region}-{City}-{Type}-{Abbreviation}
With collision suffix: NL-{Region}-{City}-{Type}-{Abbreviation}-{name_suffix}
## Collision Resolution (First Batch Rule)
Since this is a batch import (all entries processed together), when multiple
institutions generate the same base GHCID:
- ALL colliding institutions receive native language name suffixes
- Name suffix: snake_case of institution name
Example:
- Two societies with NL-OV-ZWO-S-HK both become:
- NL-OV-ZWO-S-HK-historische_kring_zwolle
- NL-OV-ZWO-S-HK-heemkundige_kring_zwolle
Usage:
python scripts/enrich_nde_entries_ghcid.py [--dry-run]
Options:
--dry-run Preview changes without writing to files
"""
import argparse
import json
import re
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import (
GHCIDComponents,
GHCIDGenerator,
InstitutionType,
extract_abbreviation_from_name,
normalize_city_name,
)
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB
# Dutch province to ISO 3166-2 code mapping.
# Keys are lowercase, diacritics-stripped names (both Dutch and English
# spellings, with and without hyphens) as produced by get_region_code().
DUTCH_PROVINCE_CODES: Dict[str, str] = {
    # Standard names
    "drenthe": "DR",
    "flevoland": "FL",
    "friesland": "FR",
    "fryslan": "FR",
    "fryslân": "FR",
    "gelderland": "GE",
    "groningen": "GR",
    "limburg": "LI",
    "noord-brabant": "NB",
    "north brabant": "NB",
    "noord brabant": "NB",
    "noord-holland": "NH",
    "north holland": "NH",
    "noord holland": "NH",
    "overijssel": "OV",
    "utrecht": "UT",
    "zeeland": "ZE",
    "zuid-holland": "ZH",
    "south holland": "ZH",
    "zuid holland": "ZH",
}
# GeoNames admin1 code to ISO 3166-2 NL mapping
# Based on actual GeoNames database content (verified 2025-12-01)
GEONAMES_ADMIN1_TO_ISO_NL: Dict[str, str] = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland (NOT Flevoland!)
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant (North Brabant)
    "07": "NH",  # Noord-Holland (North Holland)
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland (South Holland)
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}
# GeoNames admin1 code to ISO 3166-2 BE mapping
# Belgium uses region codes as admin1
GEONAMES_ADMIN1_TO_ISO_BE: Dict[str, str] = {
    "BRU": "BRU",  # Brussels Capital Region
    "VLG": "VLG",  # Flanders (Vlaanderen)
    "WAL": "WAL",  # Wallonia (Wallonie)
}
# GeoNames admin1 code to ISO 3166-2 DE mapping (placeholder; lookups
# for DE currently fall back to the raw admin1 code or "00")
GEONAMES_ADMIN1_TO_ISO_DE: Dict[str, str] = {
    # German federal states would go here
}
# Combined mapping by country (keyed by ISO 3166-1 alpha-2 code)
GEONAMES_ADMIN1_TO_ISO: Dict[str, Dict[str, str]] = {
    "NL": GEONAMES_ADMIN1_TO_ISO_NL,
    "BE": GEONAMES_ADMIN1_TO_ISO_BE,
    "DE": GEONAMES_ADMIN1_TO_ISO_DE,
}
# Global GeoNames database instance (initialized lazily)
_geonames_db: Optional[GeoNamesDB] = None


def get_geonames_db() -> GeoNamesDB:
    """Return the shared GeoNamesDB instance, creating it on first use."""
    global _geonames_db
    if _geonames_db is None:
        root = Path(__file__).parent.parent
        _geonames_db = GeoNamesDB(
            root / "data" / "reference" / "geonames.db",
            enable_disambiguation=True,
        )
    return _geonames_db
def reverse_geocode_to_city(latitude: float, longitude: float, country_code: str = "NL") -> Optional[dict]:
    """
    Reverse geocode coordinates to find the nearest city/town/village.

    Uses the GeoNames database and a squared-degree (Euclidean) distance
    approximation, which is adequate at country scale. EXCLUDES
    neighborhoods/districts (PPLX) - only returns proper settlements.

    Args:
        latitude: Latitude coordinate
        longitude: Longitude coordinate
        country_code: ISO 3166-1 alpha-2 country code (default: NL)

    Returns:
        Dict with 'city', 'region', 'region_code', 'city_code',
        'admin1_code', 'geonames_id', 'population', 'feature_code',
        'distance_km', or None if no settlement was found.
    """
    db = get_geonames_db()
    # Feature codes for proper settlements (cities, towns, villages):
    #   PPL   = populated place (city/town/village)
    #   PPLA  = seat of first-order admin division (provincial capital)
    #   PPLA2..PPLA4 = seats of lower-order admin divisions
    #   PPLC  = capital of a political entity (national capital)
    #   PPLS  = populated places (multiple)
    #   PPLG  = seat of government (when different from capital)
    # EXCLUDED:
    #   PPLX  = section of populated place (neighborhood, district, quarter)
    #           e.g., "Binnenstad" (city center), "Amsterdam Binnenstad"
    VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
    # Nearest settlement by Euclidean distance approximation
    # (good enough for country-scale distances).
    query = """
        SELECT
            name,
            ascii_name,
            admin1_code,
            admin1_name,
            latitude,
            longitude,
            geonames_id,
            population,
            feature_code,
            ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
        FROM cities
        WHERE country_code = ?
        AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
        ORDER BY distance_sq
        LIMIT 1
    """
    import sqlite3
    conn = sqlite3.connect(str(db.db_path))
    try:
        cursor = conn.cursor()
        cursor.execute(query, (latitude, latitude, longitude, longitude, country_code, *VALID_FEATURE_CODES))
        row = cursor.fetchone()
        if row:
            (name, ascii_name, admin1_code, admin1_name, lat, lon,
             geonameid, population, feature_code, distance_sq) = row
            # Get city code using disambiguation, with a local fallback
            city_code = db.get_city_abbreviation(name, country_code, use_disambiguation=True)
            if not city_code:
                city_code = get_city_code(name)
            # Map admin1 code to ISO 3166-2 (country-specific mapping)
            country_admin1_map = GEONAMES_ADMIN1_TO_ISO.get(country_code, {})
            region_code = country_admin1_map.get(admin1_code, admin1_code if admin1_code else "00")
            return {
                'city': name,
                'ascii_name': ascii_name,
                'region': admin1_name,
                'region_code': region_code,
                'city_code': city_code,
                'admin1_code': admin1_code,
                'geonames_id': geonameid,
                # BUGFIX: include population - callers use it for the
                # micro-hamlet threshold (it was fetched but never returned,
                # so the threshold check always saw 0).
                'population': population,
                'feature_code': feature_code,
                'distance_km': (distance_sq ** 0.5) * 111,  # Approximate km (1 degree ≈ 111km)
            }
    finally:
        conn.close()
    return None
# Institution type code mapping (from original entry 'type' field).
# NOTE: this is deliberately an identity map - it acts as an allow-list of
# known single-letter type codes. extract_entry_data() looks codes up with
# TYPE_CODE_MAP.get(code, 'U'), so any code not listed here falls back to
# 'U' (Unknown).
TYPE_CODE_MAP: Dict[str, str] = {
    "G": "G",  # Gallery
    "L": "L",  # Library
    "A": "A",  # Archive
    "M": "M",  # Museum
    "O": "O",  # Official Institution
    "R": "R",  # Research Center
    "C": "C",  # Corporation
    "U": "U",  # Unknown
    "B": "B",  # Botanical/Zoo
    "E": "E",  # Education Provider
    "S": "S",  # Collecting Society
    "P": "P",  # Personal Collection
    "F": "F",  # Features (monuments, etc.)
    "I": "I",  # Intangible Heritage Group
    "X": "X",  # Mixed
    "H": "H",  # Holy Sites
    "D": "D",  # Digital Platform
    "N": "N",  # NGO
    "T": "T",  # Taste/Smell Heritage
}
def get_region_code(region_name: Optional[str]) -> str:
    """
    Map a Dutch province name (Dutch or English spelling) to its
    ISO 3166-2 region code.

    Args:
        region_name: Province/region name, possibly with diacritics

    Returns:
        2-letter region code, or "00" when the name is missing or unknown
    """
    if not region_name:
        return "00"
    # Lowercase, then strip diacritics (NFD decomposition + drop combining marks).
    decomposed = unicodedata.normalize('NFD', region_name.lower())
    key = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn').strip()
    return DUTCH_PROVINCE_CODES.get(key, "00")
def get_city_code(city_name: str) -> str:
    """
    Derive a 3-letter uppercase city code from a city name.

    Rules:
      1. Single word: first 3 letters.
      2. Leading Dutch article/preposition (de, het, den, 's, op, aan,
         bij, ter): its first letter + first 2 letters of the next word.
      3. Other multi-word names: first letter of each of the first 3 words.
    The result is forced to exactly 3 characters (padded with 'X') and any
    non-A-Z character is replaced by 'X'.

    Args:
        city_name: City name (may be empty)

    Returns:
        3-letter uppercase city code ("XXX" when no usable name)
    """
    if not city_name:
        return "XXX"
    # Normalize: remove accents, handle special chars
    words = normalize_city_name(city_name).split()
    if not words:
        return "XXX"
    # Dutch articles and prepositions that trigger rule 2
    particles = {'de', 'het', 'den', "'s", 'op', 'aan', 'bij', 'ter'}
    if len(words) == 1:
        code = words[0][:3].upper()
    elif words[0].lower() in particles:
        code = (words[0][0] + words[1][:2]).upper()
    else:
        code = ''.join(word[0] for word in words[:3]).upper()
    # Force exactly 3 characters, then sanitize to A-Z only.
    code = code[:3].ljust(3, 'X')
    return re.sub(r'[^A-Z]', 'X', code)
def generate_name_suffix(institution_name: str) -> str:
    """
    Build a snake_case suffix from an institution name.

    Used for collision resolution: the native-language name is lowercased,
    diacritics and punctuation are stripped, separators become underscores,
    and the result is capped at 50 characters.

    Args:
        institution_name: Full institution name

    Returns:
        snake_case suffix (e.g., "historische_kring_zwolle"), or "unknown"
        when nothing usable remains
    """
    if not institution_name:
        return "unknown"
    # NFD decomposition, then drop combining marks to remove accents.
    decomposed = unicodedata.normalize('NFD', institution_name)
    text = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn').lower()
    # Strip punctuation, then turn separator runs into underscores.
    text = re.sub(r"[''`\",.:;!?()[\]{}]", '', text)
    text = re.sub(r'[\s\-/]+', '_', text)
    # Keep only [a-z0-9_], collapse repeats, trim edge underscores.
    text = re.sub(r'[^a-z0-9_]', '', text)
    text = re.sub(r'_+', '_', text).strip('_')
    # Cap the suffix length at 50 characters.
    if len(text) > 50:
        text = text[:50].rstrip('_')
    return text or "unknown"
def extract_entry_data(entry: dict) -> dict:
    """
    Extract relevant data from an entry for GHCID generation.

    Settlement Resolution Priority (Updated Dec 2025):
    1. Google Maps locality (address_components with 'locality' type) - AUTHORITATIVE
       - Avoids the micro-hamlet problem where GeoNames finds tiny settlements
       - Cross-referenced with GeoNames for geonames_id and city_code
    2. GeoNames reverse geocoding (if no Google Maps locality)
       - Uses MIN_POPULATION=100 threshold to skip micro-hamlets
    3. GeoNames name lookup (if only text city available) - FALLBACK
    4. Text-based city name (if GeoNames lookup fails) - LAST RESORT

    The micro-hamlet problem: GeoNames may return tiny settlements like "Duur"
    (pop 0) when the institution is clearly in "Olst" (pop 4,780) just because
    the coordinates are slightly closer to the hamlet. Using the Google Maps
    locality solves this.

    Args:
        entry: Entry dictionary loaded from YAML

    Returns:
        Dict with: name, type_code, city, region, country_code, wikidata_id,
        geonames_id, location_resolution
    """
    # === STEP 0: DETERMINE COUNTRY CODE FIRST ===
    # This is critical for correct GeoNames reverse geocoding!
    country_code = "NL"  # Default to Netherlands
    # Check zcbs_enrichment.country (most explicit source)
    if 'zcbs_enrichment' in entry and entry['zcbs_enrichment'].get('country'):
        country_code = entry['zcbs_enrichment']['country']
    # Check location.country
    elif 'location' in entry and entry['location'].get('country'):
        country_code = entry['location']['country']
    # Check locations[].country
    elif 'locations' in entry and entry['locations']:
        loc = entry['locations'][0]
        if loc.get('country'):
            country_code = loc['country']
    # Check original_entry for country indicators
    elif 'original_entry' in entry:
        # Check for explicit country field
        if entry['original_entry'].get('country'):
            country_code = entry['original_entry']['country']
        # Check for country hints in the organisation name
        elif entry['original_entry'].get('organisatie'):
            org_name = entry['original_entry']['organisatie'].lower()
            if 'belgium' in org_name or 'belgië' in org_name or 'belgique' in org_name:
                country_code = "BE"
            elif 'germany' in org_name or 'deutschland' in org_name:
                country_code = "DE"
    # Check google_maps_enrichment.address for country
    if country_code == "NL" and 'google_maps_enrichment' in entry:
        address = entry['google_maps_enrichment'].get('address', '')
        if address:
            if ', Belgium' in address or ', België' in address:
                country_code = "BE"
            elif ', Germany' in address or ', Deutschland' in address:
                country_code = "DE"
    # Check wikidata_enrichment for country/location hints
    if country_code == "NL" and 'wikidata_enrichment' in entry:
        wiki = entry['wikidata_enrichment']
        # Check located_in label for country hints
        located_in = wiki.get('located_in', {})
        if isinstance(located_in, dict):
            label = located_in.get('label', '').lower()
            if 'belgium' in label or 'belgië' in label:
                country_code = "BE"
            elif 'germany' in label or 'deutschland' in label:
                country_code = "DE"
    # Get institution name
    # Priority: custodian_name (verified) > original_entry > wikidata
    name = None
    # Try custodian_name first (XPath-verified from website or authoritative fallback)
    if 'custodian_name' in entry and entry['custodian_name'].get('claim_value'):
        name = entry['custodian_name']['claim_value']
    # Fallback to original_entry.organisatie
    if not name and 'original_entry' in entry:
        name = entry['original_entry'].get('organisatie')
    # Fallback to wikidata labels (Dutch first, then English)
    if not name and 'wikidata_enrichment' in entry:
        name = entry['wikidata_enrichment'].get('wikidata_label_nl')
        if not name:
            name = entry['wikidata_enrichment'].get('wikidata_label_en')
    if not name:
        name = "Unknown Institution"
    # Get institution type
    type_codes = []
    # Check organization.institution_type first (enriched data)
    if 'organization' in entry and 'institution_type' in entry['organization']:
        org_type = entry['organization']['institution_type']
        if isinstance(org_type, list):
            type_codes = org_type
        elif isinstance(org_type, str):
            type_codes = [org_type]
    # Fallback to original_entry.type
    if not type_codes and 'original_entry' in entry and 'type' in entry['original_entry']:
        types = entry['original_entry']['type']
        if isinstance(types, list):
            type_codes = types
        elif isinstance(types, str):
            type_codes = [types]
    # Use first type, default to U (Unknown)
    type_code = type_codes[0] if type_codes else 'U'
    # === STEP 1: EXTRACT COORDINATES FROM ALL SOURCES ===
    latitude = None
    longitude = None
    coord_source = None
    # Try google_maps_enrichment first (most accurate coordinates)
    if 'google_maps_enrichment' in entry:
        gm = entry['google_maps_enrichment']
        # Check nested 'coordinates' object first (new format)
        if isinstance(gm.get('coordinates'), dict):
            coords = gm['coordinates']
            if coords.get('latitude') and coords.get('longitude'):
                latitude = coords.get('latitude')
                longitude = coords.get('longitude')
                coord_source = 'google_maps'
        # Fallback to flat structure (old format)
        if latitude is None and gm.get('latitude') and gm.get('longitude'):
            latitude = gm.get('latitude')
            longitude = gm.get('longitude')
            coord_source = 'google_maps'
    # Try wikidata coordinates (multiple possible field names)
    if latitude is None and 'wikidata_enrichment' in entry:
        wiki = entry['wikidata_enrichment']
        # Check 'wikidata_coordinates' field first
        coords = wiki.get('wikidata_coordinates')
        if isinstance(coords, dict) and coords.get('latitude') and coords.get('longitude'):
            latitude = coords.get('latitude')
            longitude = coords.get('longitude')
            coord_source = 'wikidata'
        # Also check 'coordinates' field (alternative format)
        if latitude is None:
            coords = wiki.get('coordinates')
            if isinstance(coords, dict) and coords.get('latitude') and coords.get('longitude'):
                latitude = coords.get('latitude')
                longitude = coords.get('longitude')
                coord_source = 'wikidata'
        # Also check wikidata_claims for coordinates
        if latitude is None:
            claims = wiki.get('wikidata_claims', {})
            coords = claims.get('coordinate_location') or claims.get('coordinates')
            if isinstance(coords, dict) and coords.get('latitude') and coords.get('longitude'):
                latitude = coords.get('latitude')
                longitude = coords.get('longitude')
                coord_source = 'wikidata_claims'
    # Try locations[] array
    if latitude is None and 'locations' in entry and entry['locations']:
        loc = entry['locations'][0]
        if loc.get('latitude') and loc.get('longitude'):
            latitude = loc.get('latitude')
            longitude = loc.get('longitude')
            coord_source = 'locations'
    # Try location{} object (singular) with nested coordinates
    if latitude is None and 'location' in entry:
        loc = entry['location']
        # Check nested 'coordinates' object
        if isinstance(loc.get('coordinates'), dict):
            coords = loc['coordinates']
            if coords.get('latitude') and coords.get('longitude'):
                latitude = coords.get('latitude')
                longitude = coords.get('longitude')
                coord_source = 'location'
        # Fallback to flat structure
        elif loc.get('latitude') and loc.get('longitude'):
            latitude = loc.get('latitude')
            longitude = loc.get('longitude')
            coord_source = 'location'
    # === STEP 2: EXTRACT CITY FROM GOOGLE MAPS LOCALITY (AUTHORITATIVE) ===
    # Google Maps address_components with 'locality' type is the most reliable
    # city source; avoids the micro-hamlet problem described in the docstring.
    city = None
    region = None
    geonames_id = None
    location_resolution = None
    google_maps_locality = None
    google_maps_region = None
    if 'google_maps_enrichment' in entry:
        gm = entry['google_maps_enrichment']
        for comp in gm.get('address_components', []):
            types = comp.get('types', [])
            if 'locality' in types:
                google_maps_locality = comp.get('long_name')
            elif 'administrative_area_level_1' in types:
                # Extract region code from short_name (e.g., "OV" for Overijssel)
                google_maps_region = comp.get('short_name')
    # === STEP 3: USE GOOGLE MAPS LOCALITY OR FALL BACK TO GEONAMES ===
    if google_maps_locality:
        # Use Google Maps locality as the authoritative city name
        city = google_maps_locality
        region = google_maps_region
        # Look up in GeoNames to get geonames_id and city_code
        db = get_geonames_db()
        try:
            # Search for the city in GeoNames by name
            import sqlite3
            conn = sqlite3.connect(str(db.db_path))
            # BUGFIX: close the connection in a finally block so it is
            # released even when the query raises (it used to leak).
            try:
                cursor = conn.cursor()
                VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                cursor.execute("""
                    SELECT geonames_id, name, feature_code, population, admin1_code
                    FROM cities
                    WHERE country_code = ?
                    AND (name = ? OR ascii_name = ?)
                    AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
                    ORDER BY population DESC
                    LIMIT 1
                """, (country_code, google_maps_locality, google_maps_locality, *VALID_FEATURE_CODES))
                row = cursor.fetchone()
            finally:
                conn.close()
            if row:
                geonames_id, geonames_name, feature_code, population, admin1_code = row
                # Map admin1 code to ISO 3166-2 if we don't have region from Google Maps
                if not region:
                    country_admin1_map = GEONAMES_ADMIN1_TO_ISO.get(country_code, {})
                    region = country_admin1_map.get(admin1_code, admin1_code if admin1_code else "00")
                location_resolution = {
                    'method': 'GOOGLE_MAPS_LOCALITY',
                    'google_maps_locality': google_maps_locality,
                    'geonames_id': geonames_id,
                    'geonames_name': geonames_name,
                    'feature_code': feature_code,
                    'population': population,
                    'admin1_code': admin1_code,
                    'region_code': region,
                    'country_code': country_code,
                    'source_coordinates': {
                        'latitude': latitude,
                        'longitude': longitude,
                        'source': coord_source,
                    } if latitude and longitude else None,
                }
            else:
                # GeoNames lookup found nothing, but we still have the locality
                location_resolution = {
                    'method': 'GOOGLE_MAPS_LOCALITY',
                    'google_maps_locality': google_maps_locality,
                    'geonames_id': None,
                    'geonames_name': None,
                    'region_code': region,
                    'country_code': country_code,
                    'needs_geonames_entry': True,
                    'source_coordinates': {
                        'latitude': latitude,
                        'longitude': longitude,
                        'source': coord_source,
                    } if latitude and longitude else None,
                }
        except Exception as e:
            # GeoNames lookup failed, but we still have Google Maps locality
            location_resolution = {
                'method': 'GOOGLE_MAPS_LOCALITY',
                'google_maps_locality': google_maps_locality,
                'geonames_id': None,
                'geonames_name': None,
                'region_code': region,
                'country_code': country_code,
                'error': str(e),
            }
    elif latitude is not None and longitude is not None:
        # No Google Maps locality - fall back to GeoNames reverse geocoding
        # with a POPULATION THRESHOLD to avoid micro-hamlets
        MIN_POPULATION = 100  # Skip settlements with population < 100
        try:
            geo_result = reverse_geocode_to_city(latitude, longitude, country_code)
            if geo_result:
                population = geo_result.get('population', 0) or 0
                # If population is too low, try to find a larger nearby settlement
                if population < MIN_POPULATION:
                    # Search for nearest settlement with population >= MIN_POPULATION
                    import sqlite3
                    conn = sqlite3.connect(str(get_geonames_db().db_path))
                    # BUGFIX: close the connection even when the query raises
                    # (the leak was previously hidden by the outer except).
                    try:
                        cursor = conn.cursor()
                        VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                        cursor.execute("""
                            SELECT
                                name, ascii_name, admin1_code, admin1_name,
                                latitude, longitude, geonames_id, population, feature_code,
                                ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
                            FROM cities
                            WHERE country_code = ?
                            AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
                            AND population >= ?
                            ORDER BY distance_sq
                            LIMIT 1
                        """, (latitude, latitude, longitude, longitude, country_code, *VALID_FEATURE_CODES, MIN_POPULATION))
                        row = cursor.fetchone()
                    finally:
                        conn.close()
                    if row:
                        # BUGFIX: unpack into settlement_name (was 'name'),
                        # which clobbered the institution name returned below.
                        (settlement_name, ascii_name, admin1_code, admin1_name,
                         lat, lon, geonameid, pop, fcode, dist_sq) = row
                        # Get city code using disambiguation
                        db = get_geonames_db()
                        city_code = db.get_city_abbreviation(settlement_name, country_code, use_disambiguation=True)
                        if not city_code:
                            city_code = get_city_code(settlement_name)
                        country_admin1_map = GEONAMES_ADMIN1_TO_ISO.get(country_code, {})
                        region_code = country_admin1_map.get(admin1_code, admin1_code if admin1_code else "00")
                        geo_result = {
                            'city': settlement_name,
                            'ascii_name': ascii_name,
                            'region': admin1_name,
                            'region_code': region_code,
                            'city_code': city_code,
                            'admin1_code': admin1_code,
                            'geonames_id': geonameid,
                            'feature_code': fcode,
                            'population': pop,
                            'distance_km': (dist_sq ** 0.5) * 111,
                            'micro_hamlet_skipped': True,
                        }
                city = geo_result.get('city')
                region = geo_result.get('region_code')  # ISO 3166-2 code
                geonames_id = geo_result.get('geonames_id')
                location_resolution = {
                    'method': 'REVERSE_GEOCODE',
                    'geonames_id': geonames_id,
                    'geonames_name': city,
                    'feature_code': geo_result.get('feature_code'),
                    'population': geo_result.get('population'),
                    'admin1_code': geo_result.get('admin1_code'),
                    'region_code': region,
                    'country_code': country_code,
                    'source_coordinates': {
                        'latitude': latitude,
                        'longitude': longitude,
                        'source': coord_source,
                    },
                    'distance_km': geo_result.get('distance_km'),
                }
                if geo_result.get('micro_hamlet_skipped'):
                    location_resolution['micro_hamlet_skipped'] = True
        except Exception:
            # Best-effort: swallow and fall back to text-based resolution below
            pass
    # === STEP 4: TEXT-BASED CITY EXTRACTION (LAST RESORT) ===
    text_city = None
    # Source 1: locations[] array (already enriched)
    if 'locations' in entry and entry['locations']:
        loc = entry['locations'][0]
        text_city = loc.get('city')
        if not region:
            region = loc.get('region')
    # Source 2: original_entry.plaatsnaam_bezoekadres (NDE CSV)
    if not text_city and 'original_entry' in entry:
        raw_city = entry['original_entry'].get('plaatsnaam_bezoekadres')
        if raw_city:
            # Handle formats like "Hoogeveen (en Zuidwolde)" - take first city
            clean_city = re.sub(r'\s*\([^)]+\)', '', raw_city).strip()
            if '/' in clean_city:
                clean_city = clean_city.split('/')[0].strip()
            if ' en ' in clean_city.lower():
                clean_city = re.split(r'\s+en\s+', clean_city, flags=re.IGNORECASE)[0].strip()
            text_city = clean_city if clean_city else raw_city
    # Source 3: google_maps_enrichment address
    if not text_city and 'google_maps_enrichment' in entry:
        gm = entry['google_maps_enrichment']
        address = gm.get('address', '')
        if address:
            parts = address.split(',')
            if len(parts) >= 2:
                last_part = parts[-1].strip()
                # Strip a leading Dutch postcode ("1234 AB ") if present
                city_match = re.sub(r'^\d{4}\s*[A-Z]{2}\s*', '', last_part)
                if city_match:
                    text_city = city_match
        if not text_city:
            text_city = gm.get('city')
    # Source 4: museum_register_enrichment.province (for region only)
    if not region and 'museum_register_enrichment' in entry:
        region = entry['museum_register_enrichment'].get('province')
    # Source 5: wikidata_enrichment.wikidata_claims.location
    if not text_city and 'wikidata_enrichment' in entry:
        claims = entry['wikidata_enrichment'].get('wikidata_claims', {})
        if 'location' in claims:
            loc_data = claims['location']
            if isinstance(loc_data, dict):
                text_city = loc_data.get('label_en') or loc_data.get('label_nl')
    # Source 6: wikidata description for city hint
    if not text_city and 'wikidata_enrichment' in entry:
        desc_nl = entry['wikidata_enrichment'].get('wikidata_description_nl', '')
        city_match = re.search(r'in\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?),?\s*(?:Nederland|Netherlands)', desc_nl)
        if city_match:
            text_city = city_match.group(1)
    # === STEP 5: USE TEXT CITY IF GEONAMES RESOLUTION FAILED ===
    if not city and text_city:
        city = text_city
        location_resolution = {
            'method': 'TEXT_FALLBACK',
            'text_source': 'various',
            'geonames_id': None,
            'needs_review': True,
        }
    # Get Wikidata ID
    wikidata_id = None
    if 'wikidata_enrichment' in entry:
        wikidata_id = entry['wikidata_enrichment'].get('wikidata_entity_id')
    if not wikidata_id and 'original_entry' in entry:
        wikidata_id = entry['original_entry'].get('wikidata_id')
    return {
        'name': name,
        'type_code': TYPE_CODE_MAP.get(type_code, 'U'),
        'city': city,
        'region': region,
        'country_code': country_code,
        'wikidata_id': wikidata_id,
        'geonames_id': geonames_id,
        'location_resolution': location_resolution,
    }
def generate_base_ghcid(data: dict) -> Tuple[str, GHCIDComponents]:
    """
    Build the base GHCID (without collision name suffix) for an institution.

    Args:
        data: Dict with name, type_code, city, region, country_code

    Returns:
        Tuple of (base_ghcid_string, GHCIDComponents)
    """
    country_code = data.get('country_code', 'NL')
    # Region may already be an ISO-style code (NL: 2 letters, BE: 3 letters)
    # or a province name that still needs mapping.
    region = data.get('region')
    if region and len(region) in (2, 3) and region.isupper():
        region_code = region
    elif region:
        region_code = get_region_code(region)
    else:
        region_code = "00"
    city_code = get_city_code(data['city']) if data['city'] else "XXX"
    # Derive the abbreviation from the institution name, with a generic fallback.
    abbreviation = extract_abbreviation_from_name(data['name']) or "INST"
    # Collisions are resolved later via name suffixes, so no Wikidata QID here.
    components = GHCIDComponents(
        country_code=country_code,
        region_code=region_code,
        city_locode=city_code,
        institution_type=data['type_code'],
        abbreviation=abbreviation,
        wikidata_qid=None,
    )
    return components.to_string(), components
def process_entries(entries_dir: Path, dry_run: bool = False) -> dict:
"""
Process all entry files and generate GHCIDs.
Args:
entries_dir: Path to entries directory
dry_run: If True, don't write changes
Returns:
Statistics dictionary
"""
stats = {
'total': 0,
'success': 0,
'skipped_no_location': 0,
'skipped_not_custodian': 0,
'collisions': 0,
'collision_groups': 0,
'files_updated': 0,
'google_maps_locality': 0, # Entries resolved via Google Maps locality (best)
'geonames_resolved': 0, # Entries resolved via GeoNames reverse geocoding
'text_fallback': 0, # Entries using text-based city (needs review)
'errors': [],
}
# Timestamp for this batch
generation_timestamp = datetime.now(timezone.utc).isoformat()
# Phase 1: Load all entries and generate base GHCIDs
print("Phase 1: Loading entries and generating base GHCIDs...")
entries_data = [] # List of (filepath, entry, extracted_data, base_ghcid, components)
yaml_files = sorted(entries_dir.glob("*.yaml"))
stats['total'] = len(yaml_files)
for filepath in yaml_files:
try:
with open(filepath, 'r', encoding='utf-8') as f:
entry = yaml.safe_load(f)
if not entry:
continue
# Check if NOT_CUSTODIAN (skip these)
if entry.get('google_maps_status') == 'NOT_CUSTODIAN':
stats['skipped_not_custodian'] += 1
continue
# Extract data
data = extract_entry_data(entry)
# Check if we have location data
if not data['city']:
stats['skipped_no_location'] += 1
continue
# Track resolution method
loc_resolution = data.get('location_resolution', {})
method = loc_resolution.get('method', '')
if method == 'GOOGLE_MAPS_LOCALITY':
stats['google_maps_locality'] += 1
elif method == 'REVERSE_GEOCODE':
stats['geonames_resolved'] += 1
elif method == 'TEXT_FALLBACK':
stats['text_fallback'] += 1
# Generate base GHCID
base_ghcid, components = generate_base_ghcid(data)
entries_data.append({
'filepath': filepath,
'entry': entry,
'data': data,
'base_ghcid': base_ghcid,
'components': components,
})
except Exception as e:
stats['errors'].append(f"{filepath.name}: {str(e)}")
print(f" Loaded {len(entries_data)} entries with location data")
print(f" - Google Maps locality (best): {stats['google_maps_locality']}")
print(f" - GeoNames reverse geocode: {stats['geonames_resolved']}")
print(f" - Text fallback (needs review): {stats['text_fallback']}")
print(f" Skipped {stats['skipped_no_location']} entries without city")
print(f" Skipped {stats['skipped_not_custodian']} NOT_CUSTODIAN entries")
# Phase 2: Detect collisions
print("\nPhase 2: Detecting GHCID collisions...")
collision_groups = defaultdict(list)
for ed in entries_data:
collision_groups[ed['base_ghcid']].append(ed)
# Count collisions
for base_ghcid, group in collision_groups.items():
if len(group) > 1:
stats['collision_groups'] += 1
stats['collisions'] += len(group)
print(f" Found {stats['collision_groups']} collision groups ({stats['collisions']} entries)")
# Phase 3: Resolve collisions and generate final GHCIDs
print("\nPhase 3: Resolving collisions and generating final GHCIDs...")
collision_report = []
for base_ghcid, group in collision_groups.items():
if len(group) > 1:
# COLLISION: Apply First Batch rule - ALL get name suffixes
collision_report.append({
'base_ghcid': base_ghcid,
'count': len(group),
'institutions': [ed['data']['name'] for ed in group],
})
for ed in group:
# Generate name suffix
name_suffix = generate_name_suffix(ed['data']['name'])
ed['final_ghcid'] = f"{base_ghcid}-{name_suffix}"
ed['had_collision'] = True
else:
# No collision: use base GHCID
ed = group[0]
ed['final_ghcid'] = base_ghcid
ed['had_collision'] = False
# Phase 4: Generate all identifier formats and update entries
print("\nPhase 4: Generating identifier formats and updating entries...")
for ed in entries_data:
final_ghcid = ed['final_ghcid']
# Create final components with the resolved GHCID string
# We need to parse it back or generate UUIDs directly
# For simplicity, hash the final GHCID string directly
import hashlib
import uuid
# GHCID UUID v5 Namespace
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
# Generate UUID v5 (SHA-1)
ghcid_uuid = uuid.uuid5(GHCID_NAMESPACE, final_ghcid)
# Generate UUID v8 (SHA-256)
hash_bytes = hashlib.sha256(final_ghcid.encode('utf-8')).digest()
uuid_bytes = bytearray(hash_bytes[:16])
uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80 # Version 8
uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80 # Variant RFC 4122
ghcid_uuid_sha256 = uuid.UUID(bytes=bytes(uuid_bytes))
# Generate numeric (64-bit)
ghcid_numeric = int.from_bytes(hash_bytes[:8], byteorder='big', signed=False)
# Generate record ID (UUID v7 - time-ordered, non-deterministic)
record_id = GHCIDComponents.generate_uuid_v7()
# Create GHCID block for entry
ghcid_block = {
'ghcid_current': final_ghcid,
'ghcid_original': final_ghcid, # Same for first assignment
'ghcid_uuid': str(ghcid_uuid),
'ghcid_uuid_sha256': str(ghcid_uuid_sha256),
'ghcid_numeric': ghcid_numeric,
'record_id': str(record_id),
'generation_timestamp': generation_timestamp,
'ghcid_history': [
{
'ghcid': final_ghcid,
'ghcid_numeric': ghcid_numeric,
'valid_from': generation_timestamp,
'valid_to': None,
'reason': 'Initial GHCID assignment (NDE batch import December 2025)'
+ (' - name suffix added to resolve collision' if ed.get('had_collision') else ''),
}
],
}
# Add location resolution metadata (GeoNames provenance)
if ed['data'].get('location_resolution'):
ghcid_block['location_resolution'] = ed['data']['location_resolution']
# Add GeoNames ID if available
if ed['data'].get('geonames_id'):
ghcid_block['geonames_id'] = ed['data']['geonames_id']
# Add collision info if applicable
if ed.get('had_collision'):
ghcid_block['collision_resolved'] = True
ghcid_block['base_ghcid_before_collision'] = ed['base_ghcid']
# Update entry
entry = ed['entry']
entry['ghcid'] = ghcid_block
# Also add to identifiers list
if 'identifiers' not in entry:
entry['identifiers'] = []
# Remove any existing GHCID identifiers
entry['identifiers'] = [
i for i in entry['identifiers']
if i.get('identifier_scheme') not in ['GHCID', 'GHCID_NUMERIC', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'RECORD_ID']
]
# Add new GHCID identifiers
entry['identifiers'].extend([
{
'identifier_scheme': 'GHCID',
'identifier_value': final_ghcid,
},
{
'identifier_scheme': 'GHCID_UUID',
'identifier_value': str(ghcid_uuid),
'identifier_url': f'urn:uuid:{ghcid_uuid}',
},
{
'identifier_scheme': 'GHCID_UUID_SHA256',
'identifier_value': str(ghcid_uuid_sha256),
'identifier_url': f'urn:uuid:{ghcid_uuid_sha256}',
},
{
'identifier_scheme': 'GHCID_NUMERIC',
'identifier_value': str(ghcid_numeric),
},
{
'identifier_scheme': 'RECORD_ID',
'identifier_value': str(record_id),
'identifier_url': f'urn:uuid:{record_id}',
},
])
ed['entry'] = entry
stats['success'] += 1
# Phase 5: Write updated entries
if not dry_run:
print("\nPhase 5: Writing updated entry files...")
for ed in entries_data:
filepath = ed['filepath']
entry = ed['entry']
try:
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
stats['files_updated'] += 1
except Exception as e:
stats['errors'].append(f"Write error {filepath.name}: {str(e)}")
print(f" Updated {stats['files_updated']} files")
else:
print("\nPhase 5: DRY RUN - no files written")
# Phase 6: Generate collision report
print("\nPhase 6: Generating collision report...")
if collision_report:
report_path = entries_dir.parent / "ghcid_collision_report.json"
report = {
'generation_timestamp': generation_timestamp,
'total_entries': stats['total'],
'entries_with_ghcid': stats['success'],
'collision_groups': stats['collision_groups'],
'entries_with_collisions': stats['collisions'],
'collision_resolution_strategy': 'first_batch_all_get_name_suffix',
'collisions': collision_report,
}
if not dry_run:
with open(report_path, 'w', encoding='utf-8') as f:
json.dump(report, f, indent=2, ensure_ascii=False)
print(f" Collision report written to: {report_path}")
else:
print(f" Would write collision report to: {report_path}")
return stats
def main():
    """Command-line entry point for the GHCID enrichment run.

    Parses the ``--dry-run`` flag, locates the NDE entries directory
    relative to the project root, delegates all work to
    ``process_entries``, and prints a human-readable summary of the
    returned statistics. Exits with status 1 if the entries directory
    does not exist.
    """
    cli = argparse.ArgumentParser(description="Enrich NDE entries with GHCID identifiers")
    cli.add_argument('--dry-run', action='store_true', help="Preview changes without writing")
    opts = cli.parse_args()

    # The script lives in scripts/, so the project root is one level up.
    project_root = Path(__file__).parent.parent
    entries_dir = project_root / "data" / "nde" / "enriched" / "entries"

    separator = "=" * 70
    print(separator)
    print("NDE HERITAGE INSTITUTION GHCID ENRICHMENT")
    print(separator)
    print(f"Entries directory: {entries_dir}")
    print(f"Dry run: {opts.dry_run}")
    print()

    if not entries_dir.exists():
        print(f"ERROR: Entries directory not found: {entries_dir}")
        sys.exit(1)

    # Run all enrichment phases; the returned dict carries run statistics.
    result = process_entries(entries_dir, dry_run=opts.dry_run)

    # Summary report.
    print()
    print(separator)
    print("GHCID ENRICHMENT SUMMARY")
    print(separator)
    print(f"Total entry files: {result['total']}")
    print(f"Entries with GHCID generated: {result['success']}")
    print(f" - Google Maps locality: {result['google_maps_locality']}")
    print(f" - GeoNames reverse geocode: {result['geonames_resolved']}")
    print(f" - Text fallback (review): {result['text_fallback']}")
    print(f"Skipped (no city): {result['skipped_no_location']}")
    print(f"Skipped (NOT_CUSTODIAN): {result['skipped_not_custodian']}")
    print(f"Collision groups: {result['collision_groups']}")
    print(f"Entries with collisions: {result['collisions']}")
    print(f"Files updated: {result['files_updated']}")

    # Show at most the first 10 errors; summarize any remainder.
    errors = result['errors']
    if errors:
        print(f"\nErrors ({len(errors)}):")
        for err in errors[:10]:
            print(f" - {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    print()
    print(separator)
    if opts.dry_run:
        print("DRY RUN COMPLETE - No files were modified")
    else:
        print("GHCID ENRICHMENT COMPLETE")
    print(separator)
# Script entry point: run the enrichment only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()