glam/scripts/enrich_kien_ghcid.py
2025-12-05 15:30:23 +01:00

674 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Generate GHCIDs for KIEN intangible heritage custodian entries.
This script is a targeted version of enrich_nde_entries_ghcid.py that only
processes KIEN entries (entry_index 1674-1860) to avoid processing the
entire NDE dataset.
Usage:
python scripts/enrich_kien_ghcid.py [--dry-run]
"""
import argparse
import hashlib
import json
import re
import sqlite3
import sys
import unicodedata
import uuid
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple
import yaml
# Project root
PROJECT_ROOT = Path(__file__).parent.parent
# GHCID UUID v5 Namespace (DNS namespace from RFC 4122)
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
# GeoNames admin1 code to ISO 3166-2 NL mapping
GEONAMES_ADMIN1_TO_ISO_NL = {
"01": "DR", # Drenthe
"02": "FR", # Friesland
"03": "GE", # Gelderland
"04": "GR", # Groningen
"05": "LI", # Limburg
"06": "NB", # Noord-Brabant
"07": "NH", # Noord-Holland
"09": "UT", # Utrecht
"10": "ZE", # Zeeland
"11": "ZH", # Zuid-Holland
"15": "OV", # Overijssel
"16": "FL", # Flevoland
}
# Dutch articles/prepositions to skip in abbreviation generation
DUTCH_SKIP_WORDS = {
'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
"'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
'door', 'en', 'of', 'stichting', 'vereniging', 'foundation', 'the', 'a', 'an'
}
# Valid GeoNames feature codes (settlements, not neighborhoods)
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
def generate_uuid_v7() -> uuid.UUID:
    """Generate a UUID v7 (time-ordered, for database records).

    Bit layout: 48-bit Unix-epoch millisecond timestamp, 4-bit version
    field (7) plus 12 random bits, 2-bit variant field (10) plus 62
    random bits.

    Returns:
        A freshly generated, time-sortable :class:`uuid.UUID`.
    """
    import os
    import time

    millis = int(time.time() * 1000)
    rand_hi = int.from_bytes(os.urandom(2), byteorder='big')
    rand_lo = int.from_bytes(os.urandom(8), byteorder='big')

    raw = bytearray(16)
    raw[0:6] = millis.to_bytes(6, byteorder='big')        # 48-bit timestamp
    raw[6] = 0x70 | ((rand_hi >> 8) & 0x0F)               # version 7 + 4 random bits
    raw[7] = rand_hi & 0xFF                               # 8 more random bits
    raw[8] = 0x80 | ((rand_lo >> 56) & 0x3F)              # variant 10 + 6 random bits
    raw[9:16] = rand_lo.to_bytes(8, byteorder='big')[1:]  # remaining 56 random bits
    return uuid.UUID(bytes=bytes(raw))
def normalize_city_name(city_name: str) -> str:
    """Strip accents and apostrophe-like characters from a city name."""
    # NFD decomposition splits diacritics into separate combining marks,
    # which are then dropped ('Mn' = nonspacing mark category).
    decomposed = unicodedata.normalize('NFD', city_name)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    # Remove apostrophe-like characters (straight, curly, backtick).
    return re.sub(r"[''`]", '', without_marks)
def get_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name.

    Returns "XXX" when no usable name is given; otherwise builds a code
    from the first word's prefix, an article + next word, or word
    initials, padded/truncated to exactly three A-Z characters.
    """
    if not city_name:
        return "XXX"
    words = normalize_city_name(city_name).split()
    if not words:
        return "XXX"
    articles = {'de', 'het', 'den', "'s", 'op', 'aan', 'bij', 'ter'}
    head = words[0]
    if len(words) == 1:
        # Single word: its first three characters.
        raw = head[:3]
    elif head.lower() in articles:
        # Leading article: article initial + two letters of the next word.
        raw = head[0] + words[1][:2]
    else:
        # Multi-word name: initials of up to three words.
        raw = ''.join(w[0] for w in words[:3])
    code = raw.upper()
    # Force exactly three characters.
    code = code[:3] if len(code) > 3 else code.ljust(3, 'X')
    # Replace anything that is not A-Z (digits, hyphens) with 'X'.
    return re.sub(r'[^A-Z]', 'X', code)
def extract_abbreviation_from_name(name: str) -> str:
    """Extract abbreviation from institution name using first letters of significant words."""
    if not name:
        return "INST"
    # Accent-fold via NFD decomposition, dropping combining marks.
    decomposed = unicodedata.normalize('NFD', name)
    flattened = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    # Strip punctuation, keeping hyphens and in-word apostrophes intact.
    cleaned = re.sub(r"[''`\",.:;!?()[\]{}]", '', flattened)
    words = cleaned.split()
    # Keep words that are neither Dutch stop words nor pure digits.
    significant = [
        w for w in words
        if w.lower() not in DUTCH_SKIP_WORDS and not w.isdigit()
    ]
    if not significant:
        # Everything was filtered out: fall back to the first 3 raw words.
        significant = words[:3]
    # Initials of at most ten significant words.
    initials = ''.join(w[0].upper() for w in significant[:10] if w)
    return initials or "INST"
def generate_name_suffix(institution_name: str) -> str:
    """Generate snake_case name suffix for collision resolution.

    Accent-folds and lowercases the name, strips punctuation, converts
    separators to underscores and caps the result at 50 characters.
    Returns "unknown" when nothing usable remains.
    """
    if not institution_name:
        return "unknown"
    # Accent-fold (NFD + drop combining marks), then lowercase.
    folded = ''.join(
        ch
        for ch in unicodedata.normalize('NFD', institution_name)
        if unicodedata.category(ch) != 'Mn'
    ).lower()
    # Strip punctuation, then snake_case: separators -> '_', keep [a-z0-9_],
    # collapse runs of underscores and trim the ends.
    snake = re.sub(r"[''`\",.:;!?()[\]{}]", '', folded)
    snake = re.sub(r'[\s\-/]+', '_', snake)
    snake = re.sub(r'[^a-z0-9_]', '', snake)
    snake = re.sub(r'_+', '_', snake).strip('_')
    # Cap at 50 characters without ending on an underscore.
    if len(snake) > 50:
        snake = snake[:50].rstrip('_')
    return snake or "unknown"
def reverse_geocode(lat: float, lon: float, db_path: Path) -> Optional[dict]:
    """Reverse geocode coordinates to find nearest city using GeoNames.

    Nearest-neighbour search over Dutch settlements (population >= 100)
    using an equirectangular approximation: the longitude difference is
    scaled by cos(latitude) before squaring so east-west and north-south
    offsets are comparable.  At Dutch latitudes (~52 N) comparing raw
    degree differences would overweight longitude by roughly 40% and
    could select the wrong "nearest" settlement.

    Args:
        lat: WGS84 latitude of the point to resolve.
        lon: WGS84 longitude of the point to resolve.
        db_path: Path to the GeoNames SQLite database (table ``cities``).

    Returns:
        Dict with city name, derived city/region codes, GeoNames metadata
        and an approximate ``distance_km``, or None when nothing matches.
    """
    import math
    # Longitude-degree scale factor at this latitude.
    cos_lat = math.cos(math.radians(lat))
    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()
    try:
        query = """
            SELECT
                name, ascii_name, admin1_code, geonames_id, population, feature_code,
                ((latitude - ?) * (latitude - ?)
                 + (longitude - ?) * (longitude - ?) * ? * ?) as distance_sq
            FROM cities
            WHERE country_code = 'NL'
              AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
              AND population >= 100
            ORDER BY distance_sq
            LIMIT 1
        """
        cursor.execute(query, (lat, lat, lon, lon, cos_lat, cos_lat, *VALID_FEATURE_CODES))
        row = cursor.fetchone()
        if row:
            name, ascii_name, admin1_code, geonames_id, population, feature_code, dist_sq = row
            region_code = GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00")
            return {
                'city': name,
                'city_code': get_city_code(name),
                'region_code': region_code,
                'admin1_code': admin1_code,
                'geonames_id': geonames_id,
                'feature_code': feature_code,
                'population': population,
                # dist_sq is now in "equivalent latitude degrees" squared;
                # one latitude degree is ~111 km.
                'distance_km': (dist_sq ** 0.5) * 111,
            }
    finally:
        conn.close()
    return None
def lookup_city_by_name(city_name: str, db_path: Path) -> Optional[dict]:
    """Look up city in GeoNames by name.

    Matches either the native or the ASCII spelling among Dutch
    settlements, preferring the most populous hit.  Returns None when no
    settlement matches.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        sql = """
            SELECT
                name, admin1_code, geonames_id, population, feature_code
            FROM cities
            WHERE country_code = 'NL'
              AND (name = ? OR ascii_name = ?)
              AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
            ORDER BY population DESC
            LIMIT 1
        """
        params = (city_name, city_name) + tuple(VALID_FEATURE_CODES)
        row = conn.cursor().execute(sql, params).fetchone()
        if row is None:
            return None
        name, admin1_code, geonames_id, population, feature_code = row
        return {
            'city': name,
            'city_code': get_city_code(name),
            'region_code': GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00"),
            'admin1_code': admin1_code,
            'geonames_id': geonames_id,
            'feature_code': feature_code,
            'population': population,
        }
    finally:
        conn.close()
def extract_entry_data(entry: dict, db_path: Path) -> dict:
    """Extract data for GHCID generation from a KIEN entry.

    Resolves the institution name, type code and location (city, region,
    GeoNames id) from the entry dict, preferring structured claims over
    raw import fields and coordinates over city-name lookup.

    Args:
        entry: Parsed YAML entry dict (KIEN schema — assumed keys:
            custodian_name, original_entry, kien_enrichment, locations;
            TODO confirm against the entry schema).
        db_path: Path to the GeoNames SQLite database.

    Returns:
        Dict with keys: name, type_code, city, city_code, region_code,
        country_code (always 'NL'), geonames_id, location_resolution.
    """
    # Get name from custodian_name or original_entry
    # Resolution order: custodian_name claim -> original_entry
    # 'organisatie' -> kien_enrichment 'kien_name' -> placeholder.
    name = None
    if 'custodian_name' in entry and entry['custodian_name'].get('claim_value'):
        name = entry['custodian_name']['claim_value']
    if not name and 'original_entry' in entry:
        name = entry['original_entry'].get('organisatie')
    if not name and 'kien_enrichment' in entry:
        name = entry['kien_enrichment'].get('kien_name')
    if not name:
        name = "Unknown Institution"
    # Get type code - KIEN entries are type I (Intangible Heritage) or T (Taste/Smell)
    type_code = 'I'  # Default for KIEN
    if 'original_entry' in entry and 'type' in entry['original_entry']:
        types = entry['original_entry']['type']
        # 'type' may be a non-empty list (first element wins) or a bare string.
        if isinstance(types, list) and types:
            type_code = types[0]
        elif isinstance(types, str):
            type_code = types
    # Get location data; region "00" means "unresolved".
    city = None
    region_code = "00"
    geonames_id = None
    location_resolution = None
    # Try coordinates first
    # NOTE(review): only the first location entry is considered — confirm
    # entries never carry multiple meaningful locations.
    lat, lon = None, None
    if 'locations' in entry and entry['locations']:
        loc = entry['locations'][0]
        lat = loc.get('latitude')
        lon = loc.get('longitude')
        city = loc.get('city')
    # Reverse geocode if we have coordinates
    if lat is not None and lon is not None:
        geo_result = reverse_geocode(lat, lon, db_path)
        if geo_result:
            # The geocoded settlement name overrides any city string
            # present on the entry itself.
            city = geo_result['city']
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'REVERSE_GEOCODE',
                'geonames_id': geonames_id,
                'geonames_name': city,
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
                'source_coordinates': {'latitude': lat, 'longitude': lon},
                'distance_km': geo_result['distance_km'],
            }
    # If we have a city name but no geocode result, look it up
    if city and not location_resolution:
        geo_result = lookup_city_by_name(city, db_path)
        if geo_result:
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'NAME_LOOKUP',
                'geonames_id': geonames_id,
                'geonames_name': geo_result['city'],
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
            }
        else:
            # City string could not be matched in GeoNames: keep the raw
            # text and flag the entry for manual review.
            location_resolution = {
                'method': 'TEXT_FALLBACK',
                'city_name': city,
                'needs_review': True,
            }
    return {
        'name': name,
        'type_code': type_code,
        'city': city,
        'city_code': get_city_code(city) if city else "XXX",
        'region_code': region_code,
        'country_code': 'NL',
        'geonames_id': geonames_id,
        'location_resolution': location_resolution,
    }
def generate_ghcid(data: dict) -> Tuple[str, dict]:
    """Generate base GHCID and all identifier formats.

    The base GHCID is the human-readable string
    COUNTRY-REGION-CITY-TYPE-ABBREV; the returned dict holds the
    individual components.
    """
    abbrev = extract_abbreviation_from_name(data['name'])
    parts = (
        data['country_code'],
        data['region_code'],
        data['city_code'],
        data['type_code'],
        abbrev,
    )
    base_ghcid = "{}-{}-{}-{}-{}".format(*parts)
    components = {
        'country': parts[0],
        'region': parts[1],
        'city': parts[2],
        'type': parts[3],
        'abbrev': abbrev,
    }
    return base_ghcid, components
def generate_identifier_formats(final_ghcid: str) -> dict:
    """Generate all 4 identifier formats from final GHCID string.

    Produces two deterministic UUIDs (v5/SHA-1 and v8/SHA-256), a 64-bit
    unsigned numeric id, and a random time-ordered UUID v7 record id.
    """
    # Deterministic UUID v5 (SHA-1) in the GHCID namespace.
    name_uuid = uuid.uuid5(GHCID_NAMESPACE, final_ghcid)
    # Deterministic UUID v8 carrying the first 16 bytes of a SHA-256 digest.
    digest = hashlib.sha256(final_ghcid.encode('utf-8')).digest()
    v8_bytes = bytearray(digest[:16])
    v8_bytes[6] = (v8_bytes[6] & 0x0F) | 0x80  # set version nibble to 8
    v8_bytes[8] = (v8_bytes[8] & 0x3F) | 0x80  # set RFC 4122 variant bits
    sha256_uuid = uuid.UUID(bytes=bytes(v8_bytes))
    return {
        'ghcid_uuid': str(name_uuid),
        'ghcid_uuid_sha256': str(sha256_uuid),
        # Unsigned 64-bit integer from the first 8 digest bytes.
        'ghcid_numeric': int.from_bytes(digest[:8], byteorder='big', signed=False),
        # Per-record id: random and time-ordered, NOT derived from the GHCID.
        'record_id': str(generate_uuid_v7()),
    }
def process_kien_entries(entries_dir: Path, db_path: Path, dry_run: bool = False) -> dict:
    """Process KIEN entries and generate GHCIDs.

    Five phases: (1) load YAML entries 1674-1860 and resolve locations,
    (2) detect base-GHCID collisions, (3) resolve collisions with name
    suffixes, (4) build GHCID blocks and identifier lists in memory,
    (5) write the updated YAML files and a collision report (skipped
    entirely in dry-run mode).

    Args:
        entries_dir: Directory containing ``<index>_*.yaml`` entry files.
        db_path: Path to the GeoNames SQLite database.
        dry_run: When True, compute everything but write no files.

    Returns:
        Stats dict (counts per phase plus a list of error strings).
    """
    stats = {
        'total': 0,
        'processed': 0,
        'with_location': 0,
        'without_location': 0,
        'already_has_ghcid': 0,
        'collisions': 0,
        'collision_groups': 0,
        'files_updated': 0,
        'errors': [],
    }
    # Single timestamp shared by every record written in this run.
    timestamp = datetime.now(timezone.utc).isoformat()
    # Find KIEN entries (1674-1860)
    kien_files = []
    for f in entries_dir.glob("*.yaml"):
        # Extract entry index from filename
        match = re.match(r'^(\d+)_', f.name)
        if match:
            idx = int(match.group(1))
            if 1674 <= idx <= 1860:
                kien_files.append(f)

    def get_entry_index(filepath: Path) -> int:
        # Sort key: numeric filename prefix (0 when the pattern is absent).
        match = re.match(r'^(\d+)_', filepath.name)
        return int(match.group(1)) if match else 0

    kien_files.sort(key=get_entry_index)
    stats['total'] = len(kien_files)
    print(f"Found {len(kien_files)} KIEN entries")
    # Phase 1: Load entries and extract data
    print("\nPhase 1: Loading entries and extracting location data...")
    entries_data = []
    for filepath in kien_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                continue
            # Check if already has GHCID — such entries are skipped, not
            # regenerated.
            if 'ghcid' in entry and entry['ghcid'].get('ghcid_current'):
                stats['already_has_ghcid'] += 1
                continue
            # Extract data
            data = extract_entry_data(entry, db_path)
            # Entries without a resolvable city cannot get a GHCID.
            if not data['city']:
                stats['without_location'] += 1
                continue
            stats['with_location'] += 1
            # Generate base GHCID
            base_ghcid, components = generate_ghcid(data)
            entries_data.append({
                'filepath': filepath,
                'entry': entry,
                'data': data,
                'base_ghcid': base_ghcid,
                'components': components,
            })
        except Exception as e:
            # Collect per-file errors; one bad file must not abort the batch.
            stats['errors'].append(f"{filepath.name}: {str(e)}")
    print(f" Entries with location: {stats['with_location']}")
    print(f" Entries without location: {stats['without_location']}")
    print(f" Already have GHCID: {stats['already_has_ghcid']}")
    # Phase 2: Detect collisions
    # NOTE(review): collisions are only detected within this KIEN batch,
    # not against GHCIDs already assigned elsewhere in the dataset.
    print("\nPhase 2: Detecting collisions...")
    collision_groups = defaultdict(list)
    for ed in entries_data:
        collision_groups[ed['base_ghcid']].append(ed)
    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            stats['collision_groups'] += 1
            stats['collisions'] += len(group)
    print(f" Collision groups: {stats['collision_groups']}")
    print(f" Entries with collisions: {stats['collisions']}")
    # Phase 3: Resolve collisions
    print("\nPhase 3: Resolving collisions...")
    collision_report = []
    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            # All get name suffixes
            # NOTE(review): two institutions with identical names would
            # still collide after suffixing — confirm this cannot occur.
            collision_report.append({
                'base_ghcid': base_ghcid,
                'count': len(group),
                'institutions': [ed['data']['name'] for ed in group],
            })
            for ed in group:
                name_suffix = generate_name_suffix(ed['data']['name'])
                ed['final_ghcid'] = f"{base_ghcid}-{name_suffix}"
                ed['had_collision'] = True
        else:
            ed = group[0]
            ed['final_ghcid'] = base_ghcid
            ed['had_collision'] = False
    # Phase 4: Generate identifiers and update entries
    print("\nPhase 4: Generating identifiers and updating entries...")
    for ed in entries_data:
        final_ghcid = ed['final_ghcid']
        ids = generate_identifier_formats(final_ghcid)
        # Create GHCID block
        ghcid_block = {
            'ghcid_current': final_ghcid,
            'ghcid_original': final_ghcid,
            'ghcid_uuid': ids['ghcid_uuid'],
            'ghcid_uuid_sha256': ids['ghcid_uuid_sha256'],
            'ghcid_numeric': ids['ghcid_numeric'],
            'record_id': ids['record_id'],
            'generation_timestamp': timestamp,
            'ghcid_history': [
                {
                    'ghcid': final_ghcid,
                    'ghcid_numeric': ids['ghcid_numeric'],
                    'valid_from': timestamp,
                    'valid_to': None,
                    'reason': 'Initial GHCID assignment (KIEN batch import December 2025)'
                    + (' - name suffix added to resolve collision' if ed.get('had_collision') else ''),
                }
            ],
        }
        # Add location resolution metadata
        if ed['data'].get('location_resolution'):
            ghcid_block['location_resolution'] = ed['data']['location_resolution']
        if ed['data'].get('geonames_id'):
            ghcid_block['geonames_id'] = ed['data']['geonames_id']
        if ed.get('had_collision'):
            ghcid_block['collision_resolved'] = True
            ghcid_block['base_ghcid_before_collision'] = ed['base_ghcid']
        # Update entry
        entry = ed['entry']
        entry['ghcid'] = ghcid_block
        # Add to identifiers list
        if 'identifiers' not in entry:
            entry['identifiers'] = []
        # Remove existing GHCID identifiers so re-runs do not duplicate them.
        entry['identifiers'] = [
            i for i in entry['identifiers']
            if i.get('identifier_scheme') not in ['GHCID', 'GHCID_NUMERIC', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'RECORD_ID']
        ]
        # Add new identifiers
        entry['identifiers'].extend([
            {'identifier_scheme': 'GHCID', 'identifier_value': final_ghcid},
            {'identifier_scheme': 'GHCID_UUID', 'identifier_value': ids['ghcid_uuid'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid']}"},
            {'identifier_scheme': 'GHCID_UUID_SHA256', 'identifier_value': ids['ghcid_uuid_sha256'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid_sha256']}"},
            {'identifier_scheme': 'GHCID_NUMERIC', 'identifier_value': str(ids['ghcid_numeric'])},
            {'identifier_scheme': 'RECORD_ID', 'identifier_value': ids['record_id'], 'identifier_url': f"urn:uuid:{ids['record_id']}"},
        ])
        ed['entry'] = entry
        stats['processed'] += 1
    # Phase 5: Write updated entries
    if not dry_run:
        print("\nPhase 5: Writing updated entries...")
        for ed in entries_data:
            try:
                with open(ed['filepath'], 'w', encoding='utf-8') as f:
                    yaml.dump(ed['entry'], f, default_flow_style=False, allow_unicode=True, sort_keys=False)
                stats['files_updated'] += 1
            except Exception as e:
                stats['errors'].append(f"Write error {ed['filepath'].name}: {str(e)}")
        print(f" Updated {stats['files_updated']} files")
        # Write collision report (sibling of the entries directory).
        if collision_report:
            report_path = entries_dir.parent / "kien_ghcid_collision_report.json"
            report = {
                'generation_timestamp': timestamp,
                'total_kien_entries': stats['total'],
                'entries_with_ghcid': stats['processed'],
                'collision_groups': stats['collision_groups'],
                'entries_with_collisions': stats['collisions'],
                'collisions': collision_report,
            }
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            print(f" Collision report: {report_path}")
    else:
        print("\nPhase 5: DRY RUN - no files written")
    return stats
def main():
    """CLI entry point: generate GHCIDs for all KIEN entries."""
    parser = argparse.ArgumentParser(description="Generate GHCIDs for KIEN entries")
    parser.add_argument('--dry-run', action='store_true', help="Preview changes without writing")
    args = parser.parse_args()

    entries_dir = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
    db_path = PROJECT_ROOT / "data" / "reference" / "geonames.db"

    banner = "=" * 70
    print(banner)
    print("KIEN HERITAGE CUSTODIAN GHCID GENERATION")
    print(banner)
    print(f"Entries directory: {entries_dir}")
    print(f"GeoNames database: {db_path}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Fail fast when a required input is missing.
    for path, label in ((entries_dir, "Entries directory"), (db_path, "GeoNames database")):
        if not path.exists():
            print(f"ERROR: {label} not found: {path}")
            sys.exit(1)

    stats = process_kien_entries(entries_dir, db_path, dry_run=args.dry_run)

    print()
    print(banner)
    print("SUMMARY")
    print(banner)
    for label, key in (
        ("Total KIEN entries", 'total'),
        ("Already have GHCID", 'already_has_ghcid'),
        ("Entries with location", 'with_location'),
        ("Entries without location", 'without_location'),
        ("GHCIDs generated", 'processed'),
        ("Collision groups", 'collision_groups'),
        ("Entries with collisions", 'collisions'),
        ("Files updated", 'files_updated'),
    ):
        print(f"{label}: {stats[key]}")

    errors = stats['errors']
    if errors:
        # Show at most five errors, then a count of the remainder.
        print(f"\nErrors ({len(errors)}):")
        for err in errors[:5]:
            print(f" - {err}")
        if len(errors) > 5:
            print(f" ... and {len(errors) - 5} more")

    print()
    if args.dry_run:
        print("DRY RUN COMPLETE - No files modified")
    else:
        print("GHCID GENERATION COMPLETE")
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()