glam/scripts/enrich_nde_entries_ghcid.py
2025-12-01 00:37:24 +01:00

689 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Enrich NDE Heritage Institution Entries with GHCID Persistent Identifiers.
This script:
1. Loads all YAML files from data/nde/enriched/entries/
2. Extracts location data (city, region, coordinates)
3. Generates base GHCIDs using NL-REGION-CITY-TYPE-ABBREV format
4. Detects collisions and applies First Batch rule (all get name suffixes)
5. Generates all 4 identifier formats:
- Human-readable GHCID string
- UUID v5 (SHA-1, RFC 4122 compliant) - PRIMARY
- UUID v8 (SHA-256, SOTA cryptographic strength) - Future-proof
- Numeric (64-bit integer for database PKs)
6. Adds GHCID fields to each entry
7. Generates collision statistics report
## GHCID Format
Base: NL-{Region}-{City}-{Type}-{Abbreviation}
With collision suffix: NL-{Region}-{City}-{Type}-{Abbreviation}-{name_suffix}
## Collision Resolution (First Batch Rule)
Since this is a batch import (all entries processed together), when multiple
institutions generate the same base GHCID:
- ALL colliding institutions receive native language name suffixes
- Name suffix: snake_case of institution name
Example:
- Two societies with NL-OV-ZWO-S-HK both become:
- NL-OV-ZWO-S-HK-historische_kring_zwolle
- NL-OV-ZWO-S-HK-heemkundige_kring_zwolle
Usage:
python scripts/enrich_nde_entries_ghcid.py [--dry-run]
Options:
--dry-run Preview changes without writing to files
"""
import argparse
import hashlib
import json
import re
import sys
import unicodedata
import uuid
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import yaml
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import (
GHCIDComponents,
GHCIDGenerator,
InstitutionType,
extract_abbreviation_from_name,
normalize_city_name,
)
# Dutch province -> ISO 3166-2:NL region code (code only, no "NL-" prefix).
# Keys are matched after lowercasing and NFD accent-stripping in
# get_region_code(), so lookups through that function are accent-insensitive;
# accented spellings like "fryslân" only matter for direct dict access.
DUTCH_PROVINCE_CODES = {
    # Standard names plus English and spacing/hyphenation variants.
    "drenthe": "DR",
    "flevoland": "FL",
    "friesland": "FR",
    "fryslan": "FR",
    "fryslân": "FR",
    "gelderland": "GE",
    "groningen": "GR",
    "limburg": "LI",
    "noord-brabant": "NB",
    "north brabant": "NB",
    "noord brabant": "NB",
    "noord-holland": "NH",
    "north holland": "NH",
    "noord holland": "NH",
    "overijssel": "OV",
    "utrecht": "UT",
    "zeeland": "ZE",
    "zuid-holland": "ZH",
    "south holland": "ZH",
    "zuid holland": "ZH",
}
# Institution type code mapping (from the original entry's 'type' field).
# Currently an identity map over the known one-letter codes: it declares the
# valid code set in one place and lets callers coerce unknown codes with
# TYPE_CODE_MAP.get(code, 'U').
TYPE_CODE_MAP = {
    "G": "G",  # Gallery
    "L": "L",  # Library
    "A": "A",  # Archive
    "M": "M",  # Museum
    "O": "O",  # Official Institution
    "R": "R",  # Research Center
    "C": "C",  # Corporation
    "U": "U",  # Unknown
    "B": "B",  # Botanical/Zoo
    "E": "E",  # Education Provider
    "S": "S",  # Collecting Society
    "P": "P",  # Personal Collection
    "F": "F",  # Features (monuments, etc.)
    "I": "I",  # Intangible Heritage Group
    "X": "X",  # Mixed
    "H": "H",  # Holy Sites
    "D": "D",  # Digital Platform
    "N": "N",  # NGO
    "T": "T",  # Taste/Smell Heritage
}
def get_region_code(region_name: Optional[str]) -> str:
    """Map a Dutch province name (Dutch or English) to its ISO 3166-2 code.

    The name is lowercased and stripped of diacritics (NFD decomposition,
    combining marks dropped) before the lookup, so e.g. "Fryslân" and
    "Friesland" both resolve to "FR".

    Args:
        region_name: Province/region name, or None.

    Returns:
        2-letter region code or "00" if not found.
    """
    if not region_name:
        return "00"
    decomposed = unicodedata.normalize('NFD', region_name.lower())
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    return DUTCH_PROVINCE_CODES.get(without_marks.strip(), "00")
def get_city_code(city_name: str) -> str:
    """Derive a 3-letter uppercase city code from a city name.

    Rules (in priority order):
    1. Single word: first 3 letters uppercase.
    2. Leading Dutch article/preposition (de, het, den, ...): article's
       first letter plus the first 2 letters of the next word.
    3. Multi-word: initial letter of each of the first 3 words.

    The result is padded/truncated to exactly 3 characters and any
    non-A-Z character is replaced with 'X'. Returns "XXX" when the name
    is empty or normalizes to nothing.

    Args:
        city_name: City name.

    Returns:
        3-letter uppercase city code.
    """
    if not city_name:
        return "XXX"
    # Accent/special-character handling is delegated to the shared helper.
    words = normalize_city_name(city_name).split()
    if not words:
        return "XXX"
    # Dutch articles and prepositions that should not dominate the code.
    particles = {'de', 'het', 'den', "'s", 'op', 'aan', 'bij', 'ter'}
    if len(words) == 1:
        raw = words[0][:3]
    elif words[0].lower() in particles:
        raw = words[0][0] + words[1][:2]
    else:
        raw = ''.join(word[0] for word in words[:3])
    # Force exactly three characters, padding with 'X' when short.
    code = raw.upper().ljust(3, 'X')[:3]
    # Replace anything outside A-Z (digits, apostrophes, ...) with 'X'.
    return re.sub(r'[^A-Z]', 'X', code)
def generate_name_suffix(institution_name: str) -> str:
    """Convert an institution name into a snake_case collision suffix.

    Diacritics are stripped via NFD decomposition, punctuation is removed,
    and whitespace/hyphens/slashes become single underscores. The result is
    lowercase ASCII, at most 50 characters, and never empty (falls back to
    "unknown").

    Args:
        institution_name: Full institution name (native language).

    Returns:
        snake_case suffix (e.g., "historische_kring_zwolle").
    """
    if not institution_name:
        return "unknown"
    # Decompose and drop combining marks to remove accents.
    decomposed = unicodedata.normalize('NFD', institution_name)
    text = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    text = text.lower()
    # Strip punctuation, turn separators into underscores, then keep only
    # [a-z0-9_] and collapse/trim underscore runs.
    text = re.sub(r"[''`\",.:;!?()[\]{}]", '', text)
    text = re.sub(r'[\s\-/]+', '_', text)
    text = re.sub(r'[^a-z0-9_]', '', text)
    suffix = re.sub(r'_+', '_', text).strip('_')
    # Cap the suffix length at 50 characters, avoiding a trailing underscore.
    if len(suffix) > 50:
        suffix = suffix[:50].rstrip('_')
    return suffix if suffix else "unknown"
def extract_entry_data(entry: dict) -> dict:
"""
Extract relevant data from an entry for GHCID generation.
Looks in multiple sources for location data:
1. locations[] array (if already enriched)
2. original_entry.plaatsnaam_bezoekadres (NDE CSV city field)
3. google_maps_enrichment.address / city
4. museum_register_enrichment.province
5. wikidata_enrichment.wikidata_claims.location
Args:
entry: Entry dictionary from YAML
Returns:
Dict with: name, type_code, city, region, wikidata_id
"""
# Get institution name
name = None
if 'original_entry' in entry:
name = entry['original_entry'].get('organisatie')
if not name and 'wikidata_enrichment' in entry:
name = entry['wikidata_enrichment'].get('wikidata_label_nl')
if not name:
name = entry['wikidata_enrichment'].get('wikidata_label_en')
if not name:
name = "Unknown Institution"
# Get institution type
type_codes = []
if 'original_entry' in entry and 'type' in entry['original_entry']:
types = entry['original_entry']['type']
if isinstance(types, list):
type_codes = types
elif isinstance(types, str):
type_codes = [types]
# Use first type, default to U (Unknown)
type_code = type_codes[0] if type_codes else 'U'
# Get location - try multiple sources
city = None
region = None
# Source 1: locations[] array (already enriched)
if 'locations' in entry and entry['locations']:
loc = entry['locations'][0]
city = loc.get('city')
region = loc.get('region')
# Source 2: original_entry.plaatsnaam_bezoekadres (NDE CSV)
if not city and 'original_entry' in entry:
city = entry['original_entry'].get('plaatsnaam_bezoekadres')
# Source 3: google_maps_enrichment
if not city and 'google_maps_enrichment' in entry:
gm = entry['google_maps_enrichment']
# Try to extract city from address
address = gm.get('address', '')
if address:
# Dutch addresses: "Street Nr, Postcode City"
# Try to extract city from last part
parts = address.split(',')
if len(parts) >= 2:
last_part = parts[-1].strip()
# Remove postcode (4 digits + 2 letters)
import re
city_match = re.sub(r'^\d{4}\s*[A-Z]{2}\s*', '', last_part)
if city_match:
city = city_match
# Also try 'city' field if present
if not city:
city = gm.get('city')
# Source 4: museum_register_enrichment.province (for region)
if not region and 'museum_register_enrichment' in entry:
region = entry['museum_register_enrichment'].get('province')
# Source 5: wikidata_enrichment.wikidata_claims.location
if not city and 'wikidata_enrichment' in entry:
claims = entry['wikidata_enrichment'].get('wikidata_claims', {})
if 'location' in claims:
loc_data = claims['location']
if isinstance(loc_data, dict):
city = loc_data.get('label_en') or loc_data.get('label_nl')
# Source 6: Try wikidata description for city hint
if not city and 'wikidata_enrichment' in entry:
desc_nl = entry['wikidata_enrichment'].get('wikidata_description_nl', '')
# Try to extract city from "museum in [City], Nederland"
import re
city_match = re.search(r'in\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?),?\s*(?:Nederland|Netherlands)', desc_nl)
if city_match:
city = city_match.group(1)
# Get Wikidata ID
wikidata_id = None
if 'wikidata_enrichment' in entry:
wikidata_id = entry['wikidata_enrichment'].get('wikidata_entity_id')
if not wikidata_id and 'original_entry' in entry:
wikidata_id = entry['original_entry'].get('wikidata_id')
return {
'name': name,
'type_code': TYPE_CODE_MAP.get(type_code, 'U'),
'city': city,
'region': region,
'wikidata_id': wikidata_id,
}
def generate_base_ghcid(data: dict) -> Tuple[str, GHCIDComponents]:
    """Build the collision-unaware base GHCID for one institution.

    Args:
        data: Dict with name, type_code, city, region

    Returns:
        Tuple of (base_ghcid_string, GHCIDComponents)
    """
    # Abbreviation falls back to a generic placeholder when the name
    # yields nothing usable.
    abbreviation = extract_abbreviation_from_name(data['name']) or "INST"
    city = data['city']
    components = GHCIDComponents(
        country_code="NL",
        region_code=get_region_code(data['region']),
        city_locode=get_city_code(city) if city else "XXX",
        institution_type=data['type_code'],
        abbreviation=abbreviation,
        # Collisions are resolved via name suffixes, not Wikidata QIDs.
        wikidata_qid=None,
    )
    return components.to_string(), components
def process_entries(entries_dir: Path, dry_run: bool = False) -> dict:
"""
Process all entry files and generate GHCIDs.
Args:
entries_dir: Path to entries directory
dry_run: If True, don't write changes
Returns:
Statistics dictionary
"""
stats = {
'total': 0,
'success': 0,
'skipped_no_location': 0,
'skipped_not_custodian': 0,
'collisions': 0,
'collision_groups': 0,
'files_updated': 0,
'errors': [],
}
# Timestamp for this batch
generation_timestamp = datetime.now(timezone.utc).isoformat()
# Phase 1: Load all entries and generate base GHCIDs
print("Phase 1: Loading entries and generating base GHCIDs...")
entries_data = [] # List of (filepath, entry, extracted_data, base_ghcid, components)
yaml_files = sorted(entries_dir.glob("*.yaml"))
stats['total'] = len(yaml_files)
for filepath in yaml_files:
try:
with open(filepath, 'r', encoding='utf-8') as f:
entry = yaml.safe_load(f)
if not entry:
continue
# Check if NOT_CUSTODIAN (skip these)
if entry.get('google_maps_status') == 'NOT_CUSTODIAN':
stats['skipped_not_custodian'] += 1
continue
# Extract data
data = extract_entry_data(entry)
# Check if we have location data
if not data['city']:
stats['skipped_no_location'] += 1
continue
# Generate base GHCID
base_ghcid, components = generate_base_ghcid(data)
entries_data.append({
'filepath': filepath,
'entry': entry,
'data': data,
'base_ghcid': base_ghcid,
'components': components,
})
except Exception as e:
stats['errors'].append(f"{filepath.name}: {str(e)}")
print(f" Loaded {len(entries_data)} entries with location data")
print(f" Skipped {stats['skipped_no_location']} entries without city")
print(f" Skipped {stats['skipped_not_custodian']} NOT_CUSTODIAN entries")
# Phase 2: Detect collisions
print("\nPhase 2: Detecting GHCID collisions...")
collision_groups = defaultdict(list)
for ed in entries_data:
collision_groups[ed['base_ghcid']].append(ed)
# Count collisions
for base_ghcid, group in collision_groups.items():
if len(group) > 1:
stats['collision_groups'] += 1
stats['collisions'] += len(group)
print(f" Found {stats['collision_groups']} collision groups ({stats['collisions']} entries)")
# Phase 3: Resolve collisions and generate final GHCIDs
print("\nPhase 3: Resolving collisions and generating final GHCIDs...")
collision_report = []
for base_ghcid, group in collision_groups.items():
if len(group) > 1:
# COLLISION: Apply First Batch rule - ALL get name suffixes
collision_report.append({
'base_ghcid': base_ghcid,
'count': len(group),
'institutions': [ed['data']['name'] for ed in group],
})
for ed in group:
# Generate name suffix
name_suffix = generate_name_suffix(ed['data']['name'])
ed['final_ghcid'] = f"{base_ghcid}-{name_suffix}"
ed['had_collision'] = True
else:
# No collision: use base GHCID
ed = group[0]
ed['final_ghcid'] = base_ghcid
ed['had_collision'] = False
# Phase 4: Generate all identifier formats and update entries
print("\nPhase 4: Generating identifier formats and updating entries...")
for ed in entries_data:
final_ghcid = ed['final_ghcid']
# Create final components with the resolved GHCID string
# We need to parse it back or generate UUIDs directly
# For simplicity, hash the final GHCID string directly
import hashlib
import uuid
# GHCID UUID v5 Namespace
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
# Generate UUID v5 (SHA-1)
ghcid_uuid = uuid.uuid5(GHCID_NAMESPACE, final_ghcid)
# Generate UUID v8 (SHA-256)
hash_bytes = hashlib.sha256(final_ghcid.encode('utf-8')).digest()
uuid_bytes = bytearray(hash_bytes[:16])
uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80 # Version 8
uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80 # Variant RFC 4122
ghcid_uuid_sha256 = uuid.UUID(bytes=bytes(uuid_bytes))
# Generate numeric (64-bit)
ghcid_numeric = int.from_bytes(hash_bytes[:8], byteorder='big', signed=False)
# Generate record ID (UUID v7 - time-ordered, non-deterministic)
record_id = GHCIDComponents.generate_uuid_v7()
# Create GHCID block for entry
ghcid_block = {
'ghcid_current': final_ghcid,
'ghcid_original': final_ghcid, # Same for first assignment
'ghcid_uuid': str(ghcid_uuid),
'ghcid_uuid_sha256': str(ghcid_uuid_sha256),
'ghcid_numeric': ghcid_numeric,
'record_id': str(record_id),
'generation_timestamp': generation_timestamp,
'ghcid_history': [
{
'ghcid': final_ghcid,
'ghcid_numeric': ghcid_numeric,
'valid_from': generation_timestamp,
'valid_to': None,
'reason': 'Initial GHCID assignment (NDE batch import December 2025)'
+ (' - name suffix added to resolve collision' if ed.get('had_collision') else ''),
}
],
}
# Add collision info if applicable
if ed.get('had_collision'):
ghcid_block['collision_resolved'] = True
ghcid_block['base_ghcid_before_collision'] = ed['base_ghcid']
# Update entry
entry = ed['entry']
entry['ghcid'] = ghcid_block
# Also add to identifiers list
if 'identifiers' not in entry:
entry['identifiers'] = []
# Remove any existing GHCID identifiers
entry['identifiers'] = [
i for i in entry['identifiers']
if i.get('identifier_scheme') not in ['GHCID', 'GHCID_NUMERIC', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'RECORD_ID']
]
# Add new GHCID identifiers
entry['identifiers'].extend([
{
'identifier_scheme': 'GHCID',
'identifier_value': final_ghcid,
},
{
'identifier_scheme': 'GHCID_UUID',
'identifier_value': str(ghcid_uuid),
'identifier_url': f'urn:uuid:{ghcid_uuid}',
},
{
'identifier_scheme': 'GHCID_UUID_SHA256',
'identifier_value': str(ghcid_uuid_sha256),
'identifier_url': f'urn:uuid:{ghcid_uuid_sha256}',
},
{
'identifier_scheme': 'GHCID_NUMERIC',
'identifier_value': str(ghcid_numeric),
},
{
'identifier_scheme': 'RECORD_ID',
'identifier_value': str(record_id),
'identifier_url': f'urn:uuid:{record_id}',
},
])
ed['entry'] = entry
stats['success'] += 1
# Phase 5: Write updated entries
if not dry_run:
print("\nPhase 5: Writing updated entry files...")
for ed in entries_data:
filepath = ed['filepath']
entry = ed['entry']
try:
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
stats['files_updated'] += 1
except Exception as e:
stats['errors'].append(f"Write error {filepath.name}: {str(e)}")
print(f" Updated {stats['files_updated']} files")
else:
print("\nPhase 5: DRY RUN - no files written")
# Phase 6: Generate collision report
print("\nPhase 6: Generating collision report...")
if collision_report:
report_path = entries_dir.parent / "ghcid_collision_report.json"
report = {
'generation_timestamp': generation_timestamp,
'total_entries': stats['total'],
'entries_with_ghcid': stats['success'],
'collision_groups': stats['collision_groups'],
'entries_with_collisions': stats['collisions'],
'collision_resolution_strategy': 'first_batch_all_get_name_suffix',
'collisions': collision_report,
}
if not dry_run:
with open(report_path, 'w', encoding='utf-8') as f:
json.dump(report, f, indent=2, ensure_ascii=False)
print(f" Collision report written to: {report_path}")
else:
print(f" Would write collision report to: {report_path}")
return stats
def main():
    """Parse CLI options, run the GHCID enrichment, and print a summary."""
    parser = argparse.ArgumentParser(description="Enrich NDE entries with GHCID identifiers")
    parser.add_argument('--dry-run', action='store_true', help="Preview changes without writing")
    args = parser.parse_args()

    # Resolve the data directory relative to the repository root.
    repo_root = Path(__file__).parent.parent
    entries_dir = repo_root / "data" / "nde" / "enriched" / "entries"

    divider = "=" * 70
    print(divider)
    print("NDE HERITAGE INSTITUTION GHCID ENRICHMENT")
    print(divider)
    print(f"Entries directory: {entries_dir}")
    print(f"Dry run: {args.dry_run}")
    print()

    if not entries_dir.exists():
        print(f"ERROR: Entries directory not found: {entries_dir}")
        sys.exit(1)

    stats = process_entries(entries_dir, dry_run=args.dry_run)

    # Summary block.
    print()
    print(divider)
    print("GHCID ENRICHMENT SUMMARY")
    print(divider)
    print(f"Total entry files: {stats['total']}")
    print(f"Entries with GHCID generated: {stats['success']}")
    print(f"Skipped (no city): {stats['skipped_no_location']}")
    print(f"Skipped (NOT_CUSTODIAN): {stats['skipped_not_custodian']}")
    print(f"Collision groups: {stats['collision_groups']}")
    print(f"Entries with collisions: {stats['collisions']}")
    print(f"Files updated: {stats['files_updated']}")

    errors = stats['errors']
    if errors:
        # Show at most ten errors to keep the summary readable.
        print(f"\nErrors ({len(errors)}):")
        for message in errors[:10]:
            print(f" - {message}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    print()
    print(divider)
    if args.dry_run:
        print("DRY RUN COMPLETE - No files were modified")
    else:
        print("GHCID ENRICHMENT COMPLETE")
    print(divider)
# Script entry point: run the enrichment when executed directly.
if __name__ == "__main__":
    main()