# glam/scripts/fix_collision_victims.py

#!/usr/bin/env python3
"""
Fix GHCID collision victim files.

These files have a trailing dash in their filename (e.g., NL-DR-ASS-L-BD-.yaml)
indicating they were collision victims whose internal GHCID was incorrectly set
to their collision partner's GHCID instead of getting their own unique GHCID.

This script:
1. Reads the institution's real name from original_entry.organisatie
2. Generates a proper name suffix from that name
3. Creates a new unique GHCID with the proper suffix
4. Regenerates all GHCID-derived identifiers (UUID, numeric)
5. Updates the file with correct identifiers
6. Renames the file to match the new GHCID
"""

import hashlib
import re
import shutil
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml

# Inputs for the name-based GHCID UUIDs: the URL prefix that is prepended to a
# GHCID, and the UUID namespace the resulting URL is hashed under.
GHCID_URL_PREFIX = "https://glam.registry/"
GHCID_NAMESPACE = uuid.NAMESPACE_URL

# Words to skip when generating abbreviations, grouped by kind.
# Words shared by Dutch and English ('in', 'of') are listed once.
SKIP_WORDS = {
    # Dutch articles, prepositions and conjunctions
    'de', 'het', 'een', 'den', 'der', 'des', 's',
    'van', 'voor', 'in', 'op', 'te', 'aan', 'bij', 'met', 'naar', 'om',
    'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
    # English articles and prepositions
    'the', 'a', 'an', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as',
    # Generic organisation and institution-type words
    'stichting', 'vereniging', 'foundation',
    'museum', 'bibliotheek', 'archief', 'collectie',
}


def normalize_diacritics(text: str) -> str:
    """Strip combining marks so accented letters collapse to their base letter.

    The text is decomposed with NFD and every character whose Unicode
    category is ``Mn`` (nonspacing mark) is dropped. All other characters are
    passed through unchanged, including non-ASCII letters that have no
    canonical decomposition.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def generate_name_suffix(native_name: str) -> str:
    """Turn an institution name into a lowercase snake_case suffix.

    The name is stripped of diacritics and lowercased, then passed through a
    fixed sequence of regex substitutions. Leading and trailing underscores
    are removed at the end, so the result is empty when the name contains no
    ASCII letters or digits after normalization.

    Examples:
        "Biblionet Drenthe POI" → "biblionet_drenthe_poi"
        "Fries Verzetsmuseum" → "fries_verzetsmuseum"
        "Musée d'Orsay" → "musee_dorsay"
    """
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r"[''`\",.:;!?()[\]{}]", ''),   # drop quotes and punctuation
        (r'[\s\-]+', '_'),               # whitespace / hyphen runs -> "_"
        (r'[^a-z0-9_]', ''),             # drop anything else unexpected
        (r'_+', '_'),                    # collapse repeated underscores
    )

    suffix = normalize_diacritics(native_name).lower()
    for pattern, replacement in substitutions:
        suffix = re.sub(pattern, replacement, suffix)

    return suffix.strip('_')


def generate_ghcid_uuid(ghcid: str) -> str:
    """Return the name-based UUID (version 5) for *ghcid* as a string.

    The hashed name is ``GHCID_URL_PREFIX`` followed by the GHCID, within the
    ``GHCID_NAMESPACE`` namespace, so the same GHCID always maps to the same
    UUID.
    """
    ghcid_url = "{}{}".format(GHCID_URL_PREFIX, ghcid)
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_url)
    return str(derived)


def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Return the value stored as ``ghcid_uuid_sha256`` for *ghcid*.

    The value is a name-based UUID built with ``uuid.uuid5`` from
    ``GHCID_URL_PREFIX`` + ``"sha256/"`` + the GHCID, within
    ``GHCID_NAMESPACE``.

    NOTE(review): despite the function name, this is a version-5 (SHA-1)
    UUID, not a SHA-256 based UUID v8 -- confirm that this matches how the
    rest of the project derives ``ghcid_uuid_sha256``.
    """
    namespaced_name = "{}sha256/{}".format(GHCID_URL_PREFIX, ghcid)
    derived = uuid.uuid5(GHCID_NAMESPACE, namespaced_name)
    return str(derived)


def generate_ghcid_numeric(ghcid: str) -> int:
    """Derive an unsigned 64-bit integer ID from *ghcid*.

    The GHCID is encoded to bytes and hashed with SHA-256; the first eight
    digest bytes are read as a big-endian integer (equivalent to parsing the
    first 16 hex characters of the hex digest).
    """
    digest = hashlib.sha256(ghcid.encode()).digest()
    return int.from_bytes(digest[:8], 'big')


def fix_collision_victim(file_path: Path, dry_run: bool = False) -> Optional[Path]:
    """Fix a single collision victim file.

    The new GHCID is the file stem without its trailing dash(es), followed by
    ``-`` and a suffix derived from ``original_entry.organisatie``. The
    ``ghcid`` section is rebuilt around that GHCID (the previous values are
    appended to ``ghcid_history``), GHCID-derived entries in ``identifiers``
    are updated, and the file is renamed to ``<new GHCID>.yaml``.

    All validation (usable institution name, non-empty suffix, free target
    filename) happens before anything is written, so a file that cannot be
    fixed is left untouched on disk.

    Args:
        file_path: Path to the collision victim YAML file
        dry_run: If True, only print what would be done

    Returns:
        New file path after renaming, or None if skipped/failed (a dry run
        always returns None)
    """
    print(f"\n{'='*80}")
    print(f"Processing: {file_path.name}")
    print(f"{'='*80}")

    # Read file. The encoding is explicit so that reading and writing do not
    # depend on the platform default. The broad except is deliberate: one
    # unreadable file must not abort a batch run.
    try:
        with open(file_path, encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f"  ERROR: Could not read file: {e}")
        return None

    if data is None:
        print("  SKIP: File is empty or invalid")
        return None

    if not isinstance(data, dict):
        print("  ERROR: Top-level YAML content is not a mapping")
        return None

    # Get institution name. A key that is present but null yields None from
    # .get(), so the value is type-checked instead of relying on a default.
    original_entry = data.get('original_entry')
    org_name = original_entry.get('organisatie') if isinstance(original_entry, dict) else None
    if not org_name:
        print("  ERROR: No organisatie found in original_entry")
        return None
    if not isinstance(org_name, str):
        # e.g. a purely numeric name that YAML parsed as an int
        org_name = str(org_name)

    print(f"  Institution: {org_name}")

    # Get current GHCID info (tolerate a missing, null or malformed section)
    ghcid_data = data.get('ghcid')
    if not isinstance(ghcid_data, dict):
        ghcid_data = {}
    old_ghcid = ghcid_data.get('ghcid_current', '')
    print(f"  Old GHCID: {old_ghcid}")

    # Extract base GHCID from filename (remove trailing dash)
    base_ghcid = file_path.stem.rstrip('-')
    print(f"  Base GHCID: {base_ghcid}")

    # Generate new name suffix from institution name
    name_suffix = generate_name_suffix(org_name)
    print(f"  Name suffix: {name_suffix}")

    # An empty suffix would reproduce the trailing-dash GHCID/filename that
    # this script is meant to eliminate.
    if not name_suffix:
        print(f"  ERROR: Could not derive a name suffix from '{org_name}'")
        return None

    # Create new GHCID
    new_ghcid = f"{base_ghcid}-{name_suffix}"
    print(f"  New GHCID: {new_ghcid}")

    new_file_path = file_path.parent / f"{new_ghcid}.yaml"

    # Check if this would be the same as old (only filename is wrong)
    if new_ghcid == old_ghcid:
        if file_path.name == new_file_path.name:
            print("  SKIP: GHCID and filename both correct")
            return None

        print("  GHCID correct, but filename wrong - needs rename only")
        if new_file_path.exists():
            print(f"  ERROR: Target file already exists: {new_file_path.name}")
            return None
        if dry_run:
            print(f"  DRY RUN: Would rename to {new_file_path.name}")
            return None

        shutil.move(str(file_path), str(new_file_path))
        print(f"  Renamed: {file_path.name} → {new_file_path.name}")
        return new_file_path

    # Refuse before touching the file: rewriting the content and then failing
    # to rename would leave a half-migrated record under the old filename.
    if new_file_path.exists():
        print(f"  ERROR: Target file already exists: {new_file_path.name}")
        return None

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    print(f"  New UUID: {new_uuid}")
    print(f"  New numeric: {new_numeric}")

    if dry_run:
        print(f"  DRY RUN: Would update file and rename to {new_ghcid}.yaml")
        return None

    # Update GHCID section
    timestamp = datetime.now(timezone.utc).isoformat()

    # Preserve old GHCID in history (a null or malformed history starts fresh)
    previous_history = ghcid_data.get('ghcid_history')
    ghcid_history = list(previous_history) if isinstance(previous_history, list) else []

    # Add history entry for the fix
    ghcid_history.append({
        'ghcid': old_ghcid,
        'ghcid_uuid': ghcid_data.get('ghcid_uuid', ''),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric', 0),
        'valid_from': ghcid_data.get('generated_at', ''),
        'valid_to': timestamp,
        'reason': f"Collision fix: had partner's GHCID, corrected to institution's own GHCID based on name '{org_name}'"
    })

    # NOTE(review): the ghcid section is replaced wholesale, so any keys other
    # than the six written here are dropped -- confirm that this is intended.
    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_uuid': new_uuid,
        'ghcid_uuid_sha256': new_uuid_sha256,
        'ghcid_numeric': new_numeric,
        'generated_at': timestamp,
        'ghcid_history': ghcid_history
    }

    # Update identifiers list. Every GHCID-derived scheme is refreshed so no
    # entry keeps pointing at the collision partner; other schemes are left
    # untouched.
    derived_values = {
        'GHCID': new_ghcid,
        'GHCID_UUID': new_uuid,
        'GHCID_UUID_SHA256': new_uuid_sha256,
        'GHCID_NUMERIC': str(new_numeric),
    }
    identifiers = data.get('identifiers')
    if identifiers is None:
        identifiers = []
    if isinstance(identifiers, list):
        for ident in identifiers:
            if not isinstance(ident, dict):
                continue
            scheme = ident.get('identifier_scheme', '')
            if scheme in derived_values:
                ident['identifier_value'] = derived_values[scheme]
            if scheme == 'GHCID':
                ident['identifier_url'] = f"https://w3id.org/heritage/custodian/{new_ghcid}"
        data['identifiers'] = identifiers
    else:
        print("  WARNING: identifiers is not a list - left unchanged")

    # Serialise first, then write: if dumping fails, the original file has
    # not been truncated yet.
    serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    print("  Updated file content")

    # Rename file to match new GHCID
    shutil.move(str(file_path), str(new_file_path))
    print(f"  Renamed: {file_path.name} → {new_file_path.name}")

    return new_file_path


def main() -> None:
    """Command-line entry point: fix one file or every collision victim file.

    Without ``--file``, all ``NL-*-.yaml`` files (trailing-dash pattern) in
    ``data/custodian`` (relative to the current working directory) are
    processed in sorted order. A summary of fixed, skipped and error/empty
    counts is printed at the end.

    fix_collision_victim() returns None both for skips and for failures, so
    those cannot be told apart here: only zero-byte and inaccessible files
    are counted as errors. In a dry run nothing is counted as fixed.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID collision victim files')
    parser.add_argument('--dry-run', action='store_true', help='Only show what would be done')
    parser.add_argument('--file', type=str, help='Process only this specific file')
    args = parser.parse_args()

    custodian_dir = Path('data/custodian')

    if args.file:
        files = [Path(args.file)]
    else:
        # Find all collision victim files (trailing dash pattern)
        files = sorted(custodian_dir.glob('NL-*-.yaml'))

    print(f"Found {len(files)} collision victim file(s)")

    fixed = 0
    skipped = 0
    errors = 0

    for f in files:
        result = fix_collision_victim(f, dry_run=args.dry_run)
        if result is not None:
            fixed += 1
            continue

        # Not fixed: distinguish empty/inaccessible files from ordinary skips.
        # The file may not exist at all (e.g. a mistyped --file path), which
        # fix_collision_victim has already reported as a read error.
        try:
            size = f.stat().st_size
        except OSError:
            errors += 1
            continue

        if size == 0:
            print(f"\n  EMPTY FILE: {f.name} - should be deleted")
            errors += 1
        else:
            skipped += 1

    print(f"\n{'='*80}")
    print("SUMMARY")
    print(f"{'='*80}")
    print(f"  Fixed: {fixed}")
    print(f"  Skipped: {skipped}")
    print(f"  Errors/Empty: {errors}")


if __name__ == '__main__':
    main()