# glam/scripts/regenerate_ghcids_emic_name.py

#!/usr/bin/env python3
"""
Regenerate GHCIDs for custodians where emic_name differs from claim_value.

This script finds custodians that have been enriched with emic_name (native language name)
and regenerates their GHCID abbreviation component using the emic_name instead of the
English claim_value.

Per AGENTS.md grandfathering policy: Existing GHCIDs from UNESCO MoW custodians are
grandfathered for PID stability. This script generates a REPORT of which files WOULD
be updated, but does NOT automatically apply changes without explicit confirmation.

Usage:
    # Dry run (default) - show what would change
    python scripts/regenerate_ghcids_emic_name.py

    # Apply changes
    python scripts/regenerate_ghcids_emic_name.py --apply

    # Process specific country
    python scripts/regenerate_ghcids_emic_name.py --country DE

    # Limit number of files
    python scripts/regenerate_ghcids_emic_name.py --limit 10
"""

import argparse
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import extract_abbreviation_from_name


def get_current_abbreviation(ghcid: str) -> str:
    """Return the trailing abbreviation portion of a GHCID.

    A GHCID is laid out as ``CC-RR-CCC-T-ABBREV``, optionally followed by
    ``-collision_suffix``. The first four dash-separated fields are skipped
    and whatever follows (collision suffix included) is returned verbatim.
    An identifier with fewer than five dash-separated fields yields ``''``.
    """
    # Splitting at most four times leaves the remainder intact in slot 4.
    fields = ghcid.split('-', 4)
    if len(fields) < 5:
        return ''
    return fields[4]


def build_new_ghcid(old_ghcid: str, new_abbrev: str) -> str:
    """Swap the abbreviation of a GHCID while keeping its leading fields.

    The first four dash-separated fields of ``old_ghcid`` (country, region,
    city and type codes) are kept and ``new_abbrev`` is appended as the fifth
    field; anything that followed the fourth field in the old identifier is
    dropped. If ``old_ghcid`` has fewer than five dash-separated fields it is
    returned unchanged.
    """
    fields = old_ghcid.split('-')
    if len(fields) < 5:
        return old_ghcid

    country, region, city, type_code = fields[:4]
    return f"{country}-{region}-{city}-{type_code}-{new_abbrev}"


def generate_uuid_v5(ghcid_string: str) -> str:
    """Return the deterministic UUID v5 string for a GHCID.

    The hashed name is the custodian URL
    ``https://w3id.org/heritage/custodian/<ghcid_string>``.
    """
    # uuid.NAMESPACE_DNS is 6ba7b810-9dad-11d1-80b4-00c04fd430c8, the same
    # namespace value this function has always hashed under. (The RFC 4122
    # URL namespace is 6ba7b811-...; it is NOT the one used here, and
    # switching would change every generated UUID.)
    custodian_url = "https://w3id.org/heritage/custodian/" + ghcid_string
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, custodian_url)
    return str(derived)


def process_custodian_file(filepath: Path, apply: bool = False) -> dict | None:
    """Process a single custodian YAML file.

    Reads ``custodian_name.emic_name`` / ``custodian_name.claim_value`` and
    ``ghcid.ghcid_current`` from the file, derives an abbreviation from the
    emic name, and compares it (case-insensitively) with the abbreviation
    currently embedded in the GHCID.

    Args:
        filepath: Path of the custodian YAML file.
        apply: When False (default) nothing is written; the function only
            reports what would change. When True the file is rewritten with
            the new GHCID, UUID, generation timestamp, a history entry and a
            provenance note, and is then renamed to ``<new_ghcid>.yaml``.

    Returns:
        ``None`` when no update is needed (no emic name, no current GHCID,
        no usable new abbreviation, or the abbreviation already matches).
        Otherwise a dict describing the change with keys ``file``,
        ``current_ghcid``, ``new_ghcid``, ``current_abbrev``, ``new_abbrev``,
        ``english_abbrev``, ``emic_name``, ``claim_value`` and ``new_uuid``.
        In apply mode it additionally carries ``renamed_to`` after a
        successful rename, or ``rename_error`` when the target file name is
        already taken (in which case the file is left untouched).

    Note:
        ``ghcid_numeric`` is not regenerated here.
    """
    # Custodian records contain native-script names; read them as UTF-8
    # instead of relying on the platform's default encoding.
    with open(filepath, encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Empty documents and non-mapping documents are not custodian records.
    if not isinstance(data, dict):
        return None

    # ``or {}`` also covers keys that are present but explicitly null.
    cn = data.get('custodian_name') or {}
    emic_name = cn.get('emic_name') or ''
    claim_value = cn.get('claim_value') or ''

    if not emic_name:
        return None

    ghcid_data = data.get('ghcid') or {}
    current_ghcid = ghcid_data.get('ghcid_current') or ''

    if not current_ghcid:
        return None

    # NOTE(review): current_abbrev includes any collision suffix, so a GHCID
    # carrying a suffix never compares equal to a freshly derived
    # abbreviation - confirm that this is the intended behaviour.
    current_abbrev = get_current_abbreviation(current_ghcid)

    # Abbreviation derived from the native-language name.
    new_abbrev = extract_abbreviation_from_name(emic_name)

    # Abbreviation the English name would give; reported for comparison only.
    english_abbrev = extract_abbreviation_from_name(claim_value)

    # An empty abbreviation would yield a malformed GHCID ("CC-RR-CCC-T-").
    if not new_abbrev:
        return None

    # Skip if abbreviations are the same
    if current_abbrev.upper() == new_abbrev.upper():
        return None

    new_ghcid = build_new_ghcid(current_ghcid, new_abbrev)

    # build_new_ghcid returns its input unchanged for identifiers with fewer
    # than five fields; there is nothing to regenerate in that case.
    if new_ghcid == current_ghcid:
        return None

    new_uuid = generate_uuid_v5(new_ghcid)

    result = {
        'file': filepath.name,
        'current_ghcid': current_ghcid,
        'new_ghcid': new_ghcid,
        'current_abbrev': current_abbrev,
        'new_abbrev': new_abbrev,
        'english_abbrev': english_abbrev,
        'emic_name': emic_name,
        'claim_value': claim_value,
        'new_uuid': new_uuid,
    }

    if not apply:
        return result

    # Check for a file-name collision BEFORE touching the file. Rewriting the
    # YAML first and failing the rename afterwards would leave two files
    # claiming the same GHCID.
    new_filename = f"{new_ghcid}.yaml"
    new_path = filepath.parent / new_filename
    needs_rename = filepath.name != new_filename
    if needs_rename and new_path.exists():
        result['rename_error'] = (
            f"Target file {new_filename} already exists; file left unchanged"
        )
        return result

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update GHCID fields
    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['ghcid_uuid'] = new_uuid
    # Note: We don't regenerate ghcid_numeric here, would need the full algorithm
    ghcid_data['generation_timestamp'] = timestamp

    # Add to history
    history = ghcid_data.get('ghcid_history') or []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'GHCID abbreviation regenerated from emic_name "{emic_name}" (was "{claim_value}", abbrev {current_abbrev}→{new_abbrev})',
    })
    ghcid_data['ghcid_history'] = history

    data['ghcid'] = ghcid_data

    # Add provenance note; an existing single-string note becomes a list.
    prov = data.get('provenance') or {}
    notes = prov.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    notes.append(f'GHCID regenerated {timestamp}: abbreviation {current_abbrev}→{new_abbrev} from emic_name')
    prov['notes'] = notes
    data['provenance'] = prov

    # allow_unicode=True emits raw non-ASCII characters, so the output
    # stream must be UTF-8 regardless of the locale.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if needs_rename:
        filepath.rename(new_path)
        result['renamed_to'] = new_filename

    return result


def main():
    """Command-line entry point.

    Scans ``data/custodian/*.yaml`` (relative to the parent of this script's
    directory), runs ``process_custodian_file`` on each file, and prints a
    report grouped by the two-character country prefix of the current GHCID.

    Options:
        --apply     modify files instead of performing a dry run
        --country   only consider files whose name starts with ``<code>-``
        --limit     only consider the first N files (after sorting/filtering)
        --verbose   list every result instead of the first 10 per country

    Exits with status 1 if the custodian directory does not exist.
    """
    max_listed = 10      # results shown per country unless --verbose
    preview_width = 50   # characters of a name shown in the report

    def preview(text) -> str:
        """Return text cut to preview_width, adding '...' only if cut."""
        text = str(text or '')
        if len(text) <= preview_width:
            return text
        return text[:preview_width] + "..."

    parser = argparse.ArgumentParser(description="Regenerate GHCIDs using emic_name")
    parser.add_argument('--apply', action='store_true', help='Actually apply changes (default: dry run)')
    parser.add_argument('--country', type=str, help='Only process files for specific country code (e.g., DE)')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"

    print("=" * 70)
    print("GHCID Regeneration from Emic Names")
    print("=" * 70)
    print(f"Directory: {custodian_dir}")
    print(f"Mode: {'APPLY CHANGES' if args.apply else 'DRY RUN (use --apply to modify files)'}")
    if args.country:
        print(f"Country filter: {args.country}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print("=" * 70)
    print()

    # A missing directory would otherwise look like "0 files would be
    # updated", which is indistinguishable from a clean run.
    if not custodian_dir.is_dir():
        print(f"Error: custodian directory not found: {custodian_dir}", file=sys.stderr)
        sys.exit(1)

    # Sorted so that --limit always selects the same files.
    yaml_files = sorted(custodian_dir.glob("*.yaml"))

    if args.country:
        yaml_files = [f for f in yaml_files if f.name.startswith(f"{args.country}-")]

    if args.limit:
        yaml_files = yaml_files[:args.limit]

    results = []
    error_count = 0

    for processed, filepath in enumerate(yaml_files, start=1):
        if processed % 500 == 0:
            print(f"  Processed {processed}/{len(yaml_files)} files...")

        # Top-level boundary: one unreadable file must not abort the batch,
        # but every failure is reported and counted.
        try:
            result = process_custodian_file(filepath, apply=args.apply)
        except Exception as e:
            error_count += 1
            print(f"  Error processing {filepath.name}: {e}")
        else:
            if result:
                results.append(result)

    # Report
    print()
    print("=" * 70)
    print(f"RESULTS: {len(results)} files {'updated' if args.apply else 'would be updated'}")
    if error_count:
        print(f"ERRORS: {error_count} files could not be processed (see messages above)")
    print("=" * 70)
    print()

    if results:
        # Group by the two-character country prefix of the current GHCID.
        by_country = {}
        for r in results:
            by_country.setdefault(r['current_ghcid'][:2], []).append(r)

        for country in sorted(by_country):
            items = by_country[country]
            print(f"\n{country}: {len(items)} files")
            print("-" * 50)

            shown = items if args.verbose else items[:max_listed]
            for r in shown:
                print(f"  {r['current_ghcid']} → {r['new_ghcid']}")
                print(f"    claim: {preview(r['claim_value'])}")
                print(f"    emic:  {preview(r['emic_name'])}")
                print(f"    abbrev: {r['current_abbrev']} → {r['new_abbrev']}")
                if args.apply:
                    if r.get('renamed_to'):
                        print(f"    RENAMED TO: {r['renamed_to']}")
                    if r.get('rename_error'):
                        print(f"    RENAME ERROR: {r['rename_error']}")
                print()

            if len(items) > max_listed and not args.verbose:
                print(f"  ... and {len(items) - max_listed} more (use -v to see all)")

    print()
    print("=" * 70)
    if not args.apply:
        print("This was a DRY RUN. Use --apply to actually modify files.")
    else:
        print(f"Done! {len(results)} files updated.")
    print("=" * 70)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()