# glam/scripts/migrate_qnumber_to_snakecase.py

#!/usr/bin/env python3
"""
Migrate GHCID files from Q-number suffix convention to snake_case name convention.

This script:
1. Finds files with Q-number suffixes (e.g., LB-BA-BEI-A-AIF-Q4783187.yaml)
2. Extracts the institution name
3. Generates snake_case suffix from the native name
4. Updates GHCID, regenerates UUIDs
5. Renames files
"""

import os
import re
import uuid
import hashlib
import unicodedata
from pathlib import Path
import yaml

# Namespace under which every GHCID-derived UUID v5 is generated (same as in
# other scripts). The value is identical to the standard DNS namespace,
# uuid.NAMESPACE_DNS.
GHCID_NAMESPACE = uuid.UUID(hex="6ba7b810-9dad-11d1-80b4-00c04fd430c8")

def generate_name_suffix(native_name: str) -> str:
    """Build the snake_case GHCID suffix for an institution's native name.

    Diacritics are stripped via NFD decomposition, the text is lowercased,
    whitespace/hyphen runs become a single underscore, and every character
    outside ``[a-z0-9_]`` is dropped. Characters that have no ASCII base
    letter after decomposition are therefore removed, so the result may be
    an empty string.
    """
    # Decompose accented letters and discard the combining marks (category Mn).
    decomposed = unicodedata.normalize('NFD', native_name)
    text = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    ).lower()

    # Ordered rewrite rules; each one is applied to the output of the previous.
    rewrite_rules = (
        (r"[''`\",.:;!?()[\]{}]", ''),   # drop quotes and common punctuation
        (r'[\s\-]+', '_'),               # whitespace / hyphen runs -> underscore
        (r'[^a-z0-9_]', ''),             # drop anything else that is not [a-z0-9_]
        (r'_+', '_'),                    # collapse repeated underscores
    )
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)

    # No leading or trailing underscore in the final suffix.
    return text.strip('_')

def generate_uuid_v5(ghcid_string: str) -> str:
    """Return the UUID v5 of *ghcid_string* under GHCID_NAMESPACE, as a string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)

def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Return a UUID v8 string built from the SHA-256 digest of *ghcid_string*.

    The first 16 digest bytes form the UUID; the version nibble is forced
    to 8 and the two variant bits to ``10``.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    value = int.from_bytes(digest[:16], 'big')

    # Version nibble: the high 4 bits of byte 6, i.e. bits 79..76 of the integer.
    value = (value & ~(0xF << 76)) | (0x8 << 76)
    # Variant: the high 2 bits of byte 8, i.e. bits 63..62 of the integer.
    value = (value & ~(0x3 << 62)) | (0x2 << 62)

    return str(uuid.UUID(int=value))

def generate_numeric_id(ghcid_string: str) -> int:
    """Return a non-negative numeric ID derived from *ghcid_string*.

    The ID is the first 8 bytes of the SHA-256 digest read as a big-endian
    integer with the top bit cleared, so it always fits in 63 bits.
    """
    leading_bytes = hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:8]
    positive_mask = (1 << 63) - 1  # clears the top bit of the 64-bit value
    return int.from_bytes(leading_bytes, 'big') & positive_mask

def process_file(filepath: Path, dry_run: bool = True):
    """Migrate a single Q-number-suffixed GHCID file to the snake_case convention.

    The file name must look like ``<base>-Q<digits>.yaml``. The institution
    name is read from ``original_entry.name`` in the YAML document and
    converted to a snake_case suffix; the new GHCID is ``<base>-<suffix>``.

    Args:
        filepath: Path of the YAML file to migrate.
        dry_run: When True (the default) only print what would happen;
            nothing is written, renamed or deleted.

    Returns:
        The path of the newly written file when a migration was performed.
        ``None`` for a dry run, or when the file was skipped because:
        the name does not match the Q-number pattern, the document is empty
        or not a mapping, no name is present, the name yields an empty
        suffix, or the target file already exists.
    """
    filename = filepath.name

    # Split "<base>-Q<digits>.yaml" into the base GHCID and the Q-number.
    match = re.match(r'^(.+)-Q(\d+)\.yaml$', filename)
    if not match:
        print(f"  SKIP: {filename} (doesn't match Q-number pattern)")
        return None

    base_ghcid, q_number = match.groups()

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None and a scalar/list document has no .get();
    # neither can be migrated.
    if not isinstance(data, dict):
        print(f"  ERROR: {filename} is empty or not a YAML mapping")
        return None

    # Tolerate a present-but-null (or non-mapping) `original_entry`.
    entry = data.get('original_entry')
    name = entry.get('name', '') if isinstance(entry, dict) else ''
    if not name:
        print(f"  ERROR: {filename} has no name field")
        return None

    name_suffix = generate_name_suffix(name)
    if not name_suffix:
        # E.g. a name written entirely in a non-Latin script: every character
        # is stripped, which would produce the malformed GHCID "<base>-".
        print(f"  ERROR: {filename} name {name!r} yields an empty snake_case suffix")
        return None

    new_ghcid = f"{base_ghcid}-{name_suffix}"
    old_ghcid = f"{base_ghcid}-Q{q_number}"

    print(f"\n  File: {filename}")
    print(f"  Name: {name}")
    print(f"  Old GHCID: {old_ghcid}")
    print(f"  New GHCID: {new_ghcid}")

    # All identifiers are derived deterministically from the new GHCID string.
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)

    print(f"  New UUID v5: {new_uuid_v5}")
    print(f"  New UUID v8: {new_uuid_v8}")
    print(f"  New Numeric: {new_numeric}")

    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Never clobber an existing record: writing over it and then deleting the
    # source would lose data. This also catches a case-only name clash on a
    # case-insensitive filesystem, where the "new" path is the source itself.
    if new_filepath != filepath and new_filepath.exists():
        print(f"  ERROR: {new_filename} already exists; refusing to overwrite")
        return None

    if dry_run:
        print(f"  New filename would be: {new_ghcid}.yaml")
        return None

    if 'ghcid' in data:
        ghcid = data['ghcid']

        # Create the history list if it is missing or null.
        if ghcid.get('ghcid_history') is None:
            ghcid['ghcid_history'] = []

        from datetime import datetime, timezone
        now = datetime.now(timezone.utc).isoformat()

        # Record the outgoing identifiers before they are replaced below.
        ghcid['ghcid_history'].append({
            'ghcid': old_ghcid,
            'ghcid_uuid': ghcid.get('ghcid_uuid'),
            'ghcid_uuid_sha256': ghcid.get('ghcid_uuid_sha256'),
            'ghcid_numeric': ghcid.get('ghcid_numeric'),
            'valid_from': data.get('provenance', {}).get('generated_at', '2025-12-06T00:00:00Z'),
            'valid_to': now,
            'reason': 'Migrated from Q-number suffix to snake_case name suffix convention'
        })

        ghcid['ghcid_current'] = new_ghcid
        ghcid['ghcid_original'] = base_ghcid  # Store base without suffix
        ghcid['ghcid_uuid'] = new_uuid_v5
        ghcid['ghcid_uuid_sha256'] = new_uuid_v8
        ghcid['ghcid_numeric'] = new_numeric

    # Write the new file first; the old one is removed only after the write
    # has completed.
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if new_filepath != filepath:
        filepath.unlink()
        print(f"  MIGRATED: {filename} → {new_filename}")

    return new_filepath

def main():
    """Command-line entry point: find Q-number files and migrate each one.

    Runs as a dry run unless ``--execute`` is given. ``--path`` selects the
    directory that is scanned (non-recursively) for ``*-Q<digits>.yaml`` files.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Migrate Q-number GHCID files to snake_case')
    cli.add_argument('--execute', action='store_true', help='Actually perform migration (default is dry run)')
    cli.add_argument('--path', default='/Users/kempersc/apps/glam/data/custodian', help='Path to custodian files')
    options = cli.parse_args()

    execute = options.execute
    root = Path(options.path)

    # The glob is only a coarse pre-filter; the regex keeps just the files
    # whose name really ends in "-Q<digits>.yaml" (not Q as part of an
    # abbreviation).
    suffix_pattern = re.compile(r'.+-Q\d+\.yaml$')
    targets = sorted(
        candidate
        for candidate in root.glob('*-Q[0-9]*.yaml')
        if suffix_pattern.match(candidate.name)
    )

    print(f"Found {len(targets)} files with Q-number suffixes")

    banner = (
        "\n=== EXECUTING MIGRATION ==="
        if execute
        else "\n=== DRY RUN (use --execute to actually migrate) ==="
    )
    print(banner)

    # process_file returns a path only when a file was actually migrated.
    migrated = 0
    for target in targets:
        if process_file(target, dry_run=not execute):
            migrated += 1

    if not execute:
        print(f"\n=== DRY RUN COMPLETE: {len(targets)} files would be migrated ===")
    else:
        print(f"\n=== MIGRATION COMPLETE: {migrated} files migrated ===")

# Run the migration only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()