# glam/scripts/fix_czech_collisions.py

#!/usr/bin/env python3
"""
Handle GHCID collisions by adding name suffixes.

When fixing region codes, some files may collide with existing files.
This script resolves those by adding snake_case name suffixes.
"""

import os
import re
import sys
import unicodedata
from pathlib import Path
from datetime import datetime, timezone

# Lookup tables, kept identical to the ones in the main fix script.

# Two-letter region codes -> numeric region codes (keys of REGION_NAMES).
LETTER_TO_ISO = {
    'HL': '10',
    'PR': '10',
    'ST': '20',
    'JC': '31',
    'PL': '32',
    'KA': '41',
    'US': '42',
    'LI': '51',
    'KR': '52',
    'PA': '53',
    'VY': '63',
    'JI': '64',
    'JM': '64',
    'OL': '71',
    'ZL': '72',
    'MO': '80',
}

# Numeric codes that are not valid region codes themselves; every one of them
# is folded into region '80'. Note that '80' is intentionally not a key here.
DISTRICT_TO_REGION = dict.fromkeys(
    ('78', '79', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90'),
    '80',
)

# Region codes that are already correct and must be left untouched.
VALID_ISO_CODES = {
    '10', '20',
    '31', '32',
    '41', '42',
    '51', '52', '53',
    '63', '64',
    '71', '72',
    '80',
}

# Human-readable region names, used when writing the change reason.
REGION_NAMES = {
    '10': 'Prague',
    '20': 'Central Bohemian',
    '31': 'South Bohemian',
    '32': 'Plzeň',
    '41': 'Karlovy Vary',
    '42': 'Ústí nad Labem',
    '51': 'Liberec',
    '52': 'Hradec Králové',
    '53': 'Pardubice',
    '63': 'Vysočina',
    '64': 'South Moravian',
    '71': 'Olomouc',
    '72': 'Zlín',
    '80': 'Moravian-Silesian',
}


def get_correct_region_code(old_code: str) -> str:
    """Translate a region code found in a filename into its corrected form.

    A code that is already in VALID_ISO_CODES is returned unchanged. Otherwise
    it is looked up in LETTER_TO_ISO first and DISTRICT_TO_REGION second.

    Returns:
        The corrected code, or None when the code is not known to any table.
    """
    if old_code in VALID_ISO_CODES:
        return old_code

    # Table order matters: the letter mapping takes precedence.
    for table in (LETTER_TO_ISO, DISTRICT_TO_REGION):
        if old_code in table:
            return table[old_code]

    return None


def generate_name_suffix(name: str) -> str:
    """Turn an institution name into a lowercase ASCII snake_case suffix.

    Diacritics are dropped (via NFD decomposition), punctuation is removed,
    runs of whitespace/hyphens become a single underscore, and anything that
    is still outside ``[a-z0-9_]`` is discarded. Leading and trailing
    underscores are trimmed, so the result may be an empty string.
    """
    # Decompose accented letters, then drop the combining marks (category Mn).
    decomposed = unicodedata.normalize('NFD', name)
    text = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    ).lower()

    # Ordered rewrite rules; each one runs on the output of the previous one.
    rules = (
        (r"[''`\",.:;!?()[\]{}]", ''),   # strip quotes and punctuation
        (r'[\s\-]+', '_'),               # whitespace / hyphen runs -> "_"
        (r'[^a-z0-9_]', ''),             # drop every other character
        (r'_+', '_'),                    # collapse repeated underscores
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text.strip('_')


def extract_name_from_yaml(filepath: Path) -> str:
    """Pull the institution name out of a custodian YAML file.

    The file is scanned as plain text (it is not parsed as YAML). Two
    patterns are tried in order and the first hit wins:

    1. a ``name:`` key on the line directly following ``original_entry:``
    2. the first ``claim_value:`` key anywhere in the file

    Returns:
        The matched value with surrounding whitespace stripped, or None when
        neither pattern is found.
    """
    text = filepath.read_text(encoding='utf-8')

    candidate_patterns = (
        r'original_entry:\s*\n\s*name:\s*(.+)',
        r'claim_value:\s*(.+)',
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, text)
        if found:
            return found.group(1).strip()

    return None


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Rewrite GHCID and region-code occurrences in raw YAML text.

    The text is edited with regular expressions, not parsed as YAML. The
    following changes are made:

    * ``ghcid_current: <old_ghcid>`` and ``identifier_value: <old_ghcid>``
      get ``new_ghcid``.
    * ``region_code: <old_region>`` occurring after ``location_resolution:``
      or ``location:`` gets ``new_region``.
    * Two ``ghcid_history`` entries (new GHCID, then old GHCID) are inserted:
      prepended to an existing ``ghcid_history:`` list, or added as a new
      ``ghcid_history:`` key right after the ``ghcid_current:`` line.

    Args:
        content: Full text of the YAML file.
        old_ghcid: GHCID currently stored in the file.
        new_ghcid: GHCID to store instead.
        old_region: Region code to replace.
        new_region: Region code to write; also used to look up the region
            name for the history ``reason`` text.

    Returns:
        The modified text. Parts whose pattern is not found are left as-is.
    """

    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')

    reason = f"Corrected region code from CZ-{old_region} to CZ-{new_region} ({region_name}) with name suffix for collision resolution per ISO 3166-2:CZ"

    # The old value must be a complete token: without this guard, a longer
    # identifier or code that merely starts with the old value (e.g. region
    # "780" when replacing "78") would be rewritten too.
    token_end = r'(?![\w-])'

    def replace_value(prefix_pattern, old_value, new_value, text, flags=0):
        """Replace old_value with new_value wherever it follows the prefix."""
        pattern = '(' + prefix_pattern + ')' + re.escape(old_value) + token_end
        # A callable replacement inserts new_value literally; a template
        # string would interpret backslashes and group references in it.
        return re.sub(pattern, lambda m: m.group(1) + new_value, text, flags=flags)

    def history_items(indent, old_reason):
        """Render the two history list items; the first '- ' is unindented."""
        field = indent + '  '
        return (
            f'- ghcid: {new_ghcid}\n'
            f'{field}valid_from: "{timestamp}"\n'
            f'{field}valid_to: null\n'
            f'{field}reason: "{reason}"\n'
            f'{indent}- ghcid: {old_ghcid}\n'
            f'{field}valid_from: null\n'
            f'{field}valid_to: "{timestamp}"\n'
            f'{field}reason: "{old_reason}"'
        )

    # Replace GHCID in ghcid_current and in identifiers
    content = replace_value(r'ghcid_current:\s*', old_ghcid, new_ghcid, content)
    content = replace_value(r'identifier_value:\s*', old_ghcid, new_ghcid, content)

    # Replace region_code in location_resolution, then in location sections.
    # DOTALL lets ".*?" cross line breaks between the section key and the
    # region_code key.
    content = replace_value(r'location_resolution:.*?region_code:\s*',
                            old_region, new_region, content, flags=re.DOTALL)
    content = replace_value(r'location:.*?region_code:\s*',
                            old_region, new_region, content, flags=re.DOTALL)

    if 'ghcid_history:' in content:
        # Prepend the new items to the existing list, reusing the indentation
        # of its first item so the result stays aligned with the old entries.
        def prepend_items(m):
            indent = m.group(2)
            return (
                m.group(1) + indent
                + history_items(indent, 'Previous GHCID before collision resolution')
                + '\n' + indent
            )

        content = re.sub(r'(ghcid_history:\s*\n)([ \t]*)', prepend_items, content)
    else:
        # No history yet: add a ghcid_history key directly after ghcid_current,
        # at the same indentation as that key. If ghcid_current does not start
        # its line, fall back to two spaces.
        def append_history(m):
            indent = m.group(1) if m.group(1) is not None else '  '
            return (
                (m.group(1) or '') + m.group(2)
                + '\n' + indent + 'ghcid_history:\n' + indent
                + history_items(indent, 'Original GHCID before collision resolution')
            )

        content = re.sub(
            r'(^[ \t]*)?(ghcid_current:\s*' + re.escape(new_ghcid) + r')' + token_end,
            append_history,
            content,
            flags=re.MULTILINE,
        )

    return content


def process_collision_file(filepath: Path, dry_run: bool = False) -> dict:
    """Rename and rewrite one custodian file whose region code is wrong.

    The new GHCID is the old one with the corrected region code and a
    snake_case suffix derived from the institution name appended.

    Args:
        filepath: Path to a ``CZ-<region>-<city>-<type>-<abbrev>.yaml`` file.
        dry_run: When True, nothing is written or deleted; the result only
            describes what would happen.

    Returns:
        A dict with a ``status`` key:

        * ``'skipped'``   - filename does not match, or region already correct
        * ``'error'``     - unknown region code, no institution name found, or
          the name produces an empty suffix
        * ``'collision'`` - the target filename already exists
        * ``'would_fix'`` - dry run; the file would be rewritten and renamed
        * ``'fixed'``     - the new file was written and the old one removed
    """

    # Extract current GHCID components from the filename
    match = re.match(r'CZ-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filepath.name)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}

    old_region, city, inst_type, abbrev = match.groups()

    # Get correct region code
    new_region = get_correct_region_code(old_region)
    if not new_region:
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}

    if old_region == new_region:
        return {'status': 'skipped', 'reason': 'already correct'}

    # Get institution name
    inst_name = extract_name_from_yaml(filepath)
    if not inst_name:
        return {'status': 'error', 'reason': 'could not extract institution name'}

    # Generate name suffix. A name made only of characters that the suffix
    # generator discards yields '', which would produce a GHCID and filename
    # ending in a dangling '-'.
    name_suffix = generate_name_suffix(inst_name)
    if not name_suffix:
        return {
            'status': 'error',
            'reason': f'institution name yields empty suffix: {inst_name!r}',
        }

    old_ghcid = f"CZ-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"CZ-{new_region}-{city}-{inst_type}-{abbrev}-{name_suffix}"
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"

    # Check for an existing target before the dry-run return, so that a dry
    # run reports the same collisions a real run would hit.
    if new_filepath.exists():
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': 'Target still exists even with name suffix'
        }

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'name': inst_name,
            'name_suffix': name_suffix,
        }

    # Read, fix, and write content. The original is removed only after the
    # new file has been written.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')
    filepath.unlink()

    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name': inst_name,
    }


def main():
    """Command-line entry point.

    Collects every ``CZ-<code>-*.yaml`` file in the target directory whose
    region code is a key of LETTER_TO_ISO or DISTRICT_TO_REGION, runs
    process_collision_file on each (in sorted order), and prints a per-file
    line plus a summary. With ``--dry-run`` no files are changed.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Handle Czech region code collisions')
    cli.add_argument('--dry-run', action='store_true', help='Show what would be changed')
    cli.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory')
    options = cli.parse_args()

    base_dir = Path(options.dir)

    # Every code that is not already a valid region code
    legacy_codes = set(LETTER_TO_ISO.keys()) | set(DISTRICT_TO_REGION.keys())
    candidates = [
        path
        for code in legacy_codes
        for path in base_dir.glob(f'CZ-{code}-*.yaml')
    ]

    print(f"Found {len(candidates)} files with non-ISO region codes (collision victims)")

    fixed, would_fix, errors, collisions = [], [], [], []

    for path in sorted(candidates):
        outcome = process_collision_file(path, dry_run=options.dry_run)
        status = outcome['status']

        if status == 'collision':
            collisions.append(outcome)
            print(f"  COLLISION: {outcome}")
        elif status == 'error':
            errors.append(outcome)
            print(f"  ERROR: {path.name} - {outcome['reason']}")
        elif status == 'fixed' or status == 'would_fix':
            bucket = fixed if status == 'fixed' else would_fix
            bucket.append(outcome)
            label = 'Would fix' if options.dry_run else 'Fixed'
            print(f"  {label}: {outcome['old_ghcid']} -> {outcome['new_ghcid']}")
        # Any other status (e.g. 'skipped') is intentionally not reported.

    print(f"\n{'DRY RUN - ' if options.dry_run else ''}Summary:")
    print(f"  Fixed/Would fix: {len(fixed) + len(would_fix)}")
    print(f"  Collisions: {len(collisions)}")
    print(f"  Errors: {len(errors)}")


if __name__ == '__main__':
    main()