glam/scripts/merge_pending_by_name.py

200 lines
6.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Find and merge PENDING files that have matching emic names with existing files.
This script:
1. Scans all existing custodian files to build a name -> file mapping
2. Scans all PENDING files to find matches
3. Merges staff data from PENDING into existing files
4. Archives merged PENDING files
Usage:
python scripts/merge_pending_by_name.py --dry-run # Preview
python scripts/merge_pending_by_name.py # Apply
"""
import os
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional
import shutil
def load_yaml_fast(filepath: Path) -> Optional[Dict]:
    """Load a YAML file and return its top-level mapping.

    Returns None when the file cannot be read or parsed, so callers can
    simply skip bad files.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        # Narrowed from a bare `except:` so programming errors and
        # KeyboardInterrupt/SystemExit are no longer silently swallowed.
        return None
def save_yaml(filepath: Path, data: Dict):
    """Serialize *data* to *filepath* as human-readable YAML.

    Keys keep their insertion order and non-ASCII text is written as-is.
    """
    with open(filepath, 'w', encoding='utf-8') as out:
        yaml.dump(
            data,
            out,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
def normalize_name(name: str) -> str:
    """Return *name* lowercased and stripped, for case-insensitive matching.

    Falsy input (empty string or None) normalizes to the empty string.
    """
    return name.lower().strip() if name else ""
def merge_staff(source_data: Dict, target_data: Dict, source_name: str) -> int:
    """Copy the staff section from *source_data* into *target_data*.

    The merge only happens when the source has a non-empty staff list and
    the target has none of its own (existing data is never overwritten).
    A provenance note recording *source_name* and a UTC timestamp is
    appended to the target.

    Returns the number of staff entries added (0 when the merge was skipped).
    """
    # YAML can yield `staff: null`, so guard against None as well as absence
    # (the original `source_data['staff'].get(...)` crashed on null).
    source_staff = source_data.get('staff') or {}
    staff_list = source_staff.get('staff_list') or []
    if not staff_list:
        return 0
    # Skip if target already has staff — never clobber existing entries.
    if (target_data.get('staff') or {}).get('staff_list'):
        return 0
    target_data['staff'] = {
        'provenance': source_staff.get('provenance', {}),
        'staff_list': staff_list,
    }
    # Record where the staff data came from; tolerate null/absent
    # provenance and notes, and promote a scalar note to a list.
    provenance = target_data.get('provenance') or {}
    notes = provenance.get('notes') or []
    if isinstance(notes, str):
        notes = [notes]
    notes.append(
        f"Staff data merged from {source_name} on "
        f"{datetime.now(timezone.utc).isoformat()}"
    )
    provenance['notes'] = notes
    target_data['provenance'] = provenance
    return len(staff_list)
def main():
    """CLI entry point.

    Indexes existing custodian files by normalized emic name, finds
    PENDING files whose name matches, merges their staff data into the
    existing files, and archives the consumed PENDING files. With
    --dry-run, only prints what would happen.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="Merge PENDING custodian files into existing files by emic name.")
    parser.add_argument('--dry-run', action='store_true',
                        help="preview changes without writing or moving files")
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()
    custodian_dir = args.custodian_dir
    archive_dir = custodian_dir / 'archive' / 'pending_merged_20250109'

    print("=" * 80)
    print("MERGING PENDING FILES BY NAME MATCH")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Step 1: Build normalized name -> (file, parsed data) map for
    # existing (non-PENDING, non-archived) custodian files.
    print("Building name index from existing files...")
    existing_by_name = {}
    for f in custodian_dir.glob('[A-Z][A-Z]-[A-Z][A-Z]-*.yaml'):
        if 'PENDING' in f.name or 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if not data:
            continue
        # `custodian_name` may be present but null in the YAML, hence `or {}`
        # (plain .get(..., {}) would return None and crash on .get).
        name = (data.get('custodian_name') or {}).get('emic_name', '')
        if name:
            existing_by_name[normalize_name(name)] = (f, data)
    print(f" Indexed {len(existing_by_name)} existing files")

    # Step 2: Partition PENDING files into matched / unmatched by name.
    print("\nScanning PENDING files for matches...")
    matches = []
    no_matches = []
    for f in sorted(custodian_dir.glob('*-XX-XXX-PENDING-*.yaml')):
        if 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if not data:
            continue
        name = (data.get('custodian_name') or {}).get('emic_name', '')
        normalized = normalize_name(name)
        # Null-safe: `staff` or `staff_list` may be absent or null.
        staff_count = len((data.get('staff') or {}).get('staff_list') or [])
        if normalized in existing_by_name:
            existing_file, existing_data = existing_by_name[normalized]
            matches.append({
                'pending_file': f,
                'pending_data': data,
                'existing_file': existing_file,
                'existing_data': existing_data,
                'name': name,
                'staff_count': staff_count,
            })
        else:
            no_matches.append({
                'file': f,
                'name': name,
                'staff_count': staff_count,
            })
    print(f" Found {len(matches)} PENDING files with matching existing files")
    print(f" Found {len(no_matches)} PENDING files without matches")

    # Step 3: Merge matched files. Counters are initialized outside the
    # `if matches:` guard so the summary below never hits a NameError
    # when there are no matches.
    total_staff = 0
    merged_count = 0
    skipped_count = 0
    if matches:
        print("\n" + "=" * 80)
        print("MERGING MATCHED FILES")
        print("=" * 80)
        if not args.dry_run:
            archive_dir.mkdir(parents=True, exist_ok=True)
        for m in matches:
            pending_file = m['pending_file']
            existing_file = m['existing_file']
            # Skip when the target already has staff, or the source has none.
            existing_staff = len(
                (m['existing_data'].get('staff') or {}).get('staff_list') or [])
            staff_added = m['staff_count']
            if existing_staff > 0 or staff_added == 0:
                skipped_count += 1
                continue
            print(f"\n[{'DRY RUN' if args.dry_run else 'MERGE'}] {m['name'][:50]}")
            print(f" From: {pending_file.name}")
            print(f" To: {existing_file.name}")
            print(f" Staff: {staff_added}")
            if not args.dry_run:
                merge_staff(m['pending_data'], m['existing_data'], pending_file.name)
                save_yaml(existing_file, m['existing_data'])
                # Archive the consumed PENDING file so it isn't re-processed.
                shutil.move(str(pending_file), str(archive_dir / pending_file.name))
            total_staff += staff_added
            merged_count += 1

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Files merged: {merged_count}")
    print(f"Files skipped (already has staff or no staff): {skipped_count}")
    print(f"Total staff added: {total_staff}")
    print(f"Unmatched PENDING files remaining: {len(no_matches)}")
    if not args.dry_run:
        print(f"\nArchived to: {archive_dir}")
# Script entry point — run only when executed directly, not on import.
if __name__ == "__main__":
    main()