#!/usr/bin/env python3
"""
Find and merge PENDING files that have matching emic names with existing files.

This script:
1. Scans all existing custodian files to build a name -> file mapping
2. Scans all PENDING files to find matches
3. Merges staff data from PENDING into existing files
4. Archives merged PENDING files

Usage:
    python scripts/merge_pending_by_name.py --dry-run  # Preview
    python scripts/merge_pending_by_name.py            # Apply
"""

import os
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional

import yaml


def load_yaml_fast(filepath: Path) -> Optional[Dict]:
    """Load a YAML file; return the parsed mapping, or None on any error.

    Best-effort loader: unreadable, missing, or malformed files are treated
    as "no data" and skipped by callers rather than aborting the whole run.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        return None


def save_yaml(filepath: Path, data: Dict):
    """Write `data` to `filepath` as YAML, preserving key order and unicode."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def normalize_name(name: str) -> str:
    """Normalize a name for case/whitespace-insensitive matching."""
    if not name:
        return ""
    return name.lower().strip()


def merge_staff(source_data: Dict, target_data: Dict, source_name: str) -> int:
    """Merge the staff section from `source_data` into `target_data`.

    Mutates `target_data` in place: copies the staff section and appends a
    provenance note recording the merge source and UTC timestamp.

    Returns the count of staff entries added (0 if the source has no staff,
    or the target already has a non-empty staff list).
    """
    if 'staff' not in source_data:
        return 0

    source_staff = source_data['staff']
    staff_list = source_staff.get('staff_list', [])
    if not staff_list:
        return 0

    # Skip if target already has staff — never overwrite existing data.
    if 'staff' in target_data and target_data['staff'].get('staff_list'):
        return 0

    # Add staff section, carrying over the source's own provenance.
    target_data['staff'] = {
        'provenance': source_staff.get('provenance', {}),
        'staff_list': staff_list
    }

    # Add provenance note on the target file itself.
    if 'provenance' not in target_data:
        target_data['provenance'] = {}
    # `or []` guards against an explicit `notes: null` in the YAML, which
    # `.get('notes', [])` would return as None and crash on append.
    notes = target_data['provenance'].get('notes', []) or []
    if isinstance(notes, str):
        notes = [notes]
    notes.append(f"Staff data merged from {source_name} on {datetime.now(timezone.utc).isoformat()}")
    target_data['provenance']['notes'] = notes

    return len(staff_list)


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    archive_dir = custodian_dir / 'archive' / 'pending_merged_20250109'

    print("=" * 80)
    print("MERGING PENDING FILES BY NAME MATCH")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Step 1: Build name -> file mapping for existing files.
    # Note: a later file with the same normalized name silently wins the slot.
    print("Building name index from existing files...")
    existing_by_name = {}
    for f in custodian_dir.glob('[A-Z][A-Z]-[A-Z][A-Z]-*.yaml'):
        if 'PENDING' in f.name or 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if data:
            name = data.get('custodian_name', {}).get('emic_name', '')
            if name:
                normalized = normalize_name(name)
                existing_by_name[normalized] = (f, data)
    print(f"  Indexed {len(existing_by_name)} existing files")

    # Step 2: Find matching PENDING files (unparseable files are skipped).
    print("\nScanning PENDING files for matches...")
    matches = []
    no_matches = []
    for f in sorted(custodian_dir.glob('*-XX-XXX-PENDING-*.yaml')):
        if 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if data:
            name = data.get('custodian_name', {}).get('emic_name', '')
            normalized = normalize_name(name)
            staff_count = len(data.get('staff', {}).get('staff_list', []))
            if normalized in existing_by_name:
                existing_file, existing_data = existing_by_name[normalized]
                matches.append({
                    'pending_file': f,
                    'pending_data': data,
                    'existing_file': existing_file,
                    'existing_data': existing_data,
                    'name': name,
                    'staff_count': staff_count
                })
            else:
                no_matches.append({
                    'file': f,
                    'name': name,
                    'staff_count': staff_count
                })
    print(f"  Found {len(matches)} PENDING files with matching existing files")
    print(f"  Found {len(no_matches)} PENDING files without matches")

    # Step 3: Merge matches.
    if matches:
        print("\n" + "=" * 80)
        print("MERGING MATCHED FILES")
        print("=" * 80)

        if not args.dry_run:
            archive_dir.mkdir(parents=True, exist_ok=True)

        total_staff = 0
        merged_count = 0
        skipped_count = 0

        for m in matches:
            pending_file = m['pending_file']
            existing_file = m['existing_file']
            pending_data = m['pending_data']
            existing_data = m['existing_data']

            # Check if existing already has staff — never overwrite.
            existing_staff = len(existing_data.get('staff', {}).get('staff_list', []))
            if existing_staff > 0:
                skipped_count += 1
                continue

            staff_added = m['staff_count']
            if staff_added == 0:
                skipped_count += 1
                continue

            print(f"\n[{'DRY RUN' if args.dry_run else 'MERGE'}] {m['name'][:50]}")
            print(f"  From: {pending_file.name}")
            print(f"  To:   {existing_file.name}")
            print(f"  Staff: {staff_added}")

            if not args.dry_run:
                # Merge staff into the existing file and persist it.
                merge_staff(pending_data, existing_data, pending_file.name)
                save_yaml(existing_file, existing_data)
                # Move PENDING to archive.
                shutil.move(str(pending_file), str(archive_dir / pending_file.name))

            # Counters are updated in dry-run too, so the preview summary
            # reflects what a live run would do.
            total_staff += staff_added
            merged_count += 1

        print("\n" + "=" * 80)
        print("SUMMARY")
        print("=" * 80)
        print(f"Files merged: {merged_count}")
        print(f"Files skipped (already has staff or no staff): {skipped_count}")
        print(f"Total staff added: {total_staff}")
        print(f"Unmatched PENDING files remaining: {len(no_matches)}")
        if not args.dry_run:
            print(f"\nArchived to: {archive_dir}")


if __name__ == '__main__':
    main()