glam/scripts/merge_pending_by_name.py

200 lines
6.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Find and merge PENDING files that have matching emic names with existing files.
This script:
1. Scans all existing custodian files to build a name -> file mapping
2. Scans all PENDING files to find matches
3. Merges staff data from PENDING into existing files
4. Archives merged PENDING files
Usage:
python scripts/merge_pending_by_name.py --dry-run # Preview
python scripts/merge_pending_by_name.py # Apply
"""
import os
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional
import shutil
def load_yaml_fast(filepath: Path) -> Optional[Dict]:
    """Load a YAML file and return its top-level mapping.

    Returns None when the file cannot be read or parsed, so callers can
    simply skip bad files.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        # Narrowed from a bare `except:` so programming errors and
        # KeyboardInterrupt/SystemExit are no longer silently swallowed.
        return None
def save_yaml(filepath: Path, data: Dict):
    """Serialize *data* to *filepath* as human-readable YAML.

    Keys keep their insertion order and non-ASCII text is written as-is.
    """
    with open(filepath, 'w', encoding='utf-8') as out:
        yaml.dump(
            data,
            out,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
def normalize_name(name: str) -> str:
    """Return *name* lowercased and stripped, for case-insensitive matching.

    Falsy input (empty string or None) normalizes to the empty string.
    """
    return name.lower().strip() if name else ""
def merge_staff(source_data: Dict, target_data: Dict, source_name: str) -> int:
    """Copy the staff section from *source_data* into *target_data*.

    The merge only happens when the source has a non-empty staff list and
    the target has none of its own (existing data is never overwritten).
    A provenance note recording *source_name* and a UTC timestamp is
    appended to the target.

    Returns the number of staff entries added (0 when the merge was skipped).
    """
    # YAML can yield `staff: null`, so guard against None as well as absence
    # (the original `source_data['staff'].get(...)` crashed on null).
    source_staff = source_data.get('staff') or {}
    staff_list = source_staff.get('staff_list') or []
    if not staff_list:
        return 0
    # Skip if target already has staff — never clobber existing entries.
    if (target_data.get('staff') or {}).get('staff_list'):
        return 0
    target_data['staff'] = {
        'provenance': source_staff.get('provenance', {}),
        'staff_list': staff_list,
    }
    # Record where the staff data came from; tolerate null/absent
    # provenance and notes, and promote a scalar note to a list.
    provenance = target_data.get('provenance') or {}
    notes = provenance.get('notes') or []
    if isinstance(notes, str):
        notes = [notes]
    notes.append(
        f"Staff data merged from {source_name} on "
        f"{datetime.now(timezone.utc).isoformat()}"
    )
    provenance['notes'] = notes
    target_data['provenance'] = provenance
    return len(staff_list)
def main():
    """CLI entry point.

    Indexes existing custodian files by normalized emic name, finds
    PENDING files whose name matches, merges their staff data into the
    existing files, and archives the consumed PENDING files. With
    --dry-run, only prints what would happen.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="Merge PENDING custodian files into existing files by emic name.")
    parser.add_argument('--dry-run', action='store_true',
                        help="preview changes without writing or moving files")
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()
    custodian_dir = args.custodian_dir
    archive_dir = custodian_dir / 'archive' / 'pending_merged_20250109'

    print("=" * 80)
    print("MERGING PENDING FILES BY NAME MATCH")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Step 1: Build normalized name -> (file, parsed data) map for
    # existing (non-PENDING, non-archived) custodian files.
    print("Building name index from existing files...")
    existing_by_name = {}
    for f in custodian_dir.glob('[A-Z][A-Z]-[A-Z][A-Z]-*.yaml'):
        if 'PENDING' in f.name or 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if not data:
            continue
        # `custodian_name` may be present but null in the YAML, hence `or {}`
        # (plain .get(..., {}) would return None and crash on .get).
        name = (data.get('custodian_name') or {}).get('emic_name', '')
        if name:
            existing_by_name[normalize_name(name)] = (f, data)
    print(f" Indexed {len(existing_by_name)} existing files")

    # Step 2: Partition PENDING files into matched / unmatched by name.
    print("\nScanning PENDING files for matches...")
    matches = []
    no_matches = []
    for f in sorted(custodian_dir.glob('*-XX-XXX-PENDING-*.yaml')):
        if 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if not data:
            continue
        name = (data.get('custodian_name') or {}).get('emic_name', '')
        normalized = normalize_name(name)
        # Null-safe: `staff` or `staff_list` may be absent or null.
        staff_count = len((data.get('staff') or {}).get('staff_list') or [])
        if normalized in existing_by_name:
            existing_file, existing_data = existing_by_name[normalized]
            matches.append({
                'pending_file': f,
                'pending_data': data,
                'existing_file': existing_file,
                'existing_data': existing_data,
                'name': name,
                'staff_count': staff_count,
            })
        else:
            no_matches.append({
                'file': f,
                'name': name,
                'staff_count': staff_count,
            })
    print(f" Found {len(matches)} PENDING files with matching existing files")
    print(f" Found {len(no_matches)} PENDING files without matches")

    # Step 3: Merge matched files. Counters are initialized outside the
    # `if matches:` guard so the summary below never hits a NameError
    # when there are no matches.
    total_staff = 0
    merged_count = 0
    skipped_count = 0
    if matches:
        print("\n" + "=" * 80)
        print("MERGING MATCHED FILES")
        print("=" * 80)
        if not args.dry_run:
            archive_dir.mkdir(parents=True, exist_ok=True)
        for m in matches:
            pending_file = m['pending_file']
            existing_file = m['existing_file']
            # Skip when the target already has staff, or the source has none.
            existing_staff = len(
                (m['existing_data'].get('staff') or {}).get('staff_list') or [])
            staff_added = m['staff_count']
            if existing_staff > 0 or staff_added == 0:
                skipped_count += 1
                continue
            print(f"\n[{'DRY RUN' if args.dry_run else 'MERGE'}] {m['name'][:50]}")
            print(f" From: {pending_file.name}")
            print(f" To: {existing_file.name}")
            print(f" Staff: {staff_added}")
            if not args.dry_run:
                merge_staff(m['pending_data'], m['existing_data'], pending_file.name)
                save_yaml(existing_file, m['existing_data'])
                # Archive the consumed PENDING file so it isn't re-processed.
                shutil.move(str(pending_file), str(archive_dir / pending_file.name))
            total_staff += staff_added
            merged_count += 1

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Files merged: {merged_count}")
    print(f"Files skipped (already has staff or no staff): {skipped_count}")
    print(f"Total staff added: {total_staff}")
    print(f"Unmatched PENDING files remaining: {len(no_matches)}")
    if not args.dry_run:
        print(f"\nArchived to: {archive_dir}")
# Script entry point — run only when executed directly, not on import.
if __name__ == "__main__":
    main()