#!/usr/bin/env python3
|
|
"""
Find PENDING files whose emic name matches an existing custodian file and merge them.

This script:
1. Scans all existing custodian files to build a name -> file mapping
2. Scans all PENDING files to find matches
3. Merges staff data from PENDING into existing files
   (only when the existing file has no staff list yet)
4. Archives merged PENDING files

Usage:
    python scripts/merge_pending_by_name.py --dry-run  # Preview
    python scripts/merge_pending_by_name.py            # Apply
"""
|
|
|
|
import os
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, Optional
|
|
import shutil
|
|
|
|
def load_yaml_fast(filepath: Path) -> Optional[Dict]:
    """Load a YAML file on a best-effort basis.

    Args:
        filepath: Path of the YAML file to read (decoded as UTF-8).

    Returns:
        Whatever ``yaml.safe_load`` produces for the file, or ``None`` when
        the file cannot be opened, decoded or parsed.  Callers treat a falsy
        result as "skip this file".
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception:
        # Deliberately broad: any I/O, decoding or YAML error means "skip".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so a long scan can still be aborted.
        return None
|
|
|
|
def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as YAML, replacing any existing content.

    The output is UTF-8 text in block style, with non-ASCII characters kept
    as-is, keys emitted in insertion order and lines wrapped at 120 columns.
    """
    dump_options = {
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'width': 120,
    }
    with open(filepath, 'w', encoding='utf-8') as out:
        yaml.dump(data, out, **dump_options)
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize a name for matching.

    Args:
        name: The raw name.  Falsy values (``None``, ``""``) yield ``""``.
            Non-string values are converted with ``str()`` first, because an
            unquoted numeric name in YAML (e.g. ``1646``) loads as a number.

    Returns:
        The lower-cased name with leading/trailing whitespace removed.
    """
    if not name:
        return ""
    return str(name).lower().strip()
|
|
|
|
def merge_staff(source_data: Dict, target_data: Dict, source_name: str) -> int:
    """Merge staff from source into target.

    The merge is all-or-nothing: the source's staff section is copied only
    when the source has a non-empty ``staff_list`` and the target has none.
    On success a note recording the merge (source name and UTC timestamp) is
    appended to ``target_data['provenance']['notes']``.

    Args:
        source_data: Parsed PENDING record; read only.
        target_data: Parsed existing record; modified in place.
        source_name: Label for the source (the caller passes the file name),
            used in the provenance note.

    Returns:
        Number of staff entries added, or 0 when nothing was merged.
    """
    # A key that is present but null in YAML (``staff:``) loads as None, which
    # ``.get(key, default)`` does not replace - so fall back explicitly.
    source_staff = source_data.get('staff') or {}
    staff_list = source_staff.get('staff_list') or []
    if not staff_list:
        return 0

    # Skip if target already has staff: existing data is never overwritten.
    target_staff = target_data.get('staff') or {}
    if target_staff.get('staff_list'):
        return 0

    # Add staff section (the list object is shared with the source record).
    target_data['staff'] = {
        'provenance': source_staff.get('provenance', {}),
        'staff_list': staff_list,
    }

    # Add provenance note, tolerating a missing/null provenance or notes
    # entry and a single note stored as a plain string.
    provenance = target_data.get('provenance')
    if provenance is None:
        provenance = {}
        target_data['provenance'] = provenance

    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    notes.append(f"Staff data merged from {source_name} on {datetime.now(timezone.utc).isoformat()}")
    provenance['notes'] = notes

    return len(staff_list)
|
|
|
|
def _emic_name(record) -> str:
    """Return ``record['custodian_name']['emic_name']`` as text, or '' if absent/null."""
    if not isinstance(record, dict):
        return ''
    custodian_name = record.get('custodian_name')
    if not isinstance(custodian_name, dict):
        return ''
    value = custodian_name.get('emic_name')
    return str(value) if value else ''


def _staff_count(record) -> int:
    """Return the length of ``record['staff']['staff_list']``, or 0 if absent/null."""
    staff = record.get('staff') if isinstance(record, dict) else None
    if not isinstance(staff, dict):
        return 0
    return len(staff.get('staff_list') or [])


def main():
    """Command-line entry point: merge PENDING staff data into matching files.

    Steps:
      1. Index existing custodian files by normalized emic name.
      2. Look up every PENDING file's normalized emic name in that index.
      3. For each match where the existing file has no staff and the PENDING
         file has some, copy the staff over, rewrite the existing file and
         move the PENDING file into the archive directory.

    With ``--dry-run`` nothing is written or moved; the same decisions are
    only printed.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true',
                        help='preview the merges without writing or moving files')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='directory containing the custodian YAML files')
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    archive_dir = custodian_dir / 'archive' / 'pending_merged_20250109'

    print("=" * 80)
    print("MERGING PENDING FILES BY NAME MATCH")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Step 1: Build name -> file mapping for existing files
    print("Building name index from existing files...")
    existing_by_name = {}

    for f in custodian_dir.glob('[A-Z][A-Z]-[A-Z][A-Z]-*.yaml'):
        if 'PENDING' in f.name or 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        normalized = normalize_name(_emic_name(data))
        # Never index an empty key: it would match every nameless PENDING file.
        # NOTE: when two existing files share a name, the one globbed last wins.
        if normalized:
            existing_by_name[normalized] = (f, data)

    print(f"  Indexed {len(existing_by_name)} existing files")

    # Step 2: Find matching PENDING files
    print("\nScanning PENDING files for matches...")
    matches = []
    no_matches = []

    for f in sorted(custodian_dir.glob('*-XX-XXX-PENDING-*.yaml')):
        if 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        # Unreadable, empty or non-mapping documents are ignored entirely.
        if not data or not isinstance(data, dict):
            continue

        name = _emic_name(data)
        normalized = normalize_name(name)
        staff_count = _staff_count(data)

        if normalized and normalized in existing_by_name:
            existing_file, existing_data = existing_by_name[normalized]
            matches.append({
                'pending_file': f,
                'pending_data': data,
                'existing_file': existing_file,
                'existing_data': existing_data,
                'name': name,
                'staff_count': staff_count,
            })
        else:
            no_matches.append({
                'file': f,
                'name': name,
                'staff_count': staff_count,
            })

    print(f"  Found {len(matches)} PENDING files with matching existing files")
    print(f"  Found {len(no_matches)} PENDING files without matches")

    # Counters are initialised up front so the summary is valid even when
    # there is nothing to merge.
    total_staff = 0
    merged_count = 0
    skipped_count = 0

    # Step 3: Merge matches
    if matches:
        print("\n" + "=" * 80)
        print("MERGING MATCHED FILES")
        print("=" * 80)

        if not args.dry_run:
            archive_dir.mkdir(parents=True, exist_ok=True)

        # Targets that received staff during this run.  In live mode the
        # in-memory target already shows the new staff; tracking it here makes
        # a dry run skip the same later duplicates that a live run would.
        merged_targets = set()

        for m in matches:
            pending_file = m['pending_file']
            existing_file = m['existing_file']
            pending_data = m['pending_data']
            existing_data = m['existing_data']

            # Check if existing already has staff
            if existing_file in merged_targets or _staff_count(existing_data) > 0:
                skipped_count += 1
                continue

            staff_added = m['staff_count']
            if staff_added == 0:
                skipped_count += 1
                continue

            print(f"\n[{'DRY RUN' if args.dry_run else 'MERGE'}] {m['name'][:50]}")
            print(f"  From: {pending_file.name}")
            print(f"  To:   {existing_file.name}")
            print(f"  Staff: {staff_added}")

            if not args.dry_run:
                # Merge staff and persist the target before touching the
                # PENDING file, so a failed write never loses the source.
                merge_staff(pending_data, existing_data, pending_file.name)
                save_yaml(existing_file, existing_data)

                # Move PENDING to archive
                shutil.move(str(pending_file), str(archive_dir / pending_file.name))

            merged_targets.add(existing_file)
            total_staff += staff_added
            merged_count += 1

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Files merged: {merged_count}")
    print(f"Files skipped (already has staff or no staff): {skipped_count}")
    print(f"Total staff added: {total_staff}")
    print(f"Unmatched PENDING files remaining: {len(no_matches)}")

    # Only mention the archive when something was actually moved there.
    if not args.dry_run and merged_count:
        print(f"\nArchived to: {archive_dir}")
|
|
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|