#!/usr/bin/env python3
"""
Merge reviewed entity resolution decisions from a backup into new candidates.

This script preserves review decisions when regenerating the entity
resolution candidates file with new signals (e.g., email semantics).

Usage:
    python scripts/merge_entity_reviews.py \
        --new data/entity_resolution/entity_resolution_candidates_new.json \
        --backup data/entity_resolution/backups/production_20260113_141819.json \
        --output data/entity_resolution/entity_resolution_candidates.json
"""

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path


def load_json(filepath: Path) -> dict:
    """Load and parse a JSON file as UTF-8."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(filepath: Path, data: dict) -> None:
    """Save JSON with 2-space indentation, preserving non-ASCII characters."""
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def extract_reviews(backup_data: dict) -> dict:
    """Extract reviewed candidates keyed by (wcms_ppid, linkedin_ppid).

    Args:
        backup_data: Parsed backup file; candidates live under 'candidates'.

    Returns:
        Mapping of (wcms_ppid, linkedin_ppid) -> dict of review fields, for
        every backup candidate whose 'reviewed' flag is truthy.
    """
    reviews = {}
    for candidate in backup_data.get('candidates', []):
        if candidate.get('reviewed'):
            key = (candidate['wcms_ppid'], candidate['linkedin_ppid'])
            reviews[key] = {
                'reviewed': True,
                'review_decision': candidate.get('review_decision'),
                'reviewed_by': candidate.get('reviewed_by'),
                'reviewed_at': candidate.get('reviewed_at'),
                'review_notes': candidate.get('review_notes'),
            }
    return reviews


def merge_reviews(new_data: dict, reviews: dict) -> tuple[int, int]:
    """
    Merge review decisions into new candidates.

    Mutates `new_data` in place: each candidate whose (wcms_ppid,
    linkedin_ppid) pair appears in `reviews` receives the review fields.

    Args:
        new_data: Newly generated candidates file contents.
        reviews: Output of extract_reviews().

    Returns:
        Tuple of (merged_count, orphaned_count). Orphaned reviews are those
        present in the backup whose candidate pair no longer exists in the
        new file.
    """
    merged_count = 0
    matched_keys = set()

    for candidate in new_data.get('candidates', []):
        key = (candidate['wcms_ppid'], candidate['linkedin_ppid'])
        if key in reviews:
            # Copy all review fields
            review = reviews[key]
            candidate['reviewed'] = review['reviewed']
            candidate['review_decision'] = review['review_decision']
            candidate['reviewed_by'] = review['reviewed_by']
            candidate['reviewed_at'] = review['reviewed_at']
            # Only carry notes over when present, so candidates without
            # notes don't gain a null/empty field.
            if review.get('review_notes'):
                candidate['review_notes'] = review['review_notes']

            merged_count += 1
            matched_keys.add(key)

    # Count orphaned reviews (in backup but not in new file)
    orphaned_count = len(reviews) - len(matched_keys)

    return merged_count, orphaned_count


def main():
    """CLI entry point: load, merge, report, and (unless --dry-run) write."""
    parser = argparse.ArgumentParser(
        description='Merge entity resolution reviews from backup into new candidates'
    )
    parser.add_argument(
        '--new', '-n',
        type=Path,
        required=True,
        help='Path to newly generated candidates file'
    )
    parser.add_argument(
        '--backup', '-b',
        type=Path,
        required=True,
        help='Path to backup file containing reviews'
    )
    parser.add_argument(
        '--output', '-o',
        type=Path,
        required=True,
        help='Path for output file with merged reviews'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be merged without writing'
    )

    args = parser.parse_args()

    # Validate input files exist
    if not args.new.exists():
        print(f"Error: New candidates file not found: {args.new}", file=sys.stderr)
        sys.exit(1)
    if not args.backup.exists():
        print(f"Error: Backup file not found: {args.backup}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading new candidates from: {args.new}")
    new_data = load_json(args.new)
    new_count = len(new_data.get('candidates', []))
    print(f"  - {new_count:,} candidates")

    print(f"\nLoading backup from: {args.backup}")
    backup_data = load_json(args.backup)
    backup_count = len(backup_data.get('candidates', []))
    print(f"  - {backup_count:,} candidates")

    # Extract reviews from backup
    reviews = extract_reviews(backup_data)
    print(f"  - {len(reviews)} reviewed candidates to merge")

    # Show review breakdown
    match_count = sum(1 for r in reviews.values() if r['review_decision'] == 'match')
    not_match_count = sum(1 for r in reviews.values() if r['review_decision'] == 'not_match')
    print(f"    - {match_count} matches")
    print(f"    - {not_match_count} not-matches")

    # Merge reviews
    print("\nMerging reviews...")
    merged, orphaned = merge_reviews(new_data, reviews)
    print(f"  - {merged} reviews merged successfully")
    if orphaned > 0:
        print(f"  - WARNING: {orphaned} reviews could not be matched (candidates removed?)")

    # Update metadata with an audit trail of this merge.
    # datetime.utcnow() is deprecated (3.12+) and naive; use an aware UTC
    # timestamp and keep the trailing-'Z' ISO-8601 form.
    new_data['metadata'] = new_data.get('metadata', {})
    new_data['metadata']['reviews_merged_from'] = str(args.backup)
    new_data['metadata']['reviews_merged_at'] = (
        datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    )
    new_data['metadata']['reviews_merged_count'] = merged

    if args.dry_run:
        print("\n[DRY RUN] Would write to:", args.output)
        print(f"  - {new_count:,} candidates")
        print(f"  - {merged} with reviews")
    else:
        # Ensure output directory exists
        args.output.parent.mkdir(parents=True, exist_ok=True)

        print(f"\nWriting merged file to: {args.output}")
        save_json(args.output, new_data)

        # Verify the written file round-trips with the expected review count.
        verify_data = load_json(args.output)
        verify_reviewed = sum(1 for c in verify_data['candidates'] if c.get('reviewed'))
        print(f"  - Verified: {verify_reviewed} reviewed candidates in output")

    print("\nDone!")
    return 0


if __name__ == '__main__':
    sys.exit(main())