glam/scripts/merge_entity_reviews.py
kempersc f74513e8ef feat: Enhance entity resolution with email semantics and review merging
- Updated `entity_review.py` to map email semantic fields from JSON.
- Expanded `email_semantics.py` with additional museum mappings.
- Introduced a new rule in `.opencode/rules/no-duplicate-ontology-mappings.md` to prevent duplicate ontology mappings.
- Added a backup JSON file for entity resolution candidates.
- Created `enrich_email_semantics.py` to enrich candidates with email semantic signals.
- Developed `merge_entity_reviews.py` to merge reviewed decisions from a backup into new candidates.
2026-01-13 16:43:56 +01:00

173 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Merge reviewed entity resolution decisions from a backup into new candidates.
This script preserves review decisions when regenerating the entity resolution
candidates file with new signals (e.g., email semantics).
Usage:
python scripts/merge_entity_reviews.py \
--new data/entity_resolution/entity_resolution_candidates_new.json \
--backup data/entity_resolution/backups/production_20260113_141819.json \
--output data/entity_resolution/entity_resolution_candidates.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
def load_json(filepath: Path) -> dict:
    """Parse *filepath* as UTF-8 JSON and return the decoded object."""
    with open(filepath, encoding='utf-8') as fh:
        return json.load(fh)
def save_json(filepath: Path, data: dict) -> None:
    """Serialize *data* to *filepath* as pretty-printed UTF-8 JSON.

    Non-ASCII characters are written literally (ensure_ascii=False) and
    the output is indented two spaces for readable diffs.
    """
    with open(filepath, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, indent=2, ensure_ascii=False)
def extract_reviews(backup_data: dict) -> dict:
    """Extract reviewed candidates keyed by (wcms_ppid, linkedin_ppid).

    Only candidates whose 'reviewed' flag is truthy are included; each
    value carries the review fields needed to re-apply the decision.
    """
    return {
        (cand['wcms_ppid'], cand['linkedin_ppid']): {
            'reviewed': True,
            'review_decision': cand.get('review_decision'),
            'reviewed_by': cand.get('reviewed_by'),
            'reviewed_at': cand.get('reviewed_at'),
            'review_notes': cand.get('review_notes'),
        }
        for cand in backup_data.get('candidates', [])
        if cand.get('reviewed')
    }
def merge_reviews(new_data: dict, reviews: dict) -> tuple[int, int]:
    """
    Merge review decisions into new candidates (mutates new_data in place).

    Returns:
        Tuple of (merged_count, orphaned_count), where orphaned reviews
        are those present in the backup but matching no new candidate.
    """
    merged = 0
    matched_keys = set()
    for cand in new_data.get('candidates', []):
        key = (cand['wcms_ppid'], cand['linkedin_ppid'])
        review = reviews.get(key)
        if review is None:
            continue
        # Re-apply the recorded review fields onto the regenerated candidate.
        cand['reviewed'] = review['reviewed']
        cand['review_decision'] = review['review_decision']
        cand['reviewed_by'] = review['reviewed_by']
        cand['reviewed_at'] = review['reviewed_at']
        # Notes are optional; only set the key when there is something to keep.
        notes = review.get('review_notes')
        if notes:
            cand['review_notes'] = notes
        merged += 1
        matched_keys.add(key)
    # Reviews whose key matched no new candidate are orphaned.
    return merged, len(reviews) - len(matched_keys)
def main():
    """CLI entry point: merge backup review decisions into new candidates.

    Parses arguments, validates and loads both JSON files, merges the
    review fields into the new candidates, records merge provenance in
    the metadata, and writes the result unless --dry-run is given.
    Exits with status 1 when an input file is missing; returns 0 on success.
    """
    parser = argparse.ArgumentParser(
        description='Merge entity resolution reviews from backup into new candidates'
    )
    parser.add_argument(
        '--new', '-n',
        type=Path,
        required=True,
        help='Path to newly generated candidates file'
    )
    parser.add_argument(
        '--backup', '-b',
        type=Path,
        required=True,
        help='Path to backup file containing reviews'
    )
    parser.add_argument(
        '--output', '-o',
        type=Path,
        required=True,
        help='Path for output file with merged reviews'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be merged without writing'
    )
    args = parser.parse_args()

    # Validate input files exist before loading anything.
    if not args.new.exists():
        print(f"Error: New candidates file not found: {args.new}", file=sys.stderr)
        sys.exit(1)
    if not args.backup.exists():
        print(f"Error: Backup file not found: {args.backup}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading new candidates from: {args.new}")
    new_data = load_json(args.new)
    new_count = len(new_data.get('candidates', []))
    print(f" - {new_count:,} candidates")

    print(f"\nLoading backup from: {args.backup}")
    backup_data = load_json(args.backup)
    backup_count = len(backup_data.get('candidates', []))
    print(f" - {backup_count:,} candidates")

    # Extract reviews from backup
    reviews = extract_reviews(backup_data)
    print(f" - {len(reviews)} reviewed candidates to merge")

    # Show review breakdown
    match_count = sum(1 for r in reviews.values() if r['review_decision'] == 'match')
    not_match_count = sum(1 for r in reviews.values() if r['review_decision'] == 'not_match')
    print(f" - {match_count} matches")
    print(f" - {not_match_count} not-matches")

    # Merge reviews
    print("\nMerging reviews...")
    merged, orphaned = merge_reviews(new_data, reviews)
    print(f" - {merged} reviews merged successfully")
    if orphaned > 0:
        print(f" - WARNING: {orphaned} reviews could not be matched (candidates removed?)")

    # Record merge provenance in the output metadata.
    new_data['metadata'] = new_data.get('metadata', {})
    new_data['metadata']['reviews_merged_from'] = str(args.backup)
    # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
    # datetime; use an aware UTC timestamp and normalize the '+00:00'
    # offset to the same 'Z' suffix the previous code emitted.
    new_data['metadata']['reviews_merged_at'] = (
        datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    )
    new_data['metadata']['reviews_merged_count'] = merged

    if args.dry_run:
        print("\n[DRY RUN] Would write to:", args.output)
        print(f" - {new_count:,} candidates")
        print(f" - {merged} with reviews")
    else:
        # Ensure output directory exists
        args.output.parent.mkdir(parents=True, exist_ok=True)
        print(f"\nWriting merged file to: {args.output}")
        save_json(args.output, new_data)
        # Verify the write round-trips and the reviews survived serialization.
        verify_data = load_json(args.output)
        verify_reviewed = sum(1 for c in verify_data['candidates'] if c.get('reviewed'))
        print(f" - Verified: {verify_reviewed} reviewed candidates in output")

    print("\nDone!")
    return 0
if __name__ == '__main__':
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())