- Updated `entity_review.py` to map email semantic fields from JSON. - Expanded `email_semantics.py` with additional museum mappings. - Introduced a new rule in `.opencode/rules/no-duplicate-ontology-mappings.md` to prevent duplicate ontology mappings. - Added a backup JSON file for entity resolution candidates. - Created `enrich_email_semantics.py` to enrich candidates with email semantic signals. - Developed `merge_entity_reviews.py` to merge reviewed decisions from a backup into new candidates.
173 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge reviewed entity resolution decisions from a backup into new candidates.
|
|
|
|
This script preserves review decisions when regenerating the entity resolution
|
|
candidates file with new signals (e.g., email semantics).
|
|
|
|
Usage:
|
|
python scripts/merge_entity_reviews.py \
|
|
--new data/entity_resolution/entity_resolution_candidates_new.json \
|
|
--backup data/entity_resolution/backups/production_20260113_141819.json \
|
|
--output data/entity_resolution/entity_resolution_candidates.json
|
|
"""
|
|
|
|
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
|
|
def load_json(filepath: Path) -> dict:
    """Read *filepath* and return its parsed JSON content."""
    with filepath.open(encoding='utf-8') as handle:
        return json.load(handle)
|
|
|
|
|
|
def save_json(filepath: Path, data: dict) -> None:
    """Serialize *data* to *filepath* as pretty-printed UTF-8 JSON."""
    text = json.dumps(data, indent=2, ensure_ascii=False)
    filepath.write_text(text, encoding='utf-8')
|
|
|
|
|
|
def extract_reviews(backup_data: dict) -> dict:
    """Collect review metadata for every reviewed candidate in a backup.

    Returns a dict keyed by (wcms_ppid, linkedin_ppid) tuples; each value
    holds the review decision, reviewer, timestamp, and notes.
    """
    review_fields = ('review_decision', 'reviewed_by', 'reviewed_at', 'review_notes')
    collected = {}
    for entry in backup_data.get('candidates', []):
        # Skip anything that was never reviewed.
        if not entry.get('reviewed'):
            continue
        record = {'reviewed': True}
        for field in review_fields:
            record[field] = entry.get(field)
        collected[(entry['wcms_ppid'], entry['linkedin_ppid'])] = record
    return collected
|
|
|
|
|
|
def merge_reviews(new_data: dict, reviews: dict) -> tuple[int, int]:
    """Apply saved review decisions to the freshly generated candidates.

    Mutates *new_data* in place.

    Returns:
        Tuple of (merged_count, orphaned_count), where orphaned reviews are
        those whose candidate pair no longer exists in the new file.
    """
    merged_total = 0
    seen_pairs = set()

    for entry in new_data.get('candidates', []):
        pair = (entry['wcms_ppid'], entry['linkedin_ppid'])
        review = reviews.get(pair)
        if review is None:
            continue
        # Carry over every review field; notes only when present.
        entry['reviewed'] = review['reviewed']
        entry['review_decision'] = review['review_decision']
        entry['reviewed_by'] = review['reviewed_by']
        entry['reviewed_at'] = review['reviewed_at']
        notes = review.get('review_notes')
        if notes:
            entry['review_notes'] = notes
        merged_total += 1
        seen_pairs.add(pair)

    # Reviews present in the backup but absent from the new candidates.
    orphaned_total = len(reviews) - len(seen_pairs)

    return merged_total, orphaned_total
|
|
|
|
|
|
def main():
    """CLI entry point: merge backup review decisions into new candidates.

    Parses --new/--backup/--output (+ optional --dry-run), validates that
    both input files exist, merges reviews, stamps merge provenance into
    the output metadata, and writes (unless --dry-run).

    Returns:
        0 on success. Exits with status 1 if an input file is missing.
    """
    parser = argparse.ArgumentParser(
        description='Merge entity resolution reviews from backup into new candidates'
    )
    parser.add_argument(
        '--new', '-n',
        type=Path,
        required=True,
        help='Path to newly generated candidates file'
    )
    parser.add_argument(
        '--backup', '-b',
        type=Path,
        required=True,
        help='Path to backup file containing reviews'
    )
    parser.add_argument(
        '--output', '-o',
        type=Path,
        required=True,
        help='Path for output file with merged reviews'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be merged without writing'
    )

    args = parser.parse_args()

    # Fail fast if either input file is missing (messages unchanged).
    for label, path in (('New candidates', args.new), ('Backup', args.backup)):
        if not path.exists():
            print(f"Error: {label} file not found: {path}", file=sys.stderr)
            sys.exit(1)

    print(f"Loading new candidates from: {args.new}")
    new_data = load_json(args.new)
    new_count = len(new_data.get('candidates', []))
    print(f" - {new_count:,} candidates")

    print(f"\nLoading backup from: {args.backup}")
    backup_data = load_json(args.backup)
    backup_count = len(backup_data.get('candidates', []))
    print(f" - {backup_count:,} candidates")

    # Extract only the reviewed entries from the backup.
    reviews = extract_reviews(backup_data)
    print(f" - {len(reviews)} reviewed candidates to merge")

    # Show review breakdown by decision.
    match_count = sum(1 for r in reviews.values() if r['review_decision'] == 'match')
    not_match_count = sum(1 for r in reviews.values() if r['review_decision'] == 'not_match')
    print(f" - {match_count} matches")
    print(f" - {not_match_count} not-matches")

    # Merge reviews into the new candidates (mutates new_data in place).
    print("\nMerging reviews...")
    merged, orphaned = merge_reviews(new_data, reviews)

    print(f" - {merged} reviews merged successfully")
    if orphaned > 0:
        print(f" - WARNING: {orphaned} reviews could not be matched (candidates removed?)")

    # Record merge provenance in the output metadata.
    new_data['metadata'] = new_data.get('metadata', {})
    new_data['metadata']['reviews_merged_from'] = str(args.backup)
    # datetime.utcnow() is deprecated since Python 3.12; use an aware UTC
    # timestamp and keep the trailing-'Z' format the previous code produced.
    merged_at = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    new_data['metadata']['reviews_merged_at'] = merged_at
    new_data['metadata']['reviews_merged_count'] = merged

    if args.dry_run:
        print("\n[DRY RUN] Would write to:", args.output)
        print(f" - {new_count:,} candidates")
        print(f" - {merged} with reviews")
    else:
        # Ensure output directory exists before writing.
        args.output.parent.mkdir(parents=True, exist_ok=True)

        print(f"\nWriting merged file to: {args.output}")
        save_json(args.output, new_data)

        # Re-read the file to confirm the reviews survived serialization.
        verify_data = load_json(args.output)
        verify_reviewed = sum(1 for c in verify_data['candidates'] if c.get('reviewed'))
        print(f" - Verified: {verify_reviewed} reviewed candidates in output")

    print("\nDone!")
    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s return code as the process exit status.
    sys.exit(main())
|