glam/scripts/enrich_email_semantics.py
kempersc f74513e8ef feat: Enhance entity resolution with email semantics and review merging
- Updated `entity_review.py` to map email semantic fields from JSON.
- Expanded `email_semantics.py` with additional museum mappings.
- Introduced a new rule in `.opencode/rules/no-duplicate-ontology-mappings.md` to prevent duplicate ontology mappings.
- Added a backup JSON file for entity resolution candidates.
- Created `enrich_email_semantics.py` to enrich candidates with email semantic signals.
- Developed `merge_entity_reviews.py` to merge reviewed decisions from a backup into new candidates.
2026-01-13 16:43:56 +01:00

191 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Enrich entity resolution candidates with email semantic signals.

This script adds email semantic analysis to existing candidates without
regenerating the entire file, preserving all review decisions.

Usage:
    python scripts/enrich_email_semantics.py \
        --input data/entity_resolution/backups/production_20260113_141819.json \
        --output data/entity_resolution/entity_resolution_candidates.json
"""
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from glam_extractor.entity_resolution.email_semantics import parse_email_semantics
def load_json(filepath: Path) -> dict:
    """Read the UTF-8 encoded file at ``filepath`` and return its parsed JSON.

    Args:
        filepath: Location of the JSON document to read.

    Returns:
        The decoded JSON value (annotated as a dict by this script).
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def save_json(filepath: Path, data: dict) -> None:
    """Write ``data`` to ``filepath`` as indented UTF-8 JSON, replacing it atomically.

    The document is first serialized to a sibling ``<name>.tmp`` file and only
    renamed over ``filepath`` once serialization has finished. If serialization
    fails, an existing file at ``filepath`` is therefore left untouched instead
    of being truncated or half-written.

    Args:
        filepath: Destination path; its parent directory must already exist.
        data: JSON-serializable value to write (2-space indent, non-ASCII
            characters written as-is).

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or renamed.
    """
    target = Path(filepath)
    # Keep the temp file in the same directory so the rename stays on one filesystem.
    tmp_path = target.with_name(target.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        tmp_path.replace(target)
    finally:
        # After a successful rename the temp file is gone; this only cleans up failures.
        if tmp_path.exists():
            tmp_path.unlink()
def enrich_candidate_with_email_semantics(candidate: dict) -> dict:
    """Add email semantic fields to a candidate.

    The candidate is modified in place and also returned. Ten ``email_*`` keys
    are always written so every candidate ends up with the same set of fields:
    when ``wcms_email`` is missing or empty they receive neutral defaults,
    otherwise they are copied from the ``parse_email_semantics`` result.

    When an email was parsed, ``match_signals`` is additionally extended
    (without duplicates) with ``email_birth_year``, ``email_institution`` and
    ``email_name_parts`` for each piece of information that was found.

    Args:
        candidate: Candidate record; only ``wcms_email`` and ``match_signals``
            are read.

    Returns:
        The same ``candidate`` object, enriched.
    """
    email = candidate.get('wcms_email')
    if not email:
        # No email to analyze: write defaults and leave match_signals untouched.
        candidate.update({
            'email_probable_birth_year': None,
            'email_birth_year_confidence': 0.0,
            'email_institution_name': None,
            'email_institution_type': None,
            'email_institution_ghcid': None,
            'email_extracted_names': [],
            'email_extracted_first_name': None,
            'email_extracted_last_name': None,
            'email_has_dutch_prefix': False,
            'email_is_institutional': False,
        })
        return candidate

    # Parse email semantics
    result = parse_email_semantics(email)

    # Add fields to candidate
    candidate.update({
        'email_probable_birth_year': result.probable_birth_year,
        'email_birth_year_confidence': result.birth_year_confidence,
        'email_institution_name': result.institution_name,
        'email_institution_type': result.institution_type,
        'email_institution_ghcid': result.institution_ghcid,
        'email_extracted_names': result.extracted_names,
        'email_extracted_first_name': result.extracted_first_name,
        'email_extracted_last_name': result.extracted_last_name,
        'email_has_dutch_prefix': result.has_dutch_prefix,
        'email_is_institutional': result.is_institutional_domain,
    })

    # Update match signals if we found useful info. A JSON ``null`` value is
    # treated like a missing list so appending below cannot fail on None.
    signals = candidate.get('match_signals')
    if signals is None:
        signals = []
    for found, signal in (
        (result.probable_birth_year, 'email_birth_year'),
        (result.institution_name, 'email_institution'),
        (result.extracted_names, 'email_name_parts'),
    ):
        if found and signal not in signals:
            signals.append(signal)
    candidate['match_signals'] = signals
    return candidate
def main():
    """Command-line entry point: enrich a candidates file with email semantics.

    Loads the ``--input`` JSON file, enriches every entry of its ``candidates``
    list in place, prints summary statistics, records a UTC timestamp and the
    statistics under ``metadata``, and writes the result to ``--output``
    unless ``--dry-run`` is given.

    Returns:
        0 on success.

    Raises:
        SystemExit: With status 1 when the input file does not exist or when
            the number of reviewed candidates differs after enrichment.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    parser = argparse.ArgumentParser(
        description='Enrich entity resolution candidates with email semantics'
    )
    parser.add_argument(
        '--input', '-i',
        type=Path,
        required=True,
        help='Path to input candidates file'
    )
    parser.add_argument(
        '--output', '-o',
        type=Path,
        required=True,
        help='Path for output file with email semantics'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show stats without writing'
    )
    args = parser.parse_args()

    # Validate input file exists
    if not args.input.exists():
        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading candidates from: {args.input}")
    data = load_json(args.input)
    candidates = data.get('candidates', [])
    print(f" - {len(candidates):,} candidates")

    # Count existing reviews so we can verify none were lost afterwards
    reviewed = sum(1 for c in candidates if c.get('reviewed'))
    print(f" - {reviewed} reviewed (will be preserved)")

    # Enrich each candidate
    print("\nEnriching with email semantics...")
    stats = {
        'total': len(candidates),
        'with_email': 0,
        'birth_year_found': 0,
        'institution_found': 0,
        'names_found': 0,
    }
    for i, candidate in enumerate(candidates):
        # Progress line every 10,000 candidates (but not before the first one)
        if i % 10000 == 0 and i > 0:
            print(f" - Processed {i:,} candidates...")
        enrich_candidate_with_email_semantics(candidate)
        if candidate.get('wcms_email'):
            stats['with_email'] += 1
        if candidate.get('email_probable_birth_year'):
            stats['birth_year_found'] += 1
        if candidate.get('email_institution_name'):
            stats['institution_found'] += 1
        if candidate.get('email_extracted_names'):
            stats['names_found'] += 1

    print("\nEmail semantic enrichment stats:")
    print(f" - Candidates with email: {stats['with_email']:,}")
    print(f" - Birth year extracted: {stats['birth_year_found']:,}")
    print(f" - Institution identified: {stats['institution_found']:,}")
    print(f" - Name parts extracted: {stats['names_found']:,}")

    # Update metadata. datetime.utcnow() is deprecated since Python 3.12, so
    # take an aware UTC time and drop tzinfo to keep the "<naive ISO>Z" format.
    data['metadata'] = data.get('metadata', {})
    enriched_at = datetime.now(timezone.utc).replace(tzinfo=None)
    data['metadata']['email_semantics_enriched_at'] = enriched_at.isoformat() + 'Z'
    data['metadata']['email_semantics_stats'] = stats

    # Verify reviews preserved
    reviewed_after = sum(1 for c in candidates if c.get('reviewed'))
    print(f"\nReviews preserved: {reviewed_after} (was {reviewed})")
    if reviewed_after != reviewed:
        print("ERROR: Review count mismatch!", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        print(f"\n[DRY RUN] Would write to: {args.output}")
    else:
        # Ensure output directory exists
        args.output.parent.mkdir(parents=True, exist_ok=True)
        print(f"\nWriting enriched file to: {args.output}")
        save_json(args.output, data)
        # Get file size
        size_mb = args.output.stat().st_size / (1024 * 1024)
        print(f" - File size: {size_mb:.1f} MB")

    print("\nDone!")
    return 0
# Script entry point: run the CLI and use main()'s return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())