- Updated `entity_review.py` to map email semantic fields from JSON.
- Expanded `email_semantics.py` with additional museum mappings.
- Introduced a new rule in `.opencode/rules/no-duplicate-ontology-mappings.md` to prevent duplicate ontology mappings.
- Added a backup JSON file for entity resolution candidates.
- Created `enrich_email_semantics.py` to enrich candidates with email semantic signals.
- Developed `merge_entity_reviews.py` to merge reviewed decisions from a backup into new candidates.
191 lines
6.4 KiB
Python
191 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich entity resolution candidates with email semantic signals.
|
|
|
|
This script adds email semantic analysis to existing candidates without
|
|
regenerating the entire file, preserving all review decisions.
|
|
|
|
Usage:
|
|
python scripts/enrich_email_semantics.py \
|
|
--input data/entity_resolution/backups/production_20260113_141819.json \
|
|
--output data/entity_resolution/entity_resolution_candidates.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|
|
|
from glam_extractor.entity_resolution.email_semantics import parse_email_semantics
|
|
|
|
|
|
def load_json(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 text and deserialize its JSON content."""
    raw = filepath.read_text(encoding='utf-8')
    return json.loads(raw)
|
|
|
|
|
|
def save_json(filepath: Path, data: dict) -> None:
    """Serialize *data* to *filepath* as pretty-printed, non-ASCII-safe JSON."""
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
|
|
|
|
|
|
def enrich_candidate_with_email_semantics(candidate: dict) -> dict:
    """Add email semantic fields to a candidate.

    Mutates *candidate* in place and returns it. When the candidate has no
    ``wcms_email``, all ``email_*`` fields are set to neutral defaults and
    ``match_signals`` is left untouched. Otherwise the email is parsed with
    ``parse_email_semantics`` and the result is copied onto the candidate;
    any useful findings are appended (once) to ``match_signals``.

    Args:
        candidate: Entity resolution candidate dict (modified in place).

    Returns:
        The same candidate dict, for chaining convenience.
    """
    # Neutral values used when there is no email to analyze. Built fresh on
    # every call so the empty list is never shared between candidates.
    defaults = {
        'email_probable_birth_year': None,
        'email_birth_year_confidence': 0.0,
        'email_institution_name': None,
        'email_institution_type': None,
        'email_institution_ghcid': None,
        'email_extracted_names': [],
        'email_extracted_first_name': None,
        'email_extracted_last_name': None,
        'email_has_dutch_prefix': False,
        'email_is_institutional': False,
    }

    email = candidate.get('wcms_email')
    if not email:
        # Nothing to analyze — stamp defaults so downstream code can rely on
        # the fields being present.
        candidate.update(defaults)
        return candidate

    # Parse email semantics and copy the result onto the candidate.
    result = parse_email_semantics(email)
    candidate.update({
        'email_probable_birth_year': result.probable_birth_year,
        'email_birth_year_confidence': result.birth_year_confidence,
        'email_institution_name': result.institution_name,
        'email_institution_type': result.institution_type,
        'email_institution_ghcid': result.institution_ghcid,
        'email_extracted_names': result.extracted_names,
        'email_extracted_first_name': result.extracted_first_name,
        'email_extracted_last_name': result.extracted_last_name,
        'email_has_dutch_prefix': result.has_dutch_prefix,
        'email_is_institutional': result.is_institutional_domain,
    })

    # Record which kinds of useful info were found, without duplicating
    # signals that are already present.
    signals = candidate.get('match_signals', [])
    triggers = [
        (result.probable_birth_year, 'email_birth_year'),
        (result.institution_name, 'email_institution'),
        (result.extracted_names, 'email_name_parts'),
    ]
    for found, signal in triggers:
        if found and signal not in signals:
            signals.append(signal)
    candidate['match_signals'] = signals

    return candidate
|
|
|
|
|
|
def main():
    """CLI entry point.

    Loads candidates from ``--input``, enriches each one with email semantic
    fields, prints summary statistics, verifies that existing review
    decisions were preserved, and writes the result to ``--output`` (unless
    ``--dry-run`` is given).

    Returns:
        0 on success.

    Exits:
        Status 1 when the input file is missing or review decisions were
        lost during enrichment.
    """
    # Local import: the file-level import only brings in `datetime` itself.
    from datetime import timezone

    parser = argparse.ArgumentParser(
        description='Enrich entity resolution candidates with email semantics'
    )
    parser.add_argument(
        '--input', '-i',
        type=Path,
        required=True,
        help='Path to input candidates file'
    )
    parser.add_argument(
        '--output', '-o',
        type=Path,
        required=True,
        help='Path for output file with email semantics'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show stats without writing'
    )

    args = parser.parse_args()

    # Fail fast before doing any work.
    if not args.input.exists():
        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading candidates from: {args.input}")
    data = load_json(args.input)
    candidates = data.get('candidates', [])
    print(f" - {len(candidates):,} candidates")

    # Count existing reviews so we can verify they survive enrichment.
    reviewed = sum(1 for c in candidates if c.get('reviewed'))
    print(f" - {reviewed} reviewed (will be preserved)")

    # Enrich each candidate in place, tallying what was found.
    print("\nEnriching with email semantics...")
    stats = {
        'total': len(candidates),
        'with_email': 0,
        'birth_year_found': 0,
        'institution_found': 0,
        'names_found': 0,
    }

    for i, candidate in enumerate(candidates):
        if i % 10000 == 0 and i > 0:
            print(f" - Processed {i:,} candidates...")

        enrich_candidate_with_email_semantics(candidate)

        if candidate.get('wcms_email'):
            stats['with_email'] += 1
        if candidate.get('email_probable_birth_year'):
            stats['birth_year_found'] += 1
        if candidate.get('email_institution_name'):
            stats['institution_found'] += 1
        if candidate.get('email_extracted_names'):
            stats['names_found'] += 1

    print("\nEmail semantic enrichment stats:")
    print(f" - Candidates with email: {stats['with_email']:,}")
    print(f" - Birth year extracted: {stats['birth_year_found']:,}")
    print(f" - Institution identified: {stats['institution_found']:,}")
    print(f" - Name parts extracted: {stats['names_found']:,}")

    # Update metadata. Use timezone-aware now() instead of the deprecated
    # datetime.utcnow() (removed-in-spirit since Python 3.12); normalize the
    # "+00:00" offset to the same trailing 'Z' the old code produced.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    data['metadata'] = data.get('metadata', {})
    data['metadata']['email_semantics_enriched_at'] = timestamp
    data['metadata']['email_semantics_stats'] = stats

    # Enrichment must never drop review decisions.
    reviewed_after = sum(1 for c in candidates if c.get('reviewed'))
    print(f"\nReviews preserved: {reviewed_after} (was {reviewed})")

    if reviewed_after != reviewed:
        print("ERROR: Review count mismatch!", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        print(f"\n[DRY RUN] Would write to: {args.output}")
    else:
        # Ensure output directory exists
        args.output.parent.mkdir(parents=True, exist_ok=True)

        print(f"\nWriting enriched file to: {args.output}")
        save_json(args.output, data)

        # Report the size of what we just wrote.
        size_mb = args.output.stat().st_size / (1024 * 1024)
        print(f" - File size: {size_mb:.1f} MB")

    print("\nDone!")
    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
|