#!/usr/bin/env python3 """ Merge Reviewed Entity Resolution Matches This script processes confirmed match decisions from the review system and merges LinkedIn profile data into WCMS person profiles. For each confirmed match (review_decision == 'match'): 1. Load the WCMS profile from data/person/{wcms_ppid}.json 2. Load the LinkedIn profile from data/person/{linkedin_ppid}.json 3. Merge LinkedIn data (profile_data, affiliations, heritage_relevance, web_claims) 4. Update entity_resolution section with match metadata 5. Save the merged profile Usage: python scripts/merge_reviewed_profiles.py --dry-run # Preview changes python scripts/merge_reviewed_profiles.py # Execute merge python scripts/merge_reviewed_profiles.py --candidates-file /path/to/file.json """ import argparse import json import shutil from datetime import datetime, timezone from pathlib import Path from typing import Any # Default paths DEFAULT_CANDIDATES_FILE = Path(__file__).parent.parent / "data" / "entity_resolution" / "entity_resolution_candidates.json" DEFAULT_PERSON_DIR = Path(__file__).parent.parent / "data" / "person" DEFAULT_BACKUP_DIR = Path(__file__).parent.parent / "data" / "backups" / "merge_reviewed" def load_candidates(candidates_file: Path) -> list[dict]: """Load and filter candidates with 'match' decision.""" if not candidates_file.exists(): print(f"ERROR: Candidates file not found: {candidates_file}") return [] with open(candidates_file) as f: data = json.load(f) candidates = data.get("candidates", []) matched = [c for c in candidates if c.get("review_decision") == "match"] print(f"Loaded {len(candidates)} total candidates, {len(matched)} with 'match' decision") return matched def load_profile(person_dir: Path, ppid: str) -> dict | None: """Load a person profile by PPID.""" file_path = person_dir / f"{ppid}.json" if not file_path.exists(): return None with open(file_path) as f: return json.load(f) def merge_profiles(wcms_profile: dict, linkedin_profile: dict, match_candidate: dict) -> dict: """ Merge LinkedIn profile data into WCMS profile. Preserves all WCMS data and adds LinkedIn enrichments. """ merged = wcms_profile.copy() now = datetime.now(timezone.utc).isoformat() # Add data_sources if not present if "data_sources" not in merged: merged["data_sources"] = [] if "wcms" not in merged["data_sources"]: merged["data_sources"].append("wcms") if "linkedin" not in merged["data_sources"]: merged["data_sources"].append("linkedin") # Merge profile_data from LinkedIn if linkedin_profile.get("profile_data"): if "profile_data" not in merged: merged["profile_data"] = {} linkedin_data = linkedin_profile["profile_data"] wcms_data = merged["profile_data"] # Add LinkedIn fields that WCMS doesn't have for key in ["headline", "location", "about", "experience", "education", "skills", "languages", "profile_image_url", "linkedin_url"]: if key in linkedin_data and linkedin_data[key]: # Don't overwrite existing WCMS data if key not in wcms_data or not wcms_data[key]: wcms_data[key] = linkedin_data[key] # Merge affiliations from LinkedIn if linkedin_profile.get("affiliations"): if "affiliations" not in merged: merged["affiliations"] = [] existing_custodians = {a.get("custodian_name", "").lower() for a in merged["affiliations"]} for affiliation in linkedin_profile["affiliations"]: custodian_name = affiliation.get("custodian_name", "").lower() if custodian_name and custodian_name not in existing_custodians: merged["affiliations"].append(affiliation) existing_custodians.add(custodian_name) # Add heritage_relevance from LinkedIn (if WCMS doesn't have it) if linkedin_profile.get("heritage_relevance") and not merged.get("heritage_relevance"): merged["heritage_relevance"] = linkedin_profile["heritage_relevance"] # Merge web_claims from LinkedIn if linkedin_profile.get("web_claims"): if "web_claims" not in merged: merged["web_claims"] = [] # Add LinkedIn claims with deduplication by (claim_type, claim_value_hash) # claim_value can be dict, so we use JSON serialization for hashing def claim_key(c): claim_type = c.get("claim_type", "") claim_value = c.get("claim_value") # Handle dict/list values by serializing to JSON string if isinstance(claim_value, (dict, list)): value_str = json.dumps(claim_value, sort_keys=True) else: value_str = str(claim_value) if claim_value else "" return (claim_type, value_str) existing_claims = {claim_key(c) for c in merged["web_claims"]} for claim in linkedin_profile["web_claims"]: key = claim_key(claim) if key not in existing_claims: merged["web_claims"].append(claim) existing_claims.add(key) # Add linkedin_slug if not present if linkedin_profile.get("linkedin_slug") and not merged.get("linkedin_slug"): merged["linkedin_slug"] = linkedin_profile["linkedin_slug"] # Update entity_resolution section if "entity_resolution" not in merged: merged["entity_resolution"] = {} er = merged["entity_resolution"] er["resolved"] = True er["resolved_linkedin_ppid"] = match_candidate.get("linkedin_ppid") er["resolved_linkedin_slug"] = match_candidate.get("linkedin_slug") er["resolution_confidence"] = match_candidate.get("confidence_score", 0) er["resolution_signals"] = match_candidate.get("match_signals", []) er["resolution_decision"] = "match" er["resolved_by"] = match_candidate.get("reviewed_by", "unknown") er["resolved_at"] = match_candidate.get("reviewed_at", now) er["review_notes"] = match_candidate.get("review_notes") er["merge_timestamp"] = now er["merge_script"] = "merge_reviewed_profiles.py" # Update match_candidates to mark the matched one if er.get("match_candidates"): for candidate in er["match_candidates"]: if candidate.get("linkedin_ppid") == match_candidate.get("linkedin_ppid"): candidate["reviewed"] = True candidate["review_decision"] = "match" candidate["reviewed_by"] = match_candidate.get("reviewed_by") candidate["reviewed_at"] = match_candidate.get("reviewed_at") return merged def save_profile(person_dir: Path, ppid: str, profile: dict, backup_dir: Path | None = None) -> bool: """Save a person profile, optionally creating a backup first.""" file_path = person_dir / f"{ppid}.json" # Create backup if requested if backup_dir and file_path.exists(): backup_dir.mkdir(parents=True, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") backup_path = backup_dir / f"{ppid}_{timestamp}.json" shutil.copy2(file_path, backup_path) # Write profile with open(file_path, "w", encoding="utf-8") as f: json.dump(profile, f, indent=2, ensure_ascii=False) return True def main(): parser = argparse.ArgumentParser(description="Merge reviewed entity resolution matches") parser.add_argument("--dry-run", action="store_true", help="Preview changes without saving") parser.add_argument("--candidates-file", type=Path, default=DEFAULT_CANDIDATES_FILE, help="Path to entity_resolution_candidates.json") parser.add_argument("--person-dir", type=Path, default=DEFAULT_PERSON_DIR, help="Path to data/person directory") parser.add_argument("--backup-dir", type=Path, default=DEFAULT_BACKUP_DIR, help="Directory for backups (set to empty to disable)") parser.add_argument("--limit", type=int, default=None, help="Limit number of profiles to process (for testing)") args = parser.parse_args() print("=" * 60) print("MERGE REVIEWED ENTITY RESOLUTION MATCHES") print("=" * 60) print(f"Candidates file: {args.candidates_file}") print(f"Person directory: {args.person_dir}") print(f"Backup directory: {args.backup_dir if args.backup_dir else 'DISABLED'}") print(f"Dry run: {args.dry_run}") print() # Load matched candidates matched = load_candidates(args.candidates_file) if not matched: print("No matches to process") return if args.limit: matched = matched[:args.limit] print(f"Limited to {len(matched)} profiles for testing") # Process each match stats = { "processed": 0, "merged": 0, "wcms_not_found": 0, "linkedin_not_found": 0, "already_merged": 0, "errors": 0 } for i, candidate in enumerate(matched, 1): wcms_ppid = candidate.get("wcms_ppid") linkedin_ppid = candidate.get("linkedin_ppid") wcms_name = candidate.get("wcms_name", "Unknown") print(f"\n[{i}/{len(matched)}] Processing: {wcms_name}") print(f" WCMS: {wcms_ppid}") print(f" LinkedIn: {linkedin_ppid}") stats["processed"] += 1 # Load WCMS profile wcms_profile = load_profile(args.person_dir, wcms_ppid) if not wcms_profile: print(f" WARNING: WCMS profile not found") stats["wcms_not_found"] += 1 continue # Check if already merged er = wcms_profile.get("entity_resolution", {}) if er.get("resolved") and er.get("resolved_linkedin_ppid") == linkedin_ppid: print(f" SKIP: Already merged") stats["already_merged"] += 1 continue # Load LinkedIn profile linkedin_profile = load_profile(args.person_dir, linkedin_ppid) if not linkedin_profile: print(f" WARNING: LinkedIn profile not found") stats["linkedin_not_found"] += 1 continue # Merge profiles try: merged_profile = merge_profiles(wcms_profile, linkedin_profile, candidate) if args.dry_run: print(f" DRY RUN: Would merge LinkedIn data into WCMS profile") print(f" - Affiliations: {len(merged_profile.get('affiliations', []))}") print(f" - Web claims: {len(merged_profile.get('web_claims', []))}") print(f" - Heritage relevance: {merged_profile.get('heritage_relevance', {}).get('is_heritage_relevant')}") else: backup_dir = args.backup_dir if args.backup_dir else None save_profile(args.person_dir, wcms_ppid, merged_profile, backup_dir) print(f" MERGED: Successfully merged profiles") stats["merged"] += 1 except Exception as e: print(f" ERROR: {e}") stats["errors"] += 1 # Summary print("\n" + "=" * 60) print("SUMMARY") print("=" * 60) print(f"Processed: {stats['processed']}") print(f"Merged: {stats['merged']}") print(f"Already merged: {stats['already_merged']}") print(f"WCMS not found: {stats['wcms_not_found']}") print(f"LinkedIn not found: {stats['linkedin_not_found']}") print(f"Errors: {stats['errors']}") if args.dry_run: print("\n*** DRY RUN - No changes were made ***") print("Run without --dry-run to apply changes") if __name__ == "__main__": main()