All checks were successful
Deploy Frontend / build-and-deploy (push) Successful in 4m31s
- Fix empty import list elements (- # comment pattern) in Laptop, Expenses, FunctionType, Overview, WebLink, Photography classes - Replace valid_from/valid_to slots with temporal_extent in class slots lists - Update slot_usage to use temporal_extent with TimeSpan range - Update examples to use temporal_extent with begin_of_the_begin/end_of_the_end - Fix typo is_or_was_is_or_was_archived_at → is_or_was_archived_at in WebObservation - Add TimeSpan imports to classes using temporal_extent - Fix relative import paths for Timestamp in temporal slots - Fix CustodianIdentifier → Identifier imports in FundingAgenda, ReadingRoomAnnex Schema validates successfully with 902 classes and 2043 slots.
298 lines
12 KiB
Python
298 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge Reviewed Entity Resolution Matches
|
|
|
|
This script processes confirmed match decisions from the review system and merges
|
|
LinkedIn profile data into WCMS person profiles.
|
|
|
|
For each confirmed match (review_decision == 'match'):
|
|
1. Load the WCMS profile from data/person/{wcms_ppid}.json
|
|
2. Load the LinkedIn profile from data/person/{linkedin_ppid}.json
|
|
3. Merge LinkedIn data (profile_data, affiliations, heritage_relevance, web_claims)
|
|
4. Update entity_resolution section with match metadata
|
|
5. Save the merged profile
|
|
|
|
Usage:
|
|
python scripts/merge_reviewed_profiles.py --dry-run # Preview changes
|
|
python scripts/merge_reviewed_profiles.py # Execute merge
|
|
python scripts/merge_reviewed_profiles.py --candidates-file /path/to/file.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import shutil
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Default locations, resolved relative to the repository root (scripts/..)
_PROJECT_ROOT = Path(__file__).parent.parent

DEFAULT_CANDIDATES_FILE = _PROJECT_ROOT / "data" / "entity_resolution" / "entity_resolution_candidates.json"
DEFAULT_PERSON_DIR = _PROJECT_ROOT / "data" / "person"
DEFAULT_BACKUP_DIR = _PROJECT_ROOT / "data" / "backups" / "merge_reviewed"
|
|
|
|
|
|
def load_candidates(candidates_file: Path) -> list[dict]:
    """Load the candidates file and return only entries reviewed as 'match'.

    Prints an error and returns an empty list when the file does not exist,
    so callers can treat "nothing to do" and "file missing" uniformly.
    """
    if not candidates_file.exists():
        print(f"ERROR: Candidates file not found: {candidates_file}")
        return []

    # JSON is UTF-8 by spec — be explicit instead of relying on the
    # platform's locale default encoding (which differs on Windows).
    with open(candidates_file, encoding="utf-8") as f:
        data = json.load(f)

    candidates = data.get("candidates", [])
    matched = [c for c in candidates if c.get("review_decision") == "match"]

    print(f"Loaded {len(candidates)} total candidates, {len(matched)} with 'match' decision")
    return matched
|
|
|
|
|
|
def load_profile(person_dir: Path, ppid: str) -> dict | None:
|
|
"""Load a person profile by PPID."""
|
|
file_path = person_dir / f"{ppid}.json"
|
|
if not file_path.exists():
|
|
return None
|
|
|
|
with open(file_path) as f:
|
|
return json.load(f)
|
|
|
|
|
|
def _merge_data_sources(merged: dict) -> None:
    """Ensure merged['data_sources'] exists and lists both 'wcms' and 'linkedin'."""
    if "data_sources" not in merged:
        merged["data_sources"] = []
    if "wcms" not in merged["data_sources"]:
        merged["data_sources"].append("wcms")
    if "linkedin" not in merged["data_sources"]:
        merged["data_sources"].append("linkedin")


def _merge_profile_data(merged: dict, linkedin_profile: dict) -> None:
    """Copy selected LinkedIn profile_data fields into the merged profile.

    Only fills fields that WCMS is missing or has empty — never overwrites
    a non-empty WCMS value.
    """
    if not linkedin_profile.get("profile_data"):
        return
    if "profile_data" not in merged:
        merged["profile_data"] = {}

    linkedin_data = linkedin_profile["profile_data"]
    wcms_data = merged["profile_data"]

    for key in ["headline", "location", "about", "experience", "education",
                "skills", "languages", "profile_image_url", "linkedin_url"]:
        if key in linkedin_data and linkedin_data[key]:
            # Don't overwrite existing WCMS data
            if key not in wcms_data or not wcms_data[key]:
                wcms_data[key] = linkedin_data[key]


def _merge_affiliations(merged: dict, linkedin_profile: dict) -> None:
    """Append LinkedIn affiliations not already present (case-insensitive name match)."""
    if not linkedin_profile.get("affiliations"):
        return
    if "affiliations" not in merged:
        merged["affiliations"] = []

    existing_custodians = {a.get("custodian_name", "").lower() for a in merged["affiliations"]}

    for affiliation in linkedin_profile["affiliations"]:
        custodian_name = affiliation.get("custodian_name", "").lower()
        if custodian_name and custodian_name not in existing_custodians:
            merged["affiliations"].append(affiliation)
            existing_custodians.add(custodian_name)


def _claim_key(claim: dict) -> tuple[str, str]:
    """Build a hashable dedup key (claim_type, serialized claim_value) for a web claim."""
    claim_type = claim.get("claim_type", "")
    claim_value = claim.get("claim_value")
    # dict/list values are unhashable, so serialize deterministically
    if isinstance(claim_value, (dict, list)):
        value_str = json.dumps(claim_value, sort_keys=True)
    else:
        # Only None collapses to "". Falsy-but-meaningful scalars such as
        # 0, False or "" must keep distinct keys, otherwise distinct claims
        # would be wrongly deduplicated against each other.
        value_str = "" if claim_value is None else str(claim_value)
    return (claim_type, value_str)


def _merge_web_claims(merged: dict, linkedin_profile: dict) -> None:
    """Append LinkedIn web_claims, deduplicated by (claim_type, claim_value)."""
    if not linkedin_profile.get("web_claims"):
        return
    if "web_claims" not in merged:
        merged["web_claims"] = []

    existing_claims = {_claim_key(c) for c in merged["web_claims"]}

    for claim in linkedin_profile["web_claims"]:
        key = _claim_key(claim)
        if key not in existing_claims:
            merged["web_claims"].append(claim)
            existing_claims.add(key)


def _update_entity_resolution(merged: dict, match_candidate: dict, now: str) -> None:
    """Stamp the entity_resolution section with the confirmed-match metadata."""
    if "entity_resolution" not in merged:
        merged["entity_resolution"] = {}

    er = merged["entity_resolution"]
    er["resolved"] = True
    er["resolved_linkedin_ppid"] = match_candidate.get("linkedin_ppid")
    er["resolved_linkedin_slug"] = match_candidate.get("linkedin_slug")
    er["resolution_confidence"] = match_candidate.get("confidence_score", 0)
    er["resolution_signals"] = match_candidate.get("match_signals", [])
    er["resolution_decision"] = "match"
    er["resolved_by"] = match_candidate.get("reviewed_by", "unknown")
    # Falls back to the merge time when the review record carries no timestamp
    er["resolved_at"] = match_candidate.get("reviewed_at", now)
    er["review_notes"] = match_candidate.get("review_notes")
    er["merge_timestamp"] = now
    er["merge_script"] = "merge_reviewed_profiles.py"

    # Mark the matching entry inside the stored candidate list as reviewed
    if er.get("match_candidates"):
        for candidate in er["match_candidates"]:
            if candidate.get("linkedin_ppid") == match_candidate.get("linkedin_ppid"):
                candidate["reviewed"] = True
                candidate["review_decision"] = "match"
                candidate["reviewed_by"] = match_candidate.get("reviewed_by")
                candidate["reviewed_at"] = match_candidate.get("reviewed_at")


def merge_profiles(wcms_profile: dict, linkedin_profile: dict, match_candidate: dict) -> dict:
    """
    Merge LinkedIn profile data into a WCMS profile.

    Preserves all existing WCMS data and adds LinkedIn enrichments:
    profile_data fields, affiliations, heritage_relevance, web_claims and
    the linkedin_slug. Also stamps the entity_resolution section with the
    match metadata from *match_candidate*.

    NOTE: the merge works on a *shallow* copy, so nested structures
    (profile_data, affiliations, web_claims, ...) shared with
    *wcms_profile* are mutated in place.

    Returns the merged profile dict.
    """
    merged = wcms_profile.copy()
    now = datetime.now(timezone.utc).isoformat()

    _merge_data_sources(merged)
    _merge_profile_data(merged, linkedin_profile)
    _merge_affiliations(merged, linkedin_profile)

    # Heritage relevance: LinkedIn's value only fills a gap, never overwrites
    if linkedin_profile.get("heritage_relevance") and not merged.get("heritage_relevance"):
        merged["heritage_relevance"] = linkedin_profile["heritage_relevance"]

    _merge_web_claims(merged, linkedin_profile)

    # Add linkedin_slug if not present
    if linkedin_profile.get("linkedin_slug") and not merged.get("linkedin_slug"):
        merged["linkedin_slug"] = linkedin_profile["linkedin_slug"]

    _update_entity_resolution(merged, match_candidate, now)

    return merged
|
|
|
|
|
|
def save_profile(person_dir: Path, ppid: str, profile: dict, backup_dir: Path | None = None) -> bool:
|
|
"""Save a person profile, optionally creating a backup first."""
|
|
file_path = person_dir / f"{ppid}.json"
|
|
|
|
# Create backup if requested
|
|
if backup_dir and file_path.exists():
|
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
backup_path = backup_dir / f"{ppid}_{timestamp}.json"
|
|
shutil.copy2(file_path, backup_path)
|
|
|
|
# Write profile
|
|
with open(file_path, "w", encoding="utf-8") as f:
|
|
json.dump(profile, f, indent=2, ensure_ascii=False)
|
|
|
|
return True
|
|
|
|
|
|
def main():
    """CLI entry point: merge every confirmed LinkedIn match into its WCMS profile."""
    parser = argparse.ArgumentParser(description="Merge reviewed entity resolution matches")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without saving")
    parser.add_argument("--candidates-file", type=Path, default=DEFAULT_CANDIDATES_FILE,
                        help="Path to entity_resolution_candidates.json")
    parser.add_argument("--person-dir", type=Path, default=DEFAULT_PERSON_DIR,
                        help="Path to data/person directory")
    parser.add_argument("--backup-dir", type=Path, default=DEFAULT_BACKUP_DIR,
                        help="Directory for backups (set to empty to disable)")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of profiles to process (for testing)")
    args = parser.parse_args()

    # Banner: echo the effective configuration before doing any work
    print("=" * 60)
    print("MERGE REVIEWED ENTITY RESOLUTION MATCHES")
    print("=" * 60)
    print(f"Candidates file: {args.candidates_file}")
    print(f"Person directory: {args.person_dir}")
    print(f"Backup directory: {args.backup_dir if args.backup_dir else 'DISABLED'}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Load matched candidates (only entries with review_decision == 'match')
    matched = load_candidates(args.candidates_file)
    if not matched:
        print("No matches to process")
        return

    # NOTE(review): --limit 0 is treated the same as no limit (falsy check)
    if args.limit:
        matched = matched[:args.limit]
        print(f"Limited to {len(matched)} profiles for testing")

    # Process each match; counters feed the summary printed at the end
    stats = {
        "processed": 0,
        "merged": 0,
        "wcms_not_found": 0,
        "linkedin_not_found": 0,
        "already_merged": 0,
        "errors": 0
    }

    for i, candidate in enumerate(matched, 1):
        wcms_ppid = candidate.get("wcms_ppid")
        linkedin_ppid = candidate.get("linkedin_ppid")
        wcms_name = candidate.get("wcms_name", "Unknown")

        print(f"\n[{i}/{len(matched)}] Processing: {wcms_name}")
        print(f"  WCMS: {wcms_ppid}")
        print(f"  LinkedIn: {linkedin_ppid}")

        stats["processed"] += 1

        # Load WCMS profile; skip this candidate if it is missing on disk
        wcms_profile = load_profile(args.person_dir, wcms_ppid)
        if not wcms_profile:
            print(f"  WARNING: WCMS profile not found")
            stats["wcms_not_found"] += 1
            continue

        # Check if already merged against this same LinkedIn PPID (idempotency)
        er = wcms_profile.get("entity_resolution", {})
        if er.get("resolved") and er.get("resolved_linkedin_ppid") == linkedin_ppid:
            print(f"  SKIP: Already merged")
            stats["already_merged"] += 1
            continue

        # Load LinkedIn profile; skip if its JSON file is missing
        linkedin_profile = load_profile(args.person_dir, linkedin_ppid)
        if not linkedin_profile:
            print(f"  WARNING: LinkedIn profile not found")
            stats["linkedin_not_found"] += 1
            continue

        # Merge profiles; a failure on one candidate must not abort the batch
        try:
            merged_profile = merge_profiles(wcms_profile, linkedin_profile, candidate)

            if args.dry_run:
                # Preview only — report what the merged profile would contain
                print(f"  DRY RUN: Would merge LinkedIn data into WCMS profile")
                print(f"    - Affiliations: {len(merged_profile.get('affiliations', []))}")
                print(f"    - Web claims: {len(merged_profile.get('web_claims', []))}")
                print(f"    - Heritage relevance: {merged_profile.get('heritage_relevance', {}).get('is_heritage_relevant')}")
            else:
                backup_dir = args.backup_dir if args.backup_dir else None
                save_profile(args.person_dir, wcms_ppid, merged_profile, backup_dir)
                print(f"  MERGED: Successfully merged profiles")

            # Counted as merged in dry-run mode too (reflects would-be merges)
            stats["merged"] += 1

        except Exception as e:
            # Best-effort batch processing: log and continue with the next match
            print(f"  ERROR: {e}")
            stats["errors"] += 1

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Processed: {stats['processed']}")
    print(f"Merged: {stats['merged']}")
    print(f"Already merged: {stats['already_merged']}")
    print(f"WCMS not found: {stats['wcms_not_found']}")
    print(f"LinkedIn not found: {stats['linkedin_not_found']}")
    print(f"Errors: {stats['errors']}")

    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")
        print("Run without --dry-run to apply changes")


if __name__ == "__main__":
    main()
|