glam/scripts/merge_reviewed_profiles.py
kempersc 043ea868b5
All checks were successful
Deploy Frontend / build-and-deploy (push) Successful in 4m31s
fix(schema): Resolve broken imports after slot migration
- Fix empty import list elements (- # comment pattern) in Laptop, Expenses,
  FunctionType, Overview, WebLink, Photography classes
- Replace valid_from/valid_to slots with temporal_extent in class slots lists
- Update slot_usage to use temporal_extent with TimeSpan range
- Update examples to use temporal_extent with begin_of_the_begin/end_of_the_end
- Fix typo is_or_was_is_or_was_archived_at → is_or_was_archived_at in WebObservation
- Add TimeSpan imports to classes using temporal_extent
- Fix relative import paths for Timestamp in temporal slots
- Fix CustodianIdentifier → Identifier imports in FundingAgenda, ReadingRoomAnnex

Schema validates successfully with 902 classes and 2043 slots.
2026-01-15 12:25:27 +01:00

298 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Merge Reviewed Entity Resolution Matches
This script processes confirmed match decisions from the review system and merges
LinkedIn profile data into WCMS person profiles.
For each confirmed match (review_decision == 'match'):
1. Load the WCMS profile from data/person/{wcms_ppid}.json
2. Load the LinkedIn profile from data/person/{linkedin_ppid}.json
3. Merge LinkedIn data (profile_data, affiliations, heritage_relevance, web_claims)
4. Update entity_resolution section with match metadata
5. Save the merged profile
Usage:
python scripts/merge_reviewed_profiles.py --dry-run # Preview changes
python scripts/merge_reviewed_profiles.py # Execute merge
python scripts/merge_reviewed_profiles.py --candidates-file /path/to/file.json
"""
import argparse
import copy
import json
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Default locations, resolved relative to the repository root (one directory
# above the scripts/ folder that contains this file).
_REPO_ROOT = Path(__file__).parent.parent
DEFAULT_CANDIDATES_FILE = _REPO_ROOT / "data" / "entity_resolution" / "entity_resolution_candidates.json"
DEFAULT_PERSON_DIR = _REPO_ROOT / "data" / "person"
DEFAULT_BACKUP_DIR = _REPO_ROOT / "data" / "backups" / "merge_reviewed"
def load_candidates(candidates_file: Path) -> list[dict]:
    """Read the candidates file and keep only entries reviewed as 'match'.

    Prints an error and returns an empty list when the file does not exist.
    """
    if not candidates_file.exists():
        print(f"ERROR: Candidates file not found: {candidates_file}")
        return []

    with open(candidates_file) as fh:
        payload = json.load(fh)

    all_candidates = payload.get("candidates", [])
    matched = [entry for entry in all_candidates
               if entry.get("review_decision") == "match"]
    print(f"Loaded {len(all_candidates)} total candidates, {len(matched)} with 'match' decision")
    return matched
def load_profile(person_dir: Path, ppid: str) -> dict | None:
"""Load a person profile by PPID."""
file_path = person_dir / f"{ppid}.json"
if not file_path.exists():
return None
with open(file_path) as f:
return json.load(f)
# LinkedIn profile_data fields that may be copied into the WCMS profile when
# the WCMS side has no value for them.
_LINKEDIN_PROFILE_FIELDS = (
    "headline", "location", "about", "experience", "education",
    "skills", "languages", "profile_image_url", "linkedin_url",
)


def merge_profiles(wcms_profile: dict, linkedin_profile: dict, match_candidate: dict) -> dict:
    """
    Merge LinkedIn profile data into a WCMS profile.

    Preserves all WCMS data and adds LinkedIn enrichments (profile_data
    fields, affiliations, heritage_relevance, web_claims, linkedin_slug),
    then records the match metadata under ``entity_resolution``.

    Args:
        wcms_profile: The WCMS person profile. Never mutated.
        linkedin_profile: The LinkedIn person profile to merge from.
        match_candidate: The reviewed candidate record with 'match' decision.

    Returns:
        A new merged profile dict; the input dicts are left untouched.
    """
    # Deep copy: the previous shallow .copy() shared nested lists/dicts with
    # the caller's wcms_profile, so the appends below mutated the input.
    merged = copy.deepcopy(wcms_profile)
    now = datetime.now(timezone.utc).isoformat()

    # Track data provenance of both sources.
    sources = merged.setdefault("data_sources", [])
    for source in ("wcms", "linkedin"):
        if source not in sources:
            sources.append(source)

    _merge_profile_data(merged, linkedin_profile)
    _merge_affiliations(merged, linkedin_profile)

    # Heritage relevance: LinkedIn fills the gap only when WCMS has none.
    if linkedin_profile.get("heritage_relevance") and not merged.get("heritage_relevance"):
        merged["heritage_relevance"] = linkedin_profile["heritage_relevance"]

    _merge_web_claims(merged, linkedin_profile)

    if linkedin_profile.get("linkedin_slug") and not merged.get("linkedin_slug"):
        merged["linkedin_slug"] = linkedin_profile["linkedin_slug"]

    _record_resolution(merged, match_candidate, now)
    return merged


def _merge_profile_data(merged: dict, linkedin_profile: dict) -> None:
    """Copy LinkedIn profile_data fields WCMS is missing; never overwrite WCMS data."""
    linkedin_data = linkedin_profile.get("profile_data")
    if not linkedin_data:
        return
    wcms_data = merged.setdefault("profile_data", {})
    for key in _LINKEDIN_PROFILE_FIELDS:
        # Only fill fields that LinkedIn has a truthy value for and WCMS lacks.
        if linkedin_data.get(key) and not wcms_data.get(key):
            wcms_data[key] = linkedin_data[key]


def _merge_affiliations(merged: dict, linkedin_profile: dict) -> None:
    """Append LinkedIn affiliations not already present (case-insensitive custodian name)."""
    if not linkedin_profile.get("affiliations"):
        return
    affiliations = merged.setdefault("affiliations", [])
    seen = {a.get("custodian_name", "").lower() for a in affiliations}
    for affiliation in linkedin_profile["affiliations"]:
        name = affiliation.get("custodian_name", "").lower()
        if name and name not in seen:
            affiliations.append(affiliation)
            seen.add(name)


def _claim_key(claim: dict) -> tuple[str, str]:
    """Dedup key for a web claim: (claim_type, canonical value string).

    Dict/list values are canonicalized via key-sorted JSON. Scalars use
    str(); only None maps to "". (The previous truthiness test also mapped
    0/False to "", which could merge distinct claims.)
    """
    claim_value = claim.get("claim_value")
    if isinstance(claim_value, (dict, list)):
        value_str = json.dumps(claim_value, sort_keys=True)
    elif claim_value is None:
        value_str = ""
    else:
        value_str = str(claim_value)
    return (claim.get("claim_type", ""), value_str)


def _merge_web_claims(merged: dict, linkedin_profile: dict) -> None:
    """Append LinkedIn web claims, deduplicated by (claim_type, claim_value)."""
    if not linkedin_profile.get("web_claims"):
        return
    claims = merged.setdefault("web_claims", [])
    existing = {_claim_key(c) for c in claims}
    for claim in linkedin_profile["web_claims"]:
        key = _claim_key(claim)
        if key not in existing:
            claims.append(claim)
            existing.add(key)


def _record_resolution(merged: dict, match_candidate: dict, now: str) -> None:
    """Write match metadata into entity_resolution and flag the matched candidate."""
    er = merged.setdefault("entity_resolution", {})
    er["resolved"] = True
    er["resolved_linkedin_ppid"] = match_candidate.get("linkedin_ppid")
    er["resolved_linkedin_slug"] = match_candidate.get("linkedin_slug")
    er["resolution_confidence"] = match_candidate.get("confidence_score", 0)
    er["resolution_signals"] = match_candidate.get("match_signals", [])
    er["resolution_decision"] = "match"
    er["resolved_by"] = match_candidate.get("reviewed_by", "unknown")
    er["resolved_at"] = match_candidate.get("reviewed_at", now)
    er["review_notes"] = match_candidate.get("review_notes")
    er["merge_timestamp"] = now
    er["merge_script"] = "merge_reviewed_profiles.py"
    # Mark the corresponding entry in the stored candidate list as reviewed.
    for candidate in er.get("match_candidates") or []:
        if candidate.get("linkedin_ppid") == match_candidate.get("linkedin_ppid"):
            candidate["reviewed"] = True
            candidate["review_decision"] = "match"
            candidate["reviewed_by"] = match_candidate.get("reviewed_by")
            candidate["reviewed_at"] = match_candidate.get("reviewed_at")
def save_profile(person_dir: Path, ppid: str, profile: dict, backup_dir: Path | None = None) -> bool:
"""Save a person profile, optionally creating a backup first."""
file_path = person_dir / f"{ppid}.json"
# Create backup if requested
if backup_dir and file_path.exists():
backup_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = backup_dir / f"{ppid}_{timestamp}.json"
shutil.copy2(file_path, backup_path)
# Write profile
with open(file_path, "w", encoding="utf-8") as f:
json.dump(profile, f, indent=2, ensure_ascii=False)
return True
def main():
    """CLI entry point: load reviewed matches and merge each into its WCMS profile."""
    parser = argparse.ArgumentParser(description="Merge reviewed entity resolution matches")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without saving")
    parser.add_argument("--candidates-file", type=Path, default=DEFAULT_CANDIDATES_FILE,
                        help="Path to entity_resolution_candidates.json")
    parser.add_argument("--person-dir", type=Path, default=DEFAULT_PERSON_DIR,
                        help="Path to data/person directory")
    parser.add_argument("--backup-dir", type=Path, default=DEFAULT_BACKUP_DIR,
                        help="Directory for backups (set to empty to disable)")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of profiles to process (for testing)")
    args = parser.parse_args()

    rule = "=" * 60
    print(rule)
    print("MERGE REVIEWED ENTITY RESOLUTION MATCHES")
    print(rule)
    print(f"Candidates file: {args.candidates_file}")
    print(f"Person directory: {args.person_dir}")
    print(f"Backup directory: {args.backup_dir if args.backup_dir else 'DISABLED'}")
    print(f"Dry run: {args.dry_run}")
    print()

    matched = load_candidates(args.candidates_file)
    if not matched:
        print("No matches to process")
        return
    if args.limit:
        matched = matched[:args.limit]
        print(f"Limited to {len(matched)} profiles for testing")

    # Per-outcome counters, reported in the summary below.
    stats = dict.fromkeys(
        ["processed", "merged", "wcms_not_found",
         "linkedin_not_found", "already_merged", "errors"], 0)

    total = len(matched)
    for position, candidate in enumerate(matched, start=1):
        wcms_ppid = candidate.get("wcms_ppid")
        linkedin_ppid = candidate.get("linkedin_ppid")
        print(f"\n[{position}/{total}] Processing: {candidate.get('wcms_name', 'Unknown')}")
        print(f" WCMS: {wcms_ppid}")
        print(f" LinkedIn: {linkedin_ppid}")
        stats["processed"] += 1

        wcms_profile = load_profile(args.person_dir, wcms_ppid)
        if not wcms_profile:
            print(" WARNING: WCMS profile not found")
            stats["wcms_not_found"] += 1
            continue

        # Skip profiles already resolved against this same LinkedIn identity.
        er = wcms_profile.get("entity_resolution", {})
        if er.get("resolved") and er.get("resolved_linkedin_ppid") == linkedin_ppid:
            print(" SKIP: Already merged")
            stats["already_merged"] += 1
            continue

        linkedin_profile = load_profile(args.person_dir, linkedin_ppid)
        if not linkedin_profile:
            print(" WARNING: LinkedIn profile not found")
            stats["linkedin_not_found"] += 1
            continue

        try:
            merged_profile = merge_profiles(wcms_profile, linkedin_profile, candidate)
            if args.dry_run:
                print(" DRY RUN: Would merge LinkedIn data into WCMS profile")
                print(f" - Affiliations: {len(merged_profile.get('affiliations', []))}")
                print(f" - Web claims: {len(merged_profile.get('web_claims', []))}")
                print(f" - Heritage relevance: {merged_profile.get('heritage_relevance', {}).get('is_heritage_relevant')}")
            else:
                save_profile(args.person_dir, wcms_ppid, merged_profile,
                             args.backup_dir if args.backup_dir else None)
                print(" MERGED: Successfully merged profiles")
            # NOTE: matches the original behavior — dry runs also count as "merged".
            stats["merged"] += 1
        except Exception as e:
            print(f" ERROR: {e}")
            stats["errors"] += 1

    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    for label, key in [("Processed", "processed"), ("Merged", "merged"),
                       ("Already merged", "already_merged"),
                       ("WCMS not found", "wcms_not_found"),
                       ("LinkedIn not found", "linkedin_not_found"),
                       ("Errors", "errors")]:
        print(f"{label}: {stats[key]}")
    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")
        print("Run without --dry-run to apply changes")
if __name__ == "__main__":
main()