All checks were successful
Deploy Frontend / build-and-deploy (push) Successful in 4m31s
- Fix empty import list elements (- # comment pattern) in Laptop, Expenses, FunctionType, Overview, WebLink, Photography classes - Replace valid_from/valid_to slots with temporal_extent in class slots lists - Update slot_usage to use temporal_extent with TimeSpan range - Update examples to use temporal_extent with begin_of_the_begin/end_of_the_end - Fix typo is_or_was_is_or_was_archived_at → is_or_was_archived_at in WebObservation - Add TimeSpan imports to classes using temporal_extent - Fix relative import paths for Timestamp in temporal slots - Fix CustodianIdentifier → Identifier imports in FundingAgenda, ReadingRoomAnnex Schema validates successfully with 902 classes and 2043 slots.
298 lines
12 KiB
Python
298 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge Reviewed Entity Resolution Matches
|
|
|
|
This script processes confirmed match decisions from the review system and merges
|
|
LinkedIn profile data into WCMS person profiles.
|
|
|
|
For each confirmed match (review_decision == 'match'):
|
|
1. Load the WCMS profile from data/person/{wcms_ppid}.json
|
|
2. Load the LinkedIn profile from data/person/{linkedin_ppid}.json
|
|
3. Merge LinkedIn data (profile_data, affiliations, heritage_relevance, web_claims)
|
|
4. Update entity_resolution section with match metadata
|
|
5. Save the merged profile
|
|
|
|
Usage:
|
|
python scripts/merge_reviewed_profiles.py --dry-run # Preview changes
|
|
python scripts/merge_reviewed_profiles.py # Execute merge
|
|
python scripts/merge_reviewed_profiles.py --candidates-file /path/to/file.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import shutil
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Default locations, resolved relative to the repository root (scripts/..)
_PROJECT_ROOT = Path(__file__).parent.parent

DEFAULT_CANDIDATES_FILE = _PROJECT_ROOT / "data" / "entity_resolution" / "entity_resolution_candidates.json"
DEFAULT_PERSON_DIR = _PROJECT_ROOT / "data" / "person"
DEFAULT_BACKUP_DIR = _PROJECT_ROOT / "data" / "backups" / "merge_reviewed"
|
|
|
|
|
|
def load_candidates(candidates_file: Path) -> list[dict]:
    """Load the candidates file and return only entries reviewed as 'match'.

    Prints an error and returns an empty list when the file does not exist,
    so callers can treat "nothing to do" and "file missing" uniformly.
    """
    if not candidates_file.exists():
        print(f"ERROR: Candidates file not found: {candidates_file}")
        return []

    # JSON is UTF-8 by spec — be explicit instead of relying on the
    # platform's locale default encoding (which differs on Windows).
    with open(candidates_file, encoding="utf-8") as f:
        data = json.load(f)

    candidates = data.get("candidates", [])
    matched = [c for c in candidates if c.get("review_decision") == "match"]

    print(f"Loaded {len(candidates)} total candidates, {len(matched)} with 'match' decision")
    return matched
|
|
|
|
|
|
def load_profile(person_dir: Path, ppid: str) -> dict | None:
|
|
"""Load a person profile by PPID."""
|
|
file_path = person_dir / f"{ppid}.json"
|
|
if not file_path.exists():
|
|
return None
|
|
|
|
with open(file_path) as f:
|
|
return json.load(f)
|
|
|
|
|
|
def _merge_data_sources(merged: dict) -> None:
    """Ensure merged['data_sources'] exists and lists both 'wcms' and 'linkedin'."""
    if "data_sources" not in merged:
        merged["data_sources"] = []
    if "wcms" not in merged["data_sources"]:
        merged["data_sources"].append("wcms")
    if "linkedin" not in merged["data_sources"]:
        merged["data_sources"].append("linkedin")


def _merge_profile_data(merged: dict, linkedin_profile: dict) -> None:
    """Copy selected LinkedIn profile_data fields into the merged profile.

    Only fills fields that WCMS is missing or has empty — never overwrites
    a non-empty WCMS value.
    """
    if not linkedin_profile.get("profile_data"):
        return
    if "profile_data" not in merged:
        merged["profile_data"] = {}

    linkedin_data = linkedin_profile["profile_data"]
    wcms_data = merged["profile_data"]

    for key in ["headline", "location", "about", "experience", "education",
                "skills", "languages", "profile_image_url", "linkedin_url"]:
        if key in linkedin_data and linkedin_data[key]:
            # Don't overwrite existing WCMS data
            if key not in wcms_data or not wcms_data[key]:
                wcms_data[key] = linkedin_data[key]


def _merge_affiliations(merged: dict, linkedin_profile: dict) -> None:
    """Append LinkedIn affiliations not already present (case-insensitive name match)."""
    if not linkedin_profile.get("affiliations"):
        return
    if "affiliations" not in merged:
        merged["affiliations"] = []

    existing_custodians = {a.get("custodian_name", "").lower() for a in merged["affiliations"]}

    for affiliation in linkedin_profile["affiliations"]:
        custodian_name = affiliation.get("custodian_name", "").lower()
        if custodian_name and custodian_name not in existing_custodians:
            merged["affiliations"].append(affiliation)
            existing_custodians.add(custodian_name)


def _claim_key(claim: dict) -> tuple[str, str]:
    """Build a hashable dedup key (claim_type, serialized claim_value) for a web claim."""
    claim_type = claim.get("claim_type", "")
    claim_value = claim.get("claim_value")
    # dict/list values are unhashable, so serialize deterministically
    if isinstance(claim_value, (dict, list)):
        value_str = json.dumps(claim_value, sort_keys=True)
    else:
        # Only None collapses to "". Falsy-but-meaningful scalars such as
        # 0, False or "" must keep distinct keys, otherwise distinct claims
        # would be wrongly deduplicated against each other.
        value_str = "" if claim_value is None else str(claim_value)
    return (claim_type, value_str)


def _merge_web_claims(merged: dict, linkedin_profile: dict) -> None:
    """Append LinkedIn web_claims, deduplicated by (claim_type, claim_value)."""
    if not linkedin_profile.get("web_claims"):
        return
    if "web_claims" not in merged:
        merged["web_claims"] = []

    existing_claims = {_claim_key(c) for c in merged["web_claims"]}

    for claim in linkedin_profile["web_claims"]:
        key = _claim_key(claim)
        if key not in existing_claims:
            merged["web_claims"].append(claim)
            existing_claims.add(key)


def _update_entity_resolution(merged: dict, match_candidate: dict, now: str) -> None:
    """Stamp the entity_resolution section with the confirmed-match metadata."""
    if "entity_resolution" not in merged:
        merged["entity_resolution"] = {}

    er = merged["entity_resolution"]
    er["resolved"] = True
    er["resolved_linkedin_ppid"] = match_candidate.get("linkedin_ppid")
    er["resolved_linkedin_slug"] = match_candidate.get("linkedin_slug")
    er["resolution_confidence"] = match_candidate.get("confidence_score", 0)
    er["resolution_signals"] = match_candidate.get("match_signals", [])
    er["resolution_decision"] = "match"
    er["resolved_by"] = match_candidate.get("reviewed_by", "unknown")
    # Falls back to the merge time when the review record carries no timestamp
    er["resolved_at"] = match_candidate.get("reviewed_at", now)
    er["review_notes"] = match_candidate.get("review_notes")
    er["merge_timestamp"] = now
    er["merge_script"] = "merge_reviewed_profiles.py"

    # Mark the matching entry inside the stored candidate list as reviewed
    if er.get("match_candidates"):
        for candidate in er["match_candidates"]:
            if candidate.get("linkedin_ppid") == match_candidate.get("linkedin_ppid"):
                candidate["reviewed"] = True
                candidate["review_decision"] = "match"
                candidate["reviewed_by"] = match_candidate.get("reviewed_by")
                candidate["reviewed_at"] = match_candidate.get("reviewed_at")


def merge_profiles(wcms_profile: dict, linkedin_profile: dict, match_candidate: dict) -> dict:
    """
    Merge LinkedIn profile data into a WCMS profile.

    Preserves all existing WCMS data and adds LinkedIn enrichments:
    profile_data fields, affiliations, heritage_relevance, web_claims and
    the linkedin_slug. Also stamps the entity_resolution section with the
    match metadata from *match_candidate*.

    NOTE: the merge works on a *shallow* copy, so nested structures
    (profile_data, affiliations, web_claims, ...) shared with
    *wcms_profile* are mutated in place.

    Returns the merged profile dict.
    """
    merged = wcms_profile.copy()
    now = datetime.now(timezone.utc).isoformat()

    _merge_data_sources(merged)
    _merge_profile_data(merged, linkedin_profile)
    _merge_affiliations(merged, linkedin_profile)

    # Heritage relevance: LinkedIn's value only fills a gap, never overwrites
    if linkedin_profile.get("heritage_relevance") and not merged.get("heritage_relevance"):
        merged["heritage_relevance"] = linkedin_profile["heritage_relevance"]

    _merge_web_claims(merged, linkedin_profile)

    # Add linkedin_slug if not present
    if linkedin_profile.get("linkedin_slug") and not merged.get("linkedin_slug"):
        merged["linkedin_slug"] = linkedin_profile["linkedin_slug"]

    _update_entity_resolution(merged, match_candidate, now)

    return merged
|
|
|
|
|
|
def save_profile(person_dir: Path, ppid: str, profile: dict, backup_dir: Path | None = None) -> bool:
|
|
"""Save a person profile, optionally creating a backup first."""
|
|
file_path = person_dir / f"{ppid}.json"
|
|
|
|
# Create backup if requested
|
|
if backup_dir and file_path.exists():
|
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
backup_path = backup_dir / f"{ppid}_{timestamp}.json"
|
|
shutil.copy2(file_path, backup_path)
|
|
|
|
# Write profile
|
|
with open(file_path, "w", encoding="utf-8") as f:
|
|
json.dump(profile, f, indent=2, ensure_ascii=False)
|
|
|
|
return True
|
|
|
|
|
|
def main():
    """CLI entry point: merge every confirmed LinkedIn match into its WCMS profile."""
    parser = argparse.ArgumentParser(description="Merge reviewed entity resolution matches")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without saving")
    parser.add_argument("--candidates-file", type=Path, default=DEFAULT_CANDIDATES_FILE,
                        help="Path to entity_resolution_candidates.json")
    parser.add_argument("--person-dir", type=Path, default=DEFAULT_PERSON_DIR,
                        help="Path to data/person directory")
    parser.add_argument("--backup-dir", type=Path, default=DEFAULT_BACKUP_DIR,
                        help="Directory for backups (set to empty to disable)")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of profiles to process (for testing)")
    args = parser.parse_args()

    # Banner: echo the effective configuration before doing any work
    print("=" * 60)
    print("MERGE REVIEWED ENTITY RESOLUTION MATCHES")
    print("=" * 60)
    print(f"Candidates file: {args.candidates_file}")
    print(f"Person directory: {args.person_dir}")
    print(f"Backup directory: {args.backup_dir if args.backup_dir else 'DISABLED'}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Load matched candidates (only entries with review_decision == 'match')
    matched = load_candidates(args.candidates_file)
    if not matched:
        print("No matches to process")
        return

    # NOTE(review): --limit 0 is treated the same as no limit (falsy check)
    if args.limit:
        matched = matched[:args.limit]
        print(f"Limited to {len(matched)} profiles for testing")

    # Process each match; counters feed the summary printed at the end
    stats = {
        "processed": 0,
        "merged": 0,
        "wcms_not_found": 0,
        "linkedin_not_found": 0,
        "already_merged": 0,
        "errors": 0
    }

    for i, candidate in enumerate(matched, 1):
        wcms_ppid = candidate.get("wcms_ppid")
        linkedin_ppid = candidate.get("linkedin_ppid")
        wcms_name = candidate.get("wcms_name", "Unknown")

        print(f"\n[{i}/{len(matched)}] Processing: {wcms_name}")
        print(f"  WCMS: {wcms_ppid}")
        print(f"  LinkedIn: {linkedin_ppid}")

        stats["processed"] += 1

        # Load WCMS profile; skip this candidate if it is missing on disk
        wcms_profile = load_profile(args.person_dir, wcms_ppid)
        if not wcms_profile:
            print(f"  WARNING: WCMS profile not found")
            stats["wcms_not_found"] += 1
            continue

        # Check if already merged against this same LinkedIn PPID (idempotency)
        er = wcms_profile.get("entity_resolution", {})
        if er.get("resolved") and er.get("resolved_linkedin_ppid") == linkedin_ppid:
            print(f"  SKIP: Already merged")
            stats["already_merged"] += 1
            continue

        # Load LinkedIn profile; skip if its JSON file is missing
        linkedin_profile = load_profile(args.person_dir, linkedin_ppid)
        if not linkedin_profile:
            print(f"  WARNING: LinkedIn profile not found")
            stats["linkedin_not_found"] += 1
            continue

        # Merge profiles; a failure on one candidate must not abort the batch
        try:
            merged_profile = merge_profiles(wcms_profile, linkedin_profile, candidate)

            if args.dry_run:
                # Preview only — report what the merged profile would contain
                print(f"  DRY RUN: Would merge LinkedIn data into WCMS profile")
                print(f"    - Affiliations: {len(merged_profile.get('affiliations', []))}")
                print(f"    - Web claims: {len(merged_profile.get('web_claims', []))}")
                print(f"    - Heritage relevance: {merged_profile.get('heritage_relevance', {}).get('is_heritage_relevant')}")
            else:
                backup_dir = args.backup_dir if args.backup_dir else None
                save_profile(args.person_dir, wcms_ppid, merged_profile, backup_dir)
                print(f"  MERGED: Successfully merged profiles")

            # Counted as merged in dry-run mode too (reflects would-be merges)
            stats["merged"] += 1

        except Exception as e:
            # Best-effort batch processing: log and continue with the next match
            print(f"  ERROR: {e}")
            stats["errors"] += 1

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Processed: {stats['processed']}")
    print(f"Merged: {stats['merged']}")
    print(f"Already merged: {stats['already_merged']}")
    print(f"WCMS not found: {stats['wcms_not_found']}")
    print(f"LinkedIn not found: {stats['linkedin_not_found']}")
    print(f"Errors: {stats['errors']}")

    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")
        print("Run without --dry-run to apply changes")


if __name__ == "__main__":
    main()
|