glam/scripts/merge_wcms_to_persons.py
kempersc c2629f6d29 Fix LinkML schema validation errors (0 errors, 30 warnings)
Schema Migration Fixes:
- Fix YAML import indentation in ~650 slot files (linkml:types and enum imports)
- Rename slot reference: has_or_had_holds_record_set_type → hold_or_held_record_set_type
  (70+ archive class files, main schema, manifest.json)
- Fix ProvenanceBlock.yaml: remove invalid any_of range, use string with multivalued
- Fix has_or_had_provenance.yaml: remove nested template_specificity from annotations

Validation Status:
- 0 errors (was multiple import/reference errors)
- 30 warnings (missing descriptions on inline slots, intentional SCREAMING_CASE names)

Files changed: ~3,850 (slots, classes, main schema, manifest)
2026-01-15 23:21:38 +01:00

301 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Merge WCMS data into person entity files.
This script:
1. Builds an email → WCMS user data index from WCMS user files
2. Loads entity_resolution_candidates to find confirmed WCMS↔LinkedIn matches
3. Updates person entity files with WCMS identifiers and activity data
Usage:
python scripts/merge_wcms_to_persons.py --wcms-dir /Volumes/KINGSTON/data/wcms/data/person_profiles/users
python scripts/merge_wcms_to_persons.py --dry-run # Preview without writing
"""
import argparse
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Any
from collections import defaultdict
def load_wcms_index(wcms_dir: Path, verbose: bool = False) -> Dict[str, Dict]:
    """Build an email -> WCMS user-record index from per-user JSON files.

    Scans every ``user_*.json`` file under the numbered subdirectories of
    *wcms_dir* (000, 001, 002, ...). Records with no email are skipped;
    unreadable files are counted as errors instead of aborting the run.

    Args:
        wcms_dir: Root directory containing the WCMS user subdirectories.
        verbose: When True, print each per-file read error.

    Returns:
        Mapping of lowercased, stripped email -> selected WCMS fields
        (with the originating file path stored under ``_source_file``).

    Exits:
        Calls ``sys.exit(1)`` if *wcms_dir* does not exist.
    """
    index: Dict[str, Dict] = {}
    errors = 0
    if not wcms_dir.exists():
        print(f"ERROR: WCMS directory not found: {wcms_dir}")
        sys.exit(1)
    # Walk through all subdirectories (000, 001, 002, etc.)
    subdirs = sorted(d for d in wcms_dir.iterdir() if d.is_dir())
    print(f"Found {len(subdirs)} WCMS subdirectories")
    for subdir in subdirs:
        for user_file in subdir.glob("user_*.json"):
            try:
                with open(user_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # `or ''` guards against an explicit JSON null email:
                # .get's default only covers a *missing* key, and
                # None.lower() would raise (previously miscounted as a
                # file read error).
                email = (data.get('email') or '').lower().strip()
                if email:
                    # Store the full WCMS record indexed by email
                    index[email] = {
                        'user_id': data.get('user_id'),
                        'username': data.get('username'),
                        'username_url': data.get('username_url'),
                        'full_name': data.get('full_name'),
                        'status': data.get('status'),
                        'roles': data.get('roles', []),
                        'registered_since': data.get('registered_since'),
                        'last_access': data.get('last_access'),
                        'abs_id': data.get('abs_id'),
                        'crm_id': data.get('crm_id'),
                        'email': email,
                        '_source_file': str(user_file)
                    }
            except Exception as e:
                errors += 1
                if verbose:
                    print(f" Error reading {user_file}: {e}")
    print(f"Built WCMS index: {len(index)} emails indexed, {errors} errors")
    return index
def load_entity_candidates(candidates_file: Path) -> Dict[str, Dict]:
    """Load entity-resolution candidates and map each email to its best one.

    "Best" means: a confirmed match (``review_decision == 'match'``) beats
    any unconfirmed candidate; among candidates with the same confirmation
    status the higher ``confidence_score`` wins.

    Args:
        candidates_file: JSON file with a top-level ``candidates`` list.

    Returns:
        Mapping of lowercased, stripped wcms_email -> best candidate dict.
    """
    with open(candidates_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    candidates = data.get('candidates', [])
    # Build email → best candidate mapping.
    email_to_candidate: Dict[str, Dict] = {}
    for c in candidates:
        # `or ''` guards against an explicit JSON null wcms_email, which
        # .get's default would not catch (None.lower() would crash the run).
        email = (c.get('wcms_email') or '').lower().strip()
        if not email:
            continue
        existing = email_to_candidate.get(email)
        if existing is None:
            email_to_candidate[email] = c
            continue
        is_confirmed = c.get('review_decision') == 'match'
        existing_confirmed = existing.get('review_decision') == 'match'
        if is_confirmed and not existing_confirmed:
            # Prefer a confirmed match over any unconfirmed candidate.
            email_to_candidate[email] = c
        elif is_confirmed == existing_confirmed:
            # Same confirmation status - prefer higher confidence.
            if c.get('confidence_score', 0) > existing.get('confidence_score', 0):
                email_to_candidate[email] = c
    confirmed = sum(1 for c in email_to_candidate.values() if c.get('review_decision') == 'match')
    print(f"Loaded {len(email_to_candidate)} unique email→candidate mappings ({confirmed} confirmed matches)")
    return email_to_candidate
def find_person_file_by_slug(person_dir: Path, linkedin_slug: str) -> Optional[Path]:
    """Return the newest person entity file for *linkedin_slug*, or None.

    Entity files follow the pattern ``{linkedin-slug}_{timestamp}.json``.
    Because the timestamp is embedded in the filename, the
    lexicographically greatest match is the most recent version.
    """
    versions = sorted(person_dir.glob(f"{linkedin_slug}_*.json"))
    return versions[-1] if versions else None
def create_wcms_fields(wcms_data: Dict) -> tuple[Dict, Dict, Dict]:
    """Split a WCMS record into identifier, activity, and contact dicts.

    Falsy ``abs_id`` / ``crm_id`` values (empty string, 0, None) are
    normalized to None. ``email_domain`` is the text after the first '@'
    segment, or None when the email contains no '@'.
    """
    get = wcms_data.get
    wcms_identifiers = {
        'user_id': get('user_id'),
        'abs_id': get('abs_id') or None,
        'crm_id': get('crm_id') or None,
        'username': get('username'),
        'username_url': get('username_url'),
    }
    wcms_activity = {
        'status': get('status'),
        'roles': get('roles', []),
        'registered_since': get('registered_since'),
        'last_access': get('last_access'),
    }
    email = get('email', '')
    contact_details = {
        'email': email,
        'email_domain': email.split('@')[1] if '@' in email else None,
    }
    return wcms_identifiers, wcms_activity, contact_details
def update_person_file(
    person_file: Path,
    wcms_identifiers: Dict,
    wcms_activity: Dict,
    contact_details: Dict,
    dry_run: bool = False
) -> bool:
    """Merge WCMS-derived fields into one person entity JSON file.

    Returns True when the file was updated (or would have been, in
    dry-run mode). Returns False when the file already carries
    ``wcms_identifiers`` — or when reading/writing failed; callers
    cannot distinguish these False cases.
    """
    try:
        person_data = json.loads(person_file.read_text(encoding='utf-8'))
    except Exception as e:
        print(f" Error reading {person_file}: {e}")
        return False

    # A populated wcms_identifiers means a previous run already merged
    # this person; leave the file untouched.
    if person_data.get('wcms_identifiers'):
        return False

    person_data['wcms_identifiers'] = wcms_identifiers
    person_data['wcms_activity'] = wcms_activity

    if 'contact_details' not in person_data:
        person_data['contact_details'] = contact_details
    else:
        # Fill only the gaps - never overwrite an existing truthy value.
        target = person_data['contact_details']
        for key, value in contact_details.items():
            if value and not target.get(key):
                target[key] = value

    # Append a provenance note so the merge is traceable in the file.
    if 'extraction_metadata' in person_data:
        meta = person_data['extraction_metadata']
        stamp = f"WCMS data merged on {datetime.now(timezone.utc).isoformat()}"
        meta['notes'] = f"{meta.get('notes', '')} {stamp}".strip()

    if dry_run:
        return True

    try:
        with open(person_file, 'w', encoding='utf-8') as f:
            json.dump(person_data, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f" Error writing {person_file}: {e}")
        return False
    return True
def main():
    """CLI entry point: merge WCMS records into person entity files.

    Pipeline:
      1. Index WCMS user files by email.
      2. Load entity-resolution candidates (email -> best candidate).
      3. For each candidate with a LinkedIn slug, find the matching
         person file and merge WCMS-derived fields into it.
      4. Print summary statistics.
    """
    parser = argparse.ArgumentParser(description='Merge WCMS data into person entity files')
    parser.add_argument('--wcms-dir', type=Path,
                        default=Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users'),
                        help='Path to WCMS users directory')
    parser.add_argument('--candidates-file', type=Path,
                        default=Path('data/entity_resolution/entity_resolution_candidates.json'),
                        help='Path to entity resolution candidates file')
    parser.add_argument('--person-dir', type=Path,
                        default=Path('data/custodian/person/entity'),
                        help='Path to person entity files directory')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without writing files')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process (for testing)')
    args = parser.parse_args()

    print("=" * 60)
    print("WCMS → Person Entity Merge")
    print("=" * 60)
    if args.dry_run:
        print("DRY RUN MODE - no files will be modified")

    # Step 1: Build WCMS email index
    print("\n[1/4] Building WCMS email index...")
    wcms_index = load_wcms_index(args.wcms_dir, verbose=args.verbose)

    # Step 2: Load entity resolution candidates
    print("\n[2/4] Loading entity resolution candidates...")
    candidates = load_entity_candidates(args.candidates_file)

    # Step 3: Find matches and update person files
    print("\n[3/4] Matching and updating person files...")
    # (the old unused 'errors' key was dropped; every processed candidate
    # now lands in exactly one outcome bucket below)
    stats = {
        'processed': 0,
        'matched': 0,
        'updated': 0,
        'already_has_wcms': 0,
        'no_person_file': 0,
        'no_wcms_data': 0,
        'no_linkedin_slug': 0,
    }
    # Apply --limit once up front instead of re-slicing in the loop header.
    items = candidates.items()
    if args.limit:
        items = list(items)[:args.limit]
    for email, candidate in items:
        stats['processed'] += 1
        linkedin_slug = candidate.get('linkedin_slug')
        if not linkedin_slug:
            # Previously these vanished from the report silently; count
            # them so the outcome buckets sum to 'processed'.
            stats['no_linkedin_slug'] += 1
            continue
        # Look up WCMS data by email
        wcms_data = wcms_index.get(email)
        if not wcms_data:
            stats['no_wcms_data'] += 1
            if args.verbose:
                print(f" No WCMS data for email: {email}")
            continue
        # Find person entity file
        person_file = find_person_file_by_slug(args.person_dir, linkedin_slug)
        if not person_file:
            stats['no_person_file'] += 1
            if args.verbose:
                print(f" No person file for slug: {linkedin_slug}")
            continue
        stats['matched'] += 1
        # Create WCMS fields
        wcms_identifiers, wcms_activity, contact_details = create_wcms_fields(wcms_data)
        # NOTE(review): a False return from update_person_file also covers
        # read/write errors, not just "already merged" - the two cases are
        # indistinguishable here, so errors inflate 'already_has_wcms'.
        if update_person_file(person_file, wcms_identifiers, wcms_activity, contact_details, args.dry_run):
            stats['updated'] += 1
            if args.verbose:
                print(f" Updated: {person_file.name}")
        else:
            stats['already_has_wcms'] += 1

    # Step 4: Report results
    print("\n[4/4] Results")
    print("-" * 40)
    print(f" Candidates processed: {stats['processed']}")
    print(f" WCMS↔Person matches: {stats['matched']}")
    print(f" Files updated: {stats['updated']}")
    print(f" Already had WCMS: {stats['already_has_wcms']}")
    print(f" No person file found: {stats['no_person_file']}")
    print(f" No WCMS data found: {stats['no_wcms_data']}")
    print(f" No LinkedIn slug: {stats['no_linkedin_slug']}")
    if args.dry_run:
        print("\nDRY RUN - no files were modified. Run without --dry-run to apply changes.")
# Script entry point: run the merge only when executed directly, not on import.
if __name__ == '__main__':
    main()