Schema Migration Fixes: - Fix YAML import indentation in ~650 slot files (linkml:types and enum imports) - Rename slot reference: has_or_had_holds_record_set_type → hold_or_held_record_set_type (70+ archive class files, main schema, manifest.json) - Fix ProvenanceBlock.yaml: remove invalid any_of range, use string with multivalued - Fix has_or_had_provenance.yaml: remove nested template_specificity from annotations Validation Status: - 0 errors (was multiple import/reference errors) - 30 warnings (missing descriptions on inline slots, intentional SCREAMING_CASE names) Files changed: ~3,850 (slots, classes, main schema, manifest)
301 lines
11 KiB
Python
301 lines
11 KiB
Python
#!/usr/bin/env python3
"""
Merge WCMS data into person entity files.

This script:
1. Builds an email → WCMS user data index from the WCMS user_*.json files
2. Loads entity_resolution_candidates and keeps the best WCMS↔LinkedIn
   candidate per email (confirmed matches are preferred, then the highest
   confidence score)
3. Updates the matching person entity files with WCMS identifiers, activity
   data and contact details (files that already carry WCMS identifiers are
   left untouched)

Usage:
    python scripts/merge_wcms_to_persons.py --wcms-dir /Volumes/KINGSTON/data/wcms/data/person_profiles/users
    python scripts/merge_wcms_to_persons.py --dry-run  # Preview without writing
"""
|
|
|
|
import argparse
import glob
import json
import os
import sys
from collections import defaultdict
from datetime import datetime, timezone
from itertools import islice
from pathlib import Path
from typing import Dict, Optional, Any
|
|
|
|
|
|
def load_wcms_index(wcms_dir: Path, verbose: bool = False) -> Dict[str, Dict]:
    """Build an index of WCMS user records keyed by normalised email.

    Every immediate subdirectory of ``wcms_dir`` is scanned for
    ``user_*.json`` files. Each file that holds a JSON object with a
    non-empty ``email`` contributes one entry; the key is the email
    lower-cased and stripped of surrounding whitespace.

    Args:
        wcms_dir: Directory whose subdirectories contain the user files.
        verbose: When true, print one line for every file that was rejected.

    Returns:
        Mapping of email → dict with the selected WCMS fields plus
        ``_source_file`` (path of the file the record came from). Records
        with a missing, ``null`` or empty email are skipped without being
        counted as errors. If several files share an email, the one visited
        last replaces the earlier ones.

    Raises:
        SystemExit: With status 1 if ``wcms_dir`` is not an existing directory.
    """
    # is_dir() also rejects a path that exists but is a plain file, which
    # would otherwise blow up in iterdir() below.
    if not wcms_dir.is_dir():
        print(f"ERROR: WCMS directory not found: {wcms_dir}")
        sys.exit(1)

    index: Dict[str, Dict] = {}
    errors = 0

    # Walk through all subdirectories (000, 001, 002, etc.)
    subdirs = sorted(d for d in wcms_dir.iterdir() if d.is_dir())
    print(f"Found {len(subdirs)} WCMS subdirectories")

    for subdir in subdirs:
        for user_file in subdir.glob("user_*.json"):
            # Only the I/O + parse step is guarded; JSONDecodeError and
            # UnicodeDecodeError are both ValueError subclasses.
            try:
                with open(user_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                errors += 1
                if verbose:
                    print(f" Error reading {user_file}: {e}")
                continue

            if not isinstance(data, dict):
                errors += 1
                if verbose:
                    print(f" Error reading {user_file}: expected a JSON object, "
                          f"got {type(data).__name__}")
                continue

            # ``"email": null`` is treated like a missing email instead of
            # raising on None.lower().
            raw_email = data.get('email') or ''
            if not isinstance(raw_email, str):
                errors += 1
                if verbose:
                    print(f" Error reading {user_file}: email is not a string")
                continue

            email = raw_email.lower().strip()
            if not email:
                continue

            # Store the WCMS record indexed by email
            index[email] = {
                'user_id': data.get('user_id'),
                'username': data.get('username'),
                'username_url': data.get('username_url'),
                'full_name': data.get('full_name'),
                'status': data.get('status'),
                'roles': data.get('roles', []),
                'registered_since': data.get('registered_since'),
                'last_access': data.get('last_access'),
                'abs_id': data.get('abs_id'),
                'crm_id': data.get('crm_id'),
                'email': email,
                '_source_file': str(user_file),
            }

    print(f"Built WCMS index: {len(index)} emails indexed, {errors} errors")
    return index
|
|
|
|
|
|
def load_entity_candidates(candidates_file: Path) -> Dict[str, Dict]:
    """Load entity-resolution candidates and keep the best one per WCMS email.

    The file is expected to be a JSON object with a ``candidates`` list. Each
    candidate is keyed by its ``wcms_email`` (lower-cased, stripped). When
    several candidates share an email, a candidate whose ``review_decision``
    is ``'match'`` beats any unconfirmed one; among candidates with the same
    confirmation status the higher ``confidence_score`` wins, and on a tie
    the candidate seen first is kept.

    Args:
        candidates_file: Path to the entity-resolution candidates JSON file.

    Returns:
        Mapping of normalised email → the selected candidate dict (the
        original object from the file, not a copy).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(candidates_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    email_to_candidate: Dict[str, Dict] = {}

    for c in data.get('candidates') or []:
        if not isinstance(c, dict):
            continue

        # ``"wcms_email": null`` must be skipped, not crash on None.lower().
        email = (c.get('wcms_email') or '').lower().strip()
        if not email:
            continue

        existing = email_to_candidate.get(email)
        # Strict ">" keeps the first-seen candidate on a tie.
        if existing is None or _candidate_rank(c) > _candidate_rank(existing):
            email_to_candidate[email] = c

    confirmed = sum(
        1 for c in email_to_candidate.values() if c.get('review_decision') == 'match'
    )
    print(f"Loaded {len(email_to_candidate)} unique email→candidate mappings ({confirmed} confirmed matches)")

    return email_to_candidate


def _candidate_rank(candidate: Dict) -> tuple:
    """Return a sort key: confirmed matches first, then by confidence score."""
    # A null/missing score counts as 0 so it never raises when compared.
    return (
        candidate.get('review_decision') == 'match',
        candidate.get('confidence_score') or 0,
    )
|
|
|
|
|
|
def find_person_file_by_slug(person_dir: Path, linkedin_slug: str) -> Optional[Path]:
    """Find the newest person entity file for a LinkedIn slug.

    Person files are named ``{linkedin-slug}_{timestamp}.json`` and several
    versions may exist for one slug. The candidate whose path sorts last is
    returned, which is the most recent one as long as the timestamp part of
    the filename sorts lexicographically.

    Args:
        person_dir: Directory that holds the person entity files.
        linkedin_slug: Slug to look up; it is matched literally.

    Returns:
        Path of the newest matching file, or ``None`` if the slug is empty or
        no file matches.
    """
    if not linkedin_slug:
        # An empty slug would otherwise match every "_*.json" file.
        return None

    # Escape glob metacharacters so a slug such as "x[1]" or "a*b" is matched
    # literally instead of being interpreted as a pattern.
    pattern = f"{glob.escape(linkedin_slug)}_*.json"

    # max() over the paths is the same pick as sorted(...)[-1] without
    # building and sorting a list.
    return max(person_dir.glob(pattern), default=None)
|
|
|
|
|
|
def create_wcms_fields(wcms_data: Dict) -> tuple[Dict, Dict, Dict]:
    """Split one WCMS record into the three sections stored on a person file.

    Args:
        wcms_data: A WCMS record (for example an entry produced by
            ``load_wcms_index``). Missing keys are tolerated.

    Returns:
        A ``(wcms_identifiers, wcms_activity, contact_details)`` tuple:

        * ``wcms_identifiers`` – ``user_id``, ``abs_id``, ``crm_id``,
          ``username`` and ``username_url``; falsy ``abs_id``/``crm_id``
          values are normalised to ``None``.
        * ``wcms_activity`` – ``status``, ``roles`` (always a new list),
          ``registered_since`` and ``last_access``.
        * ``contact_details`` – ``email`` (``''`` when absent or null) and
          ``email_domain`` (the text after the last ``@``, or ``None`` when
          there is no ``@`` or nothing follows it).
    """
    wcms_identifiers = {
        'user_id': wcms_data.get('user_id'),
        'abs_id': wcms_data.get('abs_id') or None,
        'crm_id': wcms_data.get('crm_id') or None,
        'username': wcms_data.get('username'),
        'username_url': wcms_data.get('username_url'),
    }

    wcms_activity = {
        'status': wcms_data.get('status'),
        # Copy so the person record does not alias the list held by the
        # caller's WCMS record; a null value becomes an empty list.
        'roles': list(wcms_data.get('roles') or []),
        'registered_since': wcms_data.get('registered_since'),
        'last_access': wcms_data.get('last_access'),
    }

    # ``or ''`` covers an explicit null email, which would otherwise raise
    # TypeError on the "@" test.
    email = wcms_data.get('email') or ''
    # The domain is what follows the LAST "@": a quoted local part may itself
    # contain "@", so split('@')[1] would return the wrong piece.
    _, separator, domain = email.rpartition('@')
    contact_details = {
        'email': email,
        'email_domain': domain if separator and domain else None,
    }

    return wcms_identifiers, wcms_activity, contact_details
|
|
|
|
|
|
def update_person_file(
    person_file: Path,
    wcms_identifiers: Dict,
    wcms_activity: Dict,
    contact_details: Dict,
    dry_run: bool = False
) -> bool:
    """Merge WCMS data into one person entity file.

    The file is left untouched when it already has a truthy
    ``wcms_identifiers`` value. Otherwise ``wcms_identifiers`` and
    ``wcms_activity`` are set, ``contact_details`` is added (or merged into
    the existing object, filling only keys that are missing or empty), and a
    timestamped note is appended to ``extraction_metadata.notes`` when
    ``extraction_metadata`` is an object.

    Args:
        person_file: Path of the person JSON file to update.
        wcms_identifiers: Value stored under ``wcms_identifiers``.
        wcms_activity: Value stored under ``wcms_activity``.
        contact_details: Contact fields to add or merge.
        dry_run: When true, perform all checks but do not write anything.

    Returns:
        ``True`` if the file was updated (or would be, in dry-run mode).
        ``False`` if it already had WCMS identifiers or if reading, validating
        or writing failed; failures also print a message.
    """
    try:
        with open(person_file, 'r', encoding='utf-8') as f:
            person_data = json.load(f)
    except (OSError, ValueError) as e:
        # JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f" Error reading {person_file}: {e}")
        return False

    if not isinstance(person_data, dict):
        print(f" Error reading {person_file}: expected a JSON object, "
              f"got {type(person_data).__name__}")
        return False

    # Check if already has WCMS data
    if person_data.get('wcms_identifiers'):
        return False

    # Add WCMS fields
    person_data['wcms_identifiers'] = wcms_identifiers
    person_data['wcms_activity'] = wcms_activity

    # Add/update contact details. A null value is treated like a missing key.
    existing_contact = person_data.get('contact_details')
    if existing_contact is None:
        person_data['contact_details'] = contact_details
    elif isinstance(existing_contact, dict):
        # Existing non-empty values always win over the WCMS ones.
        for k, v in contact_details.items():
            if v and not existing_contact.get(k):
                existing_contact[k] = v
    else:
        # Refuse to overwrite data of an unexpected shape.
        print(f" Error reading {person_file}: contact_details is not an object")
        return False

    # Add provenance note
    metadata = person_data.get('extraction_metadata')
    if isinstance(metadata, dict):
        # ``or ''`` stops a null note from being rendered as the text "None".
        notes = metadata.get('notes') or ''
        wcms_note = f"WCMS data merged on {datetime.now(timezone.utc).isoformat()}"
        metadata['notes'] = f"{notes} {wcms_note}".strip()

    if dry_run:
        return True

    # Write to a sibling temp file and swap it in atomically: opening the
    # real file with 'w' truncates it first, so a failure during
    # serialisation would destroy the existing record.
    tmp_file = person_file.with_name(person_file.name + '.tmp')
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(person_data, f, indent=2, ensure_ascii=False)
        tmp_file.replace(person_file)
        return True
    except (OSError, TypeError, ValueError) as e:
        print(f" Error writing {person_file}: {e}")
        try:
            tmp_file.unlink(missing_ok=True)
        except OSError:
            # Best-effort cleanup only; the original file is still intact.
            pass
        return False
|
|
|
|
|
|
def main():
    """Command-line entry point: merge WCMS data into person entity files.

    Steps:
      1. Build the email → WCMS record index (``load_wcms_index``).
      2. Load the best entity-resolution candidate per email
         (``load_entity_candidates``).
      3. For every candidate that has a ``linkedin_slug``, a WCMS record and
         a person file, merge the WCMS fields into that file
         (``update_person_file``).
      4. Print a summary of the counters.

    Exits with status 1 if the candidates file or the person directory is
    missing, and with status 2 (argparse) if ``--limit`` is negative.
    """
    parser = argparse.ArgumentParser(description='Merge WCMS data into person entity files')
    parser.add_argument('--wcms-dir', type=Path,
                        default=Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users'),
                        help='Path to WCMS users directory')
    parser.add_argument('--candidates-file', type=Path,
                        default=Path('data/entity_resolution/entity_resolution_candidates.json'),
                        help='Path to entity resolution candidates file')
    parser.add_argument('--person-dir', type=Path,
                        default=Path('data/custodian/person/entity'),
                        help='Path to person entity files directory')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without writing files')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process (for testing)')

    args = parser.parse_args()

    # A negative limit used to slice from the end of the candidate list,
    # which silently processed "all but the last N" candidates.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    print("=" * 60)
    print("WCMS → Person Entity Merge")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - no files will be modified")

    # Fail fast on bad paths before the (potentially slow) WCMS index build.
    if not args.candidates_file.is_file():
        print(f"ERROR: Candidates file not found: {args.candidates_file}")
        sys.exit(1)
    if not args.person_dir.is_dir():
        print(f"ERROR: Person directory not found: {args.person_dir}")
        sys.exit(1)

    # Step 1: Build WCMS email index
    print("\n[1/4] Building WCMS email index...")
    wcms_index = load_wcms_index(args.wcms_dir, verbose=args.verbose)

    # Step 2: Load entity resolution candidates
    print("\n[2/4] Loading entity resolution candidates...")
    candidates = load_entity_candidates(args.candidates_file)

    # Step 3: Find matches and update person files
    print("\n[3/4] Matching and updating person files...")

    stats = {
        'processed': 0,
        'matched': 0,
        'updated': 0,
        'already_has_wcms': 0,
        'no_person_file': 0,
        'no_wcms_data': 0,
        'no_linkedin_slug': 0,
    }

    # NOTE(review): every candidate with a LinkedIn slug is merged, whether or
    # not its review_decision is 'match'. Confirm that merging unconfirmed
    # candidates is intended.
    #
    # islice avoids copying the whole item list; ``or None`` keeps the
    # existing meaning of "--limit 0" (no limit).
    for email, candidate in islice(candidates.items(), args.limit or None):
        stats['processed'] += 1

        linkedin_slug = candidate.get('linkedin_slug')
        if not linkedin_slug:
            stats['no_linkedin_slug'] += 1
            continue

        # Look up WCMS data by email
        wcms_data = wcms_index.get(email)
        if not wcms_data:
            stats['no_wcms_data'] += 1
            if args.verbose:
                print(f" No WCMS data for email: {email}")
            continue

        # Find person entity file
        person_file = find_person_file_by_slug(args.person_dir, linkedin_slug)
        if not person_file:
            stats['no_person_file'] += 1
            if args.verbose:
                print(f" No person file for slug: {linkedin_slug}")
            continue

        stats['matched'] += 1

        # Create WCMS fields
        wcms_identifiers, wcms_activity, contact_details = create_wcms_fields(wcms_data)

        # Update person file
        if update_person_file(person_file, wcms_identifiers, wcms_activity,
                              contact_details, args.dry_run):
            stats['updated'] += 1
            if args.verbose:
                print(f" Updated: {person_file.name}")
        else:
            # update_person_file returns False both for "already merged" and
            # for read/write failures, so this counter includes failures too.
            stats['already_has_wcms'] += 1

    # Step 4: Report results
    print("\n[4/4] Results")
    print("-" * 40)
    print(f" Candidates processed: {stats['processed']}")
    print(f" WCMS↔Person matches: {stats['matched']}")
    print(f" Files updated: {stats['updated']}")
    print(f" Already had WCMS: {stats['already_has_wcms']}")
    print(f" No person file found: {stats['no_person_file']}")
    print(f" No WCMS data found: {stats['no_wcms_data']}")
    print(f" No LinkedIn slug: {stats['no_linkedin_slug']}")

    if args.dry_run:
        print("\nDRY RUN - no files were modified. Run without --dry-run to apply changes.")
|
|
|
|
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|