#!/usr/bin/env python3
"""
Merge WCMS data into LinkedIn person entity files.

This script:
1. Builds an email → WCMS data map from data/person/ID_*.json files
2. Uses entity_resolution_candidates to find confirmed/high-confidence matches
3. Updates LinkedIn person entity files (data/custodian/person/entity/) with WCMS data

Usage:
    python scripts/merge_wcms_to_linkedin_profiles.py --dry-run         # Preview
    python scripts/merge_wcms_to_linkedin_profiles.py                   # Apply changes
    python scripts/merge_wcms_to_linkedin_profiles.py --confirmed-only  # Only confirmed matches
"""

import argparse
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Any, List

def build_wcms_email_index(wcms_dir: Path, verbose: bool = False) -> Dict[str, Dict]:
    """Build an email → WCMS data map from data/person/ID_*.json files.

    Args:
        wcms_dir: Directory containing WCMS person files named ID_*.json.
        verbose: If True, print per-file read errors.

    Returns:
        Mapping of lowercased, stripped email address to a dict with the
        WCMS fields (ppid, name, wcms_identifiers, wcms_activity,
        contact_details) plus '_source_file' for provenance. When two files
        share an email, the later one wins.
    """
    print(f"Scanning WCMS person files in {wcms_dir}...")

    email_index: Dict[str, Dict] = {}
    processed = 0
    errors = 0
    no_email = 0

    # Find all ID_*.json files (the unused 'pattern' variable was removed).
    files = list(wcms_dir.glob("ID_*.json"))
    total = len(files)
    print(f"Found {total:,} WCMS person files")

    for i, filepath in enumerate(files):
        # Progress indicator for very large datasets.
        if i % 50000 == 0 and i > 0:
            print(f"  Processed {i:,}/{total:,} ({i*100//total}%)...")

        # Only the file read/parse can legitimately fail; keep the try
        # narrow so data-shape problems below aren't miscounted as errors.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            errors += 1
            if verbose:
                print(f"  Error reading {filepath}: {e}")
            continue

        processed += 1

        # Extract email defensively: 'contact_details' or 'email' may be
        # present but null, which previously raised and inflated 'errors'.
        contact = data.get('contact_details') or {}
        email = (contact.get('email') or '').lower().strip()
        if not email:
            no_email += 1
            continue

        # Store WCMS data indexed by email
        email_index[email] = {
            'ppid': data.get('ppid'),
            'name': data.get('name'),
            'wcms_identifiers': data.get('wcms_identifiers', {}),
            'wcms_activity': data.get('wcms_activity', {}),
            'contact_details': contact,
            '_source_file': filepath.name
        }

    print(f"Built WCMS email index: {len(email_index):,} emails indexed")
    print(f"  - Processed: {processed:,}")
    print(f"  - No email: {no_email:,}")
    print(f"  - Errors: {errors:,}")

    return email_index
def load_entity_candidates(candidates_file: Path, confirmed_only: bool = False,
                           min_confidence: float = 0.65) -> Dict[str, Dict]:
    """Load entity resolution candidates and build email → candidate mapping.

    Args:
        candidates_file: JSON file with a top-level 'candidates' list.
        confirmed_only: If True, keep only review_decision == 'match'.
        min_confidence: Minimum confidence_score for unconfirmed candidates.

    Returns:
        Mapping of lowercased email to the single best candidate: confirmed
        matches beat unconfirmed ones; within the same status, higher
        confidence_score wins. Rejected candidates are skipped.
    """
    print(f"Loading entity resolution candidates from {candidates_file}...")

    with open(candidates_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    candidates = data.get('candidates', [])
    print(f"  Total candidates: {len(candidates):,}")

    # Build email → best candidate mapping
    email_to_candidate: Dict[str, Dict] = {}

    for c in candidates:
        # 'wcms_email' may be present but null; treat it like missing.
        email = (c.get('wcms_email') or '').lower().strip()
        linkedin_slug = c.get('linkedin_slug')

        if not email or not linkedin_slug:
            continue

        is_confirmed = c.get('review_decision') == 'match'
        confidence = c.get('confidence_score', 0)

        # Skip rejected matches
        if c.get('review_decision') == 'not_match':
            continue

        # Filter based on mode
        if confirmed_only and not is_confirmed:
            continue

        if not is_confirmed and confidence < min_confidence:
            continue

        # Check if this is a better candidate than existing
        existing = email_to_candidate.get(email)

        if not existing:
            email_to_candidate[email] = c
            continue

        existing_confirmed = existing.get('review_decision') == 'match'

        # Prefer confirmed match
        if is_confirmed and not existing_confirmed:
            email_to_candidate[email] = c
        elif is_confirmed == existing_confirmed:
            # Same status - prefer higher confidence
            if confidence > existing.get('confidence_score', 0):
                email_to_candidate[email] = c

    # Tally on the FINAL selection. The previous version incremented the
    # counters only on first insert, so later replacements (e.g. an
    # unconfirmed candidate superseded by a confirmed one) skewed the stats.
    confirmed_count = sum(
        1 for c in email_to_candidate.values()
        if c.get('review_decision') == 'match'
    )
    high_confidence_count = len(email_to_candidate) - confirmed_count

    print(f"  Selected candidates: {len(email_to_candidate):,}")
    print(f"  - Confirmed matches: {confirmed_count:,}")
    print(f"  - High-confidence (>= {min_confidence}): {high_confidence_count:,}")

    return email_to_candidate
def build_slug_to_files_index(person_dir: Path) -> Dict[str, List[str]]:
    """Build LinkedIn slug → list of person entity filenames index."""
    print(f"Building slug-to-files index from {person_dir}...")

    # Filenames look like {slug}_{timestamp}.json, where the timestamp is a
    # compact UTC stamp, e.g. john-doe-123abc_20260109T224201Z.json.
    name_pattern = re.compile(r'^(.+?)_(\d{8}T\d{6}Z)\.json$')

    entity_files = list(person_dir.glob("*.json"))
    print(f"  Found {len(entity_files):,} person entity files")

    index: Dict[str, List[str]] = {}
    for entry in entity_files:
        filename = entry.name

        # Metadata files are prefixed with an underscore; ignore them.
        if filename.startswith('_'):
            continue

        m = name_pattern.match(filename)
        if m is None:
            continue

        # Everything before the trailing _timestamp is the slug.
        index.setdefault(m.group(1), []).append(filename)

    print(f"  Indexed {len(index):,} unique slugs")
    return index
def find_person_file(person_dir: Path, slug_index: Dict[str, List[str]],
                     linkedin_slug: str) -> Optional[Path]:
    """Find person entity file by LinkedIn slug, returning most recent version."""
    versions = slug_index.get(linkedin_slug, [])
    if not versions:
        return None

    # Timestamps are zero-padded UTC stamps, so the lexicographic maximum
    # of the filenames is the most recent version.
    newest = max(versions)
    return person_dir / newest
def update_person_file(
    person_file: Path,
    wcms_data: Dict,
    dry_run: bool = False,
    verbose: bool = False
) -> tuple[bool, str]:
    """
    Update person entity file with WCMS data.

    Adds wcms_identifiers, wcms_activity, merged contact details, a 'wcms'
    data-source marker and a provenance note, then rewrites the file
    (unless dry_run is set).

    Returns (success, reason), where reason is one of 'updated',
    'would_update', 'already_has_wcms', 'read_error: ...' or
    'write_error: ...'.
    """
    try:
        with open(person_file, 'r', encoding='utf-8') as f:
            person_data = json.load(f)
    except Exception as e:
        return False, f"read_error: {e}"

    # Idempotence: skip files that already carry WCMS identifiers.
    if person_data.get('wcms_identifiers'):
        return False, "already_has_wcms"

    # Add WCMS fields
    person_data['wcms_identifiers'] = wcms_data.get('wcms_identifiers', {})
    person_data['wcms_activity'] = wcms_data.get('wcms_activity', {})

    # Add/merge contact details. Guard against an existing null/non-dict
    # value (key present but not a dict), which previously raised an
    # uncaught AttributeError and aborted the whole run.
    wcms_contact = wcms_data.get('contact_details', {})
    existing_contact = person_data.get('contact_details')
    if not isinstance(existing_contact, dict):
        person_data['contact_details'] = wcms_contact
    else:
        # Merge - existing LinkedIn values win; WCMS only fills in gaps.
        for key, value in wcms_contact.items():
            if value and not existing_contact.get(key):
                existing_contact[key] = value

    # Add data source marker
    if 'data_sources' not in person_data:
        person_data['data_sources'] = []
    if 'wcms' not in person_data['data_sources']:
        person_data['data_sources'].append('wcms')

    # Add provenance note: append to extraction notes when that structure
    # exists, otherwise record a dedicated merge_metadata block.
    merge_note = f"WCMS data merged on {datetime.now(timezone.utc).isoformat()}"
    if 'extraction_metadata' in person_data:
        notes = person_data['extraction_metadata'].get('notes', '') or ''
        person_data['extraction_metadata']['notes'] = f"{notes} {merge_note}".strip()
    else:
        person_data['merge_metadata'] = {
            'wcms_merged_at': datetime.now(timezone.utc).isoformat(),
            'wcms_source_file': wcms_data.get('_source_file')
        }

    if dry_run:
        if verbose:
            print(f"  [DRY RUN] Would update: {person_file.name}")
        return True, "would_update"

    # Write updated file
    try:
        with open(person_file, 'w', encoding='utf-8') as f:
            json.dump(person_data, f, indent=2, ensure_ascii=False)
        return True, "updated"
    except Exception as e:
        return False, f"write_error: {e}"
def main():
    """CLI entry point: merge WCMS person data into LinkedIn entity files.

    Orchestrates the four phases: build the WCMS email index, load the
    entity-resolution candidates, index LinkedIn entity files by slug,
    then merge matched WCMS data into each person file.
    """
    parser = argparse.ArgumentParser(description='Merge WCMS data into LinkedIn person entity files')
    parser.add_argument('--wcms-dir', type=Path,
                        default=Path('data/person'),
                        help='Path to WCMS person files directory (default: data/person)')
    parser.add_argument('--candidates-file', type=Path,
                        default=Path('data/entity_resolution/entity_resolution_candidates.json'),
                        help='Path to entity resolution candidates file')
    parser.add_argument('--person-dir', type=Path,
                        default=Path('data/custodian/person/entity'),
                        help='Path to LinkedIn person entity files directory')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without writing files')
    parser.add_argument('--confirmed-only', action='store_true',
                        help='Only process confirmed matches (review_decision=match)')
    parser.add_argument('--min-confidence', type=float, default=0.65,
                        help='Minimum confidence score for unconfirmed matches (default: 0.65)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of candidates to process (for testing)')

    args = parser.parse_args()

    print("=" * 70)
    print("WCMS → LinkedIn Person Entity Merge")
    print("=" * 70)

    if args.dry_run:
        print("MODE: DRY RUN - no files will be modified")
    if args.confirmed_only:
        print("MODE: Confirmed matches only")
    else:
        print(f"MODE: Confirmed matches + high-confidence (>= {args.min_confidence})")
    print()

    # Step 1: Build WCMS email index
    print("[1/4] Building WCMS email index...")
    wcms_index = build_wcms_email_index(args.wcms_dir, verbose=args.verbose)
    print()

    # Step 2: Load entity resolution candidates
    print("[2/4] Loading entity resolution candidates...")
    candidates = load_entity_candidates(
        args.candidates_file,
        confirmed_only=args.confirmed_only,
        min_confidence=args.min_confidence
    )
    print()

    # Step 3: Build slug-to-files index
    print("[3/4] Building LinkedIn slug-to-files index...")
    slug_index = build_slug_to_files_index(args.person_dir)
    print()

    # Step 4: Match and update
    print("[4/4] Matching and updating person files...")

    stats = {
        'processed': 0,
        'matched': 0,
        'updated': 0,
        'already_has_wcms': 0,
        'no_person_file': 0,
        'no_wcms_data': 0,
        'errors': 0,
    }

    items = list(candidates.items())
    # Fix: 'if args.limit:' treated --limit 0 as "no limit"; compare against
    # None so an explicit 0 really processes zero candidates.
    if args.limit is not None:
        items = items[:args.limit]

    for email, candidate in items:
        stats['processed'] += 1

        linkedin_slug = candidate.get('linkedin_slug')
        if not linkedin_slug:
            continue

        # Look up WCMS data by email
        wcms_data = wcms_index.get(email)
        if not wcms_data:
            stats['no_wcms_data'] += 1
            if args.verbose:
                print(f"  No WCMS data for email: {email}")
            continue

        # Find person entity file
        person_file = find_person_file(args.person_dir, slug_index, linkedin_slug)
        if not person_file:
            stats['no_person_file'] += 1
            if args.verbose:
                print(f"  No person file for slug: {linkedin_slug}")
            continue

        stats['matched'] += 1

        # Update person file
        success, reason = update_person_file(
            person_file, wcms_data,
            dry_run=args.dry_run,
            verbose=args.verbose
        )

        if success and reason in ('updated', 'would_update'):
            stats['updated'] += 1
            if args.verbose and not args.dry_run:
                print(f"  Updated: {person_file.name}")
        elif reason == 'already_has_wcms':
            stats['already_has_wcms'] += 1
        else:
            stats['errors'] += 1
            if args.verbose:
                print(f"  Error: {person_file.name} - {reason}")

    # Report results
    print()
    print("=" * 70)
    print("RESULTS")
    print("=" * 70)
    print(f"  Candidates processed: {stats['processed']:,}")
    print(f"  WCMS↔LinkedIn matches: {stats['matched']:,}")
    print(f"  Files updated: {stats['updated']:,}")
    print(f"  Already had WCMS: {stats['already_has_wcms']:,}")
    print(f"  No LinkedIn file found: {stats['no_person_file']:,}")
    print(f"  No WCMS data found: {stats['no_wcms_data']:,}")
    print(f"  Errors: {stats['errors']:,}")

    if args.dry_run:
        print()
        print("DRY RUN complete - no files were modified.")
        print("Run without --dry-run to apply changes.")
    else:
        print()
        print("Merge complete!")
        if stats['updated'] > 0:
            # NOTE(review): deployment host is hard-coded — confirm it is current.
            print(f"Next step: Deploy updated files to server:")
            print(f"  rsync -avz --progress {args.person_dir}/ root@91.98.224.44:/mnt/data/custodian/person/entity/")


if __name__ == '__main__':
    main()