# glam/scripts/migrate_entity_to_ppid_v4.py
# (418 lines, 18 KiB, Python)
#!/usr/bin/env python3
"""
Migrate entity profiles from data/custodian/person/entity/ to data/person/
This script (v4) processes ALL entries:
1. NO filtering - every profile is migrated
2. Adds classification tags indicating human vs institution likelihood
3. Handles collisions with UUID suffix (not counter)
4. Preserves ALL data with full provenance
Usage:
python scripts/migrate_entity_to_ppid_v4.py --dry-run --limit 100 # Preview 100 profiles
python scripts/migrate_entity_to_ppid_v4.py --dry-run # Preview all
python scripts/migrate_entity_to_ppid_v4.py # Execute migration
"""
import argparse
import json
import re
import unicodedata
import uuid
from datetime import datetime, timezone
from multiprocessing import Pool
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Patterns that suggest this might be an INSTITUTION (not a person).
# Each entry: (regex, indicator_type, human-readable reason).
# Matched case-insensitively (suffixes like "museum"/"Museum" both count).
INSTITUTION_INDICATORS = [
    (r'^Company\s+name\s', 'parsing_artifact', 'Profile name starts with "Company name" - likely parsing artifact'),
    (r'^Stichting\s', 'dutch_foundation', 'Dutch foundation (Stichting)'),
    (r'^Fondazione\s', 'italian_foundation', 'Italian foundation (Fondazione)'),
    (r'^ICOM\s', 'icom_organization', 'ICOM organization'),
    (r'^Google\s', 'company_profile', 'Google company profile'),
    (r'^TheMuseumsLab$', 'organization', 'TheMuseumsLab organization'),
    (r'^Sound\s+Heritage$', 'organization', 'Sound Heritage organization'),
    (r'^Computational\s+Research$', 'organization', 'Computational Research organization'),
    (r'Museum$', 'museum_suffix', 'Name ends with "Museum" - likely institution'),
    (r'Foundation$', 'foundation_suffix', 'Name ends with "Foundation" - likely institution'),
    (r'Institute$', 'institute_suffix', 'Name ends with "Institute" - likely institution'),
    (r'Organisation$', 'org_suffix', 'Name ends with "Organisation" - likely institution'),
    (r'Organization$', 'org_suffix', 'Name ends with "Organization" - likely institution'),
    (r'University$', 'university_suffix', 'Name ends with "University" - likely institution'),
    (r'Library$', 'library_suffix', 'Name ends with "Library" - likely institution'),
    (r'Archive$', 'archive_suffix', 'Name ends with "Archive" - likely institution'),
    (r'Archief$', 'archive_suffix', 'Name ends with "Archief" (Dutch archive) - likely institution'),
    (r'Bibliotheek$', 'library_suffix', 'Name ends with "Bibliotheek" (Dutch library) - likely institution'),
]
# Patterns that suggest this is a PERSON.
# NOTE: these patterns encode capitalization explicitly (e.g. ^[A-Z][a-z]+),
# so they are matched case-SENSITIVELY (see classify_profile below).
PERSON_INDICATORS = [
    (r'^(Dr|Prof|Mr|Mrs|Ms|Drs|Ir|Ing)\.\s', 'title_prefix', 'Has personal title prefix'),
    (r'\s(PhD|MA|MSc|MBA|BSc|Jr|Sr)$', 'degree_suffix', 'Has degree/suffix'),
    (r'^[A-Z][a-z]+\s+[A-Z][a-z]+$', 'two_word_name', 'Simple two-word personal name pattern'),
    (r'^[A-Z][a-z]+\s+(van|de|den|der|von|van der|van den|van de)\s+[A-Z]', 'dutch_name', 'Dutch personal name with particle'),
]


def classify_profile(name: str, profile_data: Dict) -> Dict[str, Any]:
    """Classify profile as human, institution, anonymous, or unknown.

    Args:
        name: display name from the profile (may be empty or 'LinkedIn Member').
        profile_data: profile dict; 'headline' and 'linkedin_url' are consulted.

    Returns classification dict with:
    - primary_classification: 'human', 'institution', 'anonymous', 'unknown'
    - confidence: 0.0-1.0
    - indicators: list of matched patterns
    - reasoning: human-readable explanation
    """
    # No name at all: nothing to classify on.
    if not name:
        return {
            'primary_classification': 'unknown',
            'confidence': 0.0,
            'indicators': [{'type': 'empty_name', 'reason': 'Name field is empty'}],
            'reasoning': 'Cannot classify - name is empty'
        }
    # LinkedIn's privacy placeholder: a real person whose name is hidden.
    if name == 'LinkedIn Member':
        headline = profile_data.get('headline', '')
        return {
            'primary_classification': 'anonymous',
            'confidence': 0.9,
            'indicators': [
                {'type': 'linkedin_member', 'reason': 'LinkedIn privacy settings hide real name'},
                {'type': 'has_headline', 'value': headline[:50] if headline else None},
            ],
            'reasoning': f'Anonymous LinkedIn profile with privacy settings. Has headline: {bool(headline)}'
        }
    institution_matches = []
    person_matches = []
    # Institution indicators: case-insensitive so "museum"/"Museum" both hit.
    for pattern, indicator_type, reason in INSTITUTION_INDICATORS:
        if re.search(pattern, name, re.IGNORECASE):
            institution_matches.append({
                'type': indicator_type,
                'pattern': pattern,
                'reason': reason
            })
    # Person indicators: case-SENSITIVE. The name-shape patterns rely on
    # capitalization ([A-Z][a-z]+); with IGNORECASE any two alphabetic words
    # (e.g. "google photos") would falsely register as a personal name.
    for pattern, indicator_type, reason in PERSON_INDICATORS:
        if re.search(pattern, name):
            person_matches.append({
                'type': indicator_type,
                'pattern': pattern,
                'reason': reason
            })
    # A personal LinkedIn /in/ URL is a strong person signal.
    linkedin_url = profile_data.get('linkedin_url', '')
    if linkedin_url and '/in/' in linkedin_url:
        person_matches.append({
            'type': 'personal_linkedin_url',
            'reason': 'Has personal LinkedIn /in/ URL'
        })
    # Determine classification: confidence scales with the match count,
    # capped below 1.0 since these are heuristics.
    if institution_matches and not person_matches:
        return {
            'primary_classification': 'institution',
            'confidence': min(0.5 + 0.1 * len(institution_matches), 0.9),
            'indicators': institution_matches,
            'reasoning': f'Matched {len(institution_matches)} institution pattern(s), no person patterns'
        }
    elif person_matches and not institution_matches:
        return {
            'primary_classification': 'human',
            'confidence': min(0.5 + 0.15 * len(person_matches), 0.95),
            'indicators': person_matches,
            'reasoning': f'Matched {len(person_matches)} person pattern(s), no institution patterns'
        }
    elif person_matches and institution_matches:
        # Conflicting signals - personal LinkedIn URL wins
        if any(i['type'] == 'personal_linkedin_url' for i in person_matches):
            return {
                'primary_classification': 'human',
                'confidence': 0.7,
                'indicators': person_matches + institution_matches,
                'reasoning': f'Conflicting patterns but has personal LinkedIn URL - likely human'
            }
        return {
            'primary_classification': 'unknown',
            'confidence': 0.3,
            'indicators': person_matches + institution_matches,
            'reasoning': f'Conflicting patterns: {len(person_matches)} person, {len(institution_matches)} institution'
        }
    else:
        # No patterns matched - assume human (most profiles are people)
        return {
            'primary_classification': 'human',
            'confidence': 0.6,
            'indicators': [{'type': 'default', 'reason': 'No specific patterns matched, defaulting to human'}],
            'reasoning': 'No specific patterns matched - assuming human (default)'
        }
def normalize_name_for_ppid(name: str) -> str:
    """Convert a display name to PPID token format: FIRST-LAST.

    Strips common titles/degrees (Dr, Prof, PhD, ...), transliterates
    accented characters to ASCII via NFKD decomposition, uppercases, and
    joins the surviving tokens with '-'. Returns "UNKNOWN" when nothing
    usable remains (empty input, punctuation-only, titles-only).
    """
    if not name:
        return "UNKNOWN"
    # Drop titles and degree suffixes anywhere in the name (optional trailing dot).
    name = re.sub(r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?', '', name, flags=re.IGNORECASE)
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def normalize_part(p: str) -> str:
        # NFKD splits accented chars into base + combining marks; dropping the
        # combining marks yields an ASCII approximation (e.g. 'é' -> 'e').
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z0-9]', '', ascii_name).upper()

    # Normalize each part once (the old comprehension called normalize_part
    # twice per part) and drop parts that reduce to the empty string.
    normalized = [token for token in map(normalize_part, parts) if token]
    return '-'.join(normalized) if normalized else "UNKNOWN"
def generate_ppid(name: str, entity_data: Optional[Dict] = None) -> str:
    """Generate PPID from name (locations/dates use XX placeholders).

    For 'LinkedIn Member' profiles, derive a unique-ish ANON token from the
    first affiliation's organization plus up to two headline words, since the
    real name is hidden by LinkedIn privacy settings.

    Args:
        name: display name (possibly 'LinkedIn Member').
        entity_data: full entity record; only 'affiliations' and
            'profile_data.headline' are consulted, and only for anonymous
            profiles. Optional (was annotated plain Dict despite the None
            default).

    Returns:
        PPID string of the form ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_<NAME-TOKEN>.
    """
    if name == 'LinkedIn Member' and entity_data:
        affiliations = entity_data.get('affiliations', [])
        headline = entity_data.get('profile_data', {}).get('headline', '') if entity_data.get('profile_data') else ''
        if affiliations and isinstance(affiliations, list) and len(affiliations) > 0:
            first = affiliations[0]
            # Affiliation entries are expected to be dicts; fall back rather
            # than crash on malformed source data.
            org = first.get('custodian_name', 'UNKNOWN-ORG') if isinstance(first, dict) else 'UNKNOWN-ORG'
            org_token = normalize_name_for_ppid(org)[:20]
            if headline:
                # Take up to the first 3 headline words, keep those that
                # normalize to more than 2 chars, use at most 2 as the role.
                role_words = []
                for word in headline.split()[:3]:
                    normalized = normalize_name_for_ppid(word)
                    if normalized and len(normalized) > 2:
                        role_words.append(normalized)
                role_token = '-'.join(role_words[:2]) if role_words else 'STAFF'
            else:
                role_token = 'STAFF'
            name_token = f"ANON-{org_token}-{role_token[:15]}"
        else:
            # Anonymous with no affiliation context: nothing to distinguish on.
            name_token = "LINKEDIN-MEMBER"
    else:
        name_token = normalize_name_for_ppid(name)
    # Location/date slots are placeholders until enrichment fills them in.
    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"
def transform_entity_to_ppid(entity_data: Dict, entity_file_name: str) -> Tuple[str, Dict]:
    """Transform entity profile to PPID format, preserving ALL data.

    Args:
        entity_data: parsed entity JSON record.
        entity_file_name: source file name, recorded in migration_metadata.

    Returns:
        (ppid, ppid_profile) - the generated PPID and the full output document.
    """
    # Fetch profile_data once (the original re-fetched it three times) and
    # tolerate a null/malformed value in the source JSON instead of crashing
    # on .get() of None.
    raw_profile = entity_data.get('profile_data')
    profile_data = raw_profile if isinstance(raw_profile, dict) else {}
    name = profile_data.get('name') or entity_data.get('name', 'Unknown')
    ppid = generate_ppid(name, entity_data)
    # Merge top-level affiliations into a copy of profile_data so the
    # classifier can see them; the stored profile_data stays unmerged.
    profile_data_for_classification = {**profile_data}
    if 'affiliations' not in profile_data_for_classification:
        profile_data_for_classification['affiliations'] = entity_data.get('affiliations', [])
    classification = classify_profile(name, profile_data_for_classification)
    is_anonymous = (name == 'LinkedIn Member')
    # Anonymous PPIDs embed org/role context, so take tokens from the PPID
    # itself; otherwise re-derive them from the normalized name.
    if is_anonymous:
        name_tokens = ppid.split('_')[-1].split('-')
    else:
        name_tokens = normalize_name_for_ppid(name).split('-')
    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            # XX placeholders: locations/dates await manual enrichment.
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": name_tokens
        },
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not yet enriched - requires manual research"
        },
        "is_living": True,
        "is_anonymous": is_anonymous,
        # Classification tags (the key feature of v4)
        "profile_classification": classification,
        "heritage_relevance": entity_data.get('heritage_relevance', {
            "is_heritage_relevant": True,
            "heritage_types": [],
            "rationale": "Extracted from heritage custodian LinkedIn page"
        }),
        "affiliations": entity_data.get('affiliations', []),
        "profile_data": profile_data,
        "web_claims": entity_data.get('web_claims', []),
        "source_observations": entity_data.get('source_observations', []),
        "extraction_metadata": entity_data.get('extraction_metadata', {}),
        # Full provenance back to the source entity file.
        "migration_metadata": {
            "original_entity_file": entity_file_name,
            "original_person_id": entity_data.get('person_id'),
            "original_linkedin_slug": entity_data.get('linkedin_slug'),
            "migrated_at": datetime.now(timezone.utc).isoformat(),
            "migration_script": "migrate_entity_to_ppid_v4.py",
            "migration_version": "4.0"
        }
    }
    return ppid, ppid_profile
def process_entity_file(args):
    """Process a single entity file (worker entry point for Pool.map).

    Args:
        args: (entity_file_path, existing_ppids_set, person_dir, dry_run)
            packed as one tuple because Pool.map passes a single argument.

    Returns:
        ('migrated', output_ppid, classification, file_path) on success, or
        ('error', error_message, 'error', file_path) on any failure.

    NOTE(review): existing_ppids_set is a snapshot taken when the task args
    were built, so two files that collide with EACH OTHER in the same run may
    both skip the UUID suffix; only collisions with pre-existing files are
    reliably detected - confirm whether that matters for this dataset.
    """
    entity_file_path, existing_ppids_set, person_dir, dry_run = args
    try:
        # Explicit UTF-8: source and output are JSON with non-ASCII names;
        # relying on the platform default encoding can corrupt or fail.
        with open(entity_file_path, encoding='utf-8') as f:
            data = json.load(f)
        # NO FILTERING - process everything
        ppid, ppid_profile = transform_entity_to_ppid(data, Path(entity_file_path).name)
        classification = ppid_profile['profile_classification']['primary_classification']
        # Check if already exists - add UUID suffix for collision
        output_ppid = ppid
        if ppid in existing_ppids_set:
            # Short (8-char) uuid4 prefix; lowercase hex, so the caller can
            # distinguish it from uppercase name tokens.
            short_uuid = str(uuid.uuid4())[:8]
            output_ppid = f"{ppid}-{short_uuid}"
            ppid_profile['ppid'] = output_ppid
            ppid_profile['ppid_components']['collision_uuid'] = short_uuid
        output_file = Path(person_dir) / f"{output_ppid}.json"
        if not dry_run:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(ppid_profile, f, indent=2, ensure_ascii=False)
        return ('migrated', output_ppid, classification, str(entity_file_path))
    except Exception as e:
        # Best-effort by design: one bad file must not abort the whole batch;
        # the caller prints the error string and counts it.
        return ('error', str(e), 'error', str(entity_file_path))
def main():
    """CLI entry point: migrate every entity profile to a PPID file.

    Phases:
      1. Index existing PPID filenames (baseline for collision detection).
      2. List source entity files (optionally truncated by --limit).
      3. Process in batches of 1000 via a worker pool, printing progress.
    Ends with a summary of counts, classifications, and collisions.
    """
    parser = argparse.ArgumentParser(description='Migrate entity profiles to PPID format (v4 - tag everything)')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
    parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers')
    parser.add_argument('--verbose', action='store_true', help='Show each migrated file')
    # Paths were hard-coded absolute paths; expose them as options with the
    # old values as defaults so existing invocations keep working.
    parser.add_argument('--entity-dir', default='/Users/kempersc/apps/glam/data/custodian/person/entity',
                        help='Source directory containing entity JSON files')
    parser.add_argument('--person-dir', default='/Users/kempersc/apps/glam/data/person',
                        help='Destination directory for PPID JSON files')
    args = parser.parse_args()
    entity_dir = Path(args.entity_dir)
    person_dir = Path(args.person_dir)
    print("=" * 70)
    print("PPID MIGRATION SCRIPT v4.0 (Tag Everything, No Filtering)")
    print("=" * 70)
    # Phase 1: existing PPID file stems seed the collision set.
    print("\nPhase 1: Indexing existing PPID files...")
    existing_ppids = {f.stem for f in person_dir.glob('ID_*.json')}
    print(f" Found {len(existing_ppids):,} existing PPID files")
    # Phase 2: List entity files
    print("\nPhase 2: Listing entity files...")
    entity_files = list(entity_dir.glob('*.json'))
    total_entity = len(entity_files)
    print(f" Found {total_entity:,} entity files")
    if args.limit:
        entity_files = entity_files[:args.limit]
        print(f" Limited to {args.limit} files for this run")
    # Phase 3: Process files
    print(f"\nPhase 3: Processing ALL files (workers={args.workers}, dry_run={args.dry_run})...")
    print(" Note: NO filtering - all profiles are migrated with classification tags")
    process_args = [
        (str(f), existing_ppids, str(person_dir), args.dry_run)
        for f in entity_files
    ]
    results = {'migrated': 0, 'error': 0}
    classifications = {'human': 0, 'institution': 0, 'anonymous': 0, 'unknown': 0}
    collisions = 0
    samples = []
    batch_size = 1000
    for batch_start in range(0, len(process_args), batch_size):
        batch_end = min(batch_start + batch_size, len(process_args))
        batch = process_args[batch_start:batch_end]
        with Pool(args.workers) as pool:
            batch_results = pool.map(process_entity_file, batch)
        for status, ppid_or_error, classification, file_path in batch_results:
            if status == 'migrated':
                results['migrated'] += 1
                classifications[classification] = classifications.get(classification, 0) + 1
                # A collision suffix is the first 8 chars of uuid4() - always
                # lowercase hex - while name tokens are uppercase, so an exact
                # lowercase-hex match on the last '-' token is unambiguous.
                # (The old length-8 heuristic miscounted names whose final
                # token happened to be 8 characters, e.g. ...-WILLIAMS.)
                if re.fullmatch(r'[0-9a-f]{8}', ppid_or_error.rsplit('-', 1)[-1]):
                    collisions += 1
                existing_ppids.add(ppid_or_error)
                if args.verbose:
                    # --verbose was parsed but never used; honor its help text.
                    print(f"   migrated: {ppid_or_error} <- {Path(file_path).name}")
                if len(samples) < 5:
                    samples.append((ppid_or_error, classification, Path(file_path).name))
            else:
                results['error'] += 1
                print(f" ERROR: {file_path}: {ppid_or_error}")
        processed = batch_end
        pct = (processed / len(process_args)) * 100
        print(f" Progress: {processed:,}/{len(process_args):,} ({pct:.1f}%) - "
              f"H:{classifications['human']:,} I:{classifications['institution']:,} "
              f"A:{classifications['anonymous']:,} U:{classifications['unknown']:,} "
              f"Collisions:{collisions}")
    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f" Total processed: {results['migrated'] + results['error']:,}")
    print(f" Successfully migrated: {results['migrated']:,}")
    print(f" Errors: {results['error']}")
    print(f" Collisions (UUID suffix added): {collisions}")
    print(f"\n Classification breakdown:")
    print(f" Human: {classifications['human']:,}")
    print(f" Institution: {classifications['institution']:,}")
    print(f" Anonymous: {classifications['anonymous']:,}")
    print(f" Unknown: {classifications['unknown']:,}")
    if samples:
        print(f"\n Sample migrated profiles:")
        for ppid, classification, source in samples:
            print(f" [{classification:11}] {ppid[:60]}... <- {source[:40]}...")
    if args.dry_run:
        print(f"\n To execute migration, run without --dry-run flag")
    else:
        final_count = len(list(person_dir.glob('ID_*.json')))
        print(f"\n Migration complete!")
        print(f" Final PPID count: {final_count:,}")
# Script entry point: run the migration CLI.
if __name__ == '__main__':
    main()