#!/usr/bin/env python3
"""
Migrate entity profiles from data/custodian/person/entity/ to data/person/

This script:
1. Reads entity profiles that are NOT already in data/person/
2. Filters out non-human profiles (institutions, anonymous LinkedIn members)
3. Generates PPID based on profile data
4. Preserves ALL data including web_claims with XPath provenance
5. Creates proper PPID file in data/person/

Usage:
    python scripts/migrate_entity_to_ppid_v2.py --dry-run --limit 5  # Preview 5 profiles
    python scripts/migrate_entity_to_ppid_v2.py --dry-run            # Preview all
    python scripts/migrate_entity_to_ppid_v2.py                      # Execute migration
"""
import json
|
|
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
from urllib.parse import unquote
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
import unicodedata
|
|
|
|
# Patterns for detecting non-human profiles
|
|
NON_HUMAN_PATTERNS = [
|
|
r'^LinkedIn\s+Member$',
|
|
r'^TheMuseumsLab$',
|
|
r'Museum$',
|
|
r'Foundation$',
|
|
r'Stichting\s',
|
|
r'^ICOM\s',
|
|
r'^Fondazione\s',
|
|
r'Institute$',
|
|
r'Organisation$',
|
|
r'Organization$',
|
|
r'University$',
|
|
r'^Google\s',
|
|
r'^Sound\s+Heritage$',
|
|
r'^Company\s',
|
|
r'^Computational\s+Research$',
|
|
]
|
|
|
|
def extract_linkedin_slug(url):
|
|
"""Extract LinkedIn slug from URL."""
|
|
if not url or 'linkedin.com/in/' not in url:
|
|
return None
|
|
slug = url.split('linkedin.com/in/')[-1].rstrip('/').split('?')[0]
|
|
slug = unquote(slug)
|
|
return slug.lower()
|
|
|
|
def is_human_profile(name, profile_data):
|
|
"""Determine if profile represents a human being (not an institution)."""
|
|
if not name:
|
|
return False
|
|
|
|
# Check against non-human patterns
|
|
for pattern in NON_HUMAN_PATTERNS:
|
|
if re.search(pattern, name, re.IGNORECASE):
|
|
return False
|
|
|
|
# LinkedIn Member with no URL is anonymous
|
|
if name == 'LinkedIn Member' and not profile_data.get('linkedin_url'):
|
|
return False
|
|
|
|
return True
|
|
|
|
def normalize_name_for_ppid(name):
|
|
"""Convert name to PPID format: FIRST-LAST"""
|
|
if not name:
|
|
return "UNKNOWN"
|
|
|
|
# Remove titles/suffixes
|
|
name = re.sub(r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?', '', name, flags=re.IGNORECASE)
|
|
|
|
# Split and clean
|
|
parts = [p.strip() for p in name.split() if p.strip()]
|
|
if not parts:
|
|
return "UNKNOWN"
|
|
|
|
def normalize_part(p):
|
|
nfkd = unicodedata.normalize('NFKD', p)
|
|
ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
|
|
return re.sub(r'[^A-Za-z]', '', ascii_name).upper()
|
|
|
|
normalized = [normalize_part(p) for p in parts if normalize_part(p)]
|
|
if not normalized:
|
|
return "UNKNOWN"
|
|
|
|
return '-'.join(normalized)
|
|
|
|
def generate_ppid(name):
|
|
"""Generate PPID from name (locations/dates use XX placeholders)."""
|
|
birth_loc = "XX-XX-XXX"
|
|
birth_date = "XXXX"
|
|
current_loc = "XX-XX-XXX"
|
|
death_date = "XXXX"
|
|
|
|
name_token = normalize_name_for_ppid(name)
|
|
|
|
return f"ID_{birth_loc}_{birth_date}_{current_loc}_{death_date}_{name_token}"
|
|
|
|
def transform_entity_to_ppid(entity_data, entity_file):
|
|
"""Transform entity profile to PPID format, preserving ALL data."""
|
|
|
|
name = entity_data.get('profile_data', {}).get('name') or entity_data.get('name', 'Unknown')
|
|
ppid = generate_ppid(name)
|
|
|
|
# Build comprehensive PPID profile preserving ALL source data
|
|
ppid_profile = {
|
|
# PPID identification
|
|
"ppid": ppid,
|
|
"ppid_type": "ID",
|
|
"ppid_components": {
|
|
"type": "ID",
|
|
"first_location": "XX-XX-XXX",
|
|
"first_date": "XXXX",
|
|
"last_location": "XX-XX-XXX",
|
|
"last_date": "XXXX",
|
|
"name_tokens": normalize_name_for_ppid(name).split('-')
|
|
},
|
|
|
|
# Basic identity
|
|
"name": name,
|
|
"birth_date": {
|
|
"edtf": "XXXX",
|
|
"precision": "unknown",
|
|
"note": "Not yet enriched - requires manual research"
|
|
},
|
|
"is_living": True,
|
|
|
|
# Heritage relevance (preserve from source)
|
|
"heritage_relevance": entity_data.get('heritage_relevance', {
|
|
"is_heritage_relevant": True, # Default to true since from custodian context
|
|
"heritage_types": [],
|
|
"rationale": "Extracted from heritage custodian LinkedIn page"
|
|
}),
|
|
|
|
# Affiliations (preserve ALL)
|
|
"affiliations": entity_data.get('affiliations', []),
|
|
|
|
# Profile data (preserve ALL)
|
|
"profile_data": entity_data.get('profile_data', {}),
|
|
|
|
# Web claims with full provenance (preserve ALL)
|
|
"web_claims": entity_data.get('web_claims', []),
|
|
|
|
# Source observations (preserve ALL)
|
|
"source_observations": entity_data.get('source_observations', []),
|
|
|
|
# Original extraction metadata
|
|
"extraction_metadata": entity_data.get('extraction_metadata', {}),
|
|
|
|
# Migration metadata
|
|
"migration_metadata": {
|
|
"original_entity_file": entity_file.name,
|
|
"original_person_id": entity_data.get('person_id'),
|
|
"original_linkedin_slug": entity_data.get('linkedin_slug'),
|
|
"migrated_at": datetime.now(timezone.utc).isoformat(),
|
|
"migration_script": "migrate_entity_to_ppid_v2.py",
|
|
"migration_version": "2.0"
|
|
}
|
|
}
|
|
|
|
return ppid, ppid_profile
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Migrate entity profiles to PPID format (v2)')
|
|
parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
|
|
parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
|
|
parser.add_argument('--verbose', action='store_true', help='Show detailed output for each profile')
|
|
args = parser.parse_args()
|
|
|
|
entity_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
|
|
person_dir = Path('/Users/kempersc/apps/glam/data/person')
|
|
|
|
# 1. Get existing LinkedIn slugs in data/person/
|
|
print("=" * 60)
|
|
print("PPID MIGRATION SCRIPT v2.0")
|
|
print("=" * 60)
|
|
print("\nPhase 1: Loading existing PPID profiles...")
|
|
existing_slugs = set()
|
|
for f in person_dir.glob('ID_*.json'):
|
|
try:
|
|
data = json.load(open(f))
|
|
if 'profile_data' in data:
|
|
url = data['profile_data'].get('linkedin_url')
|
|
if url:
|
|
slug = extract_linkedin_slug(url)
|
|
if slug:
|
|
existing_slugs.add(slug)
|
|
except:
|
|
pass
|
|
|
|
print(f" Found {len(existing_slugs):,} existing LinkedIn slugs in data/person/")
|
|
|
|
# 2. Find entity profiles NOT in data/person/
|
|
print("\nPhase 2: Scanning entity profiles...")
|
|
to_migrate = []
|
|
skipped_existing = 0
|
|
skipped_no_linkedin = 0
|
|
skipped_non_human = 0
|
|
|
|
entity_files = list(entity_dir.glob('*.json'))
|
|
print(f" Found {len(entity_files):,} entity files to scan")
|
|
|
|
for f in entity_files:
|
|
try:
|
|
data = json.load(open(f))
|
|
name = data.get('profile_data', {}).get('name') or data.get('name', '')
|
|
|
|
# Skip non-human profiles
|
|
if not is_human_profile(name, data.get('profile_data', {})):
|
|
skipped_non_human += 1
|
|
continue
|
|
|
|
# Check for LinkedIn URL
|
|
linkedin_url = data.get('profile_data', {}).get('linkedin_url')
|
|
if not linkedin_url:
|
|
skipped_no_linkedin += 1
|
|
continue
|
|
|
|
slug = extract_linkedin_slug(linkedin_url)
|
|
if slug and slug not in existing_slugs:
|
|
to_migrate.append((f, data, slug))
|
|
elif slug:
|
|
skipped_existing += 1
|
|
|
|
except Exception as e:
|
|
pass
|
|
|
|
print(f"\n Scan Results:")
|
|
print(f" Already in PPID: {skipped_existing:,}")
|
|
print(f" Skipped (non-human): {skipped_non_human:,}")
|
|
print(f" Skipped (no LinkedIn): {skipped_no_linkedin:,}")
|
|
print(f" TO MIGRATE: {len(to_migrate):,}")
|
|
|
|
if args.limit:
|
|
to_migrate = to_migrate[:args.limit]
|
|
print(f"\n Limited to {args.limit} profiles for this run")
|
|
|
|
# 3. Migrate profiles
|
|
print("\nPhase 3: Migrating profiles...")
|
|
migrated = 0
|
|
errors = 0
|
|
collision_count = 0
|
|
|
|
for entity_file, data, slug in to_migrate:
|
|
try:
|
|
ppid, ppid_profile = transform_entity_to_ppid(data, entity_file)
|
|
output_file = person_dir / f"{ppid}.json"
|
|
|
|
# Handle collisions with counter suffix
|
|
original_ppid = ppid
|
|
counter = 1
|
|
while output_file.exists():
|
|
collision_count += 1
|
|
ppid = f"{original_ppid}-{counter}"
|
|
ppid_profile['ppid'] = ppid
|
|
output_file = person_dir / f"{ppid}.json"
|
|
counter += 1
|
|
|
|
name = ppid_profile['name']
|
|
web_claims_count = len(ppid_profile.get('web_claims', []))
|
|
affiliations_count = len(ppid_profile.get('affiliations', []))
|
|
|
|
if args.verbose or args.dry_run:
|
|
print(f"\n {'[DRY-RUN] ' if args.dry_run else ''}Creating: {output_file.name}")
|
|
print(f" Name: {name}")
|
|
print(f" LinkedIn slug: {slug}")
|
|
print(f" Web claims: {web_claims_count}")
|
|
print(f" Affiliations: {affiliations_count}")
|
|
if ppid_profile.get('source_observations'):
|
|
print(f" Source observations: {len(ppid_profile['source_observations'])}")
|
|
|
|
if not args.dry_run:
|
|
with open(output_file, 'w') as f:
|
|
json.dump(ppid_profile, f, indent=2, ensure_ascii=False)
|
|
|
|
migrated += 1
|
|
|
|
except Exception as e:
|
|
print(f" ERROR processing {entity_file.name}: {e}")
|
|
errors += 1
|
|
|
|
# Summary
|
|
print("\n" + "=" * 60)
|
|
print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
|
|
print("=" * 60)
|
|
print(f" Profiles migrated: {migrated:,}")
|
|
print(f" Name collisions resolved: {collision_count}")
|
|
print(f" Errors: {errors}")
|
|
|
|
if args.dry_run:
|
|
print(f"\n To execute migration, run without --dry-run flag")
|
|
else:
|
|
print(f"\n Migration complete!")
|
|
print(f" New profile count: {len(list(person_dir.glob('ID_*.json'))):,}")
|
|
|
|
if __name__ == '__main__':
|
|
main()
|