glam/scripts/generate_ppids.py
kempersc abe30cb302 feat(ppid): add unidecode support for non-Latin script transliteration
Add optional unidecode dependency to handle Hebrew, Arabic, Chinese,
and other non-Latin scripts when generating Person Persistent IDs.
2026-01-09 18:28:41 +01:00

480 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Generate PPIDs for Person Entity Files
This script:
1. Reads all person entity files from data/custodian/person/entity/
2. Deduplicates by keeping latest timestamp per LinkedIn slug
3. Filters to heritage_relevant: true only
4. Generates ID-class identifiers for living persons
5. Creates data/person/ directory structure with PPID filenames
PPID Format (for living persons with unknown dates/locations):
ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_FIRSTNAME-LASTNAME
Per Rule 44: EDTF notation used for unknown dates (X = unspecified digit)
"""
import json
import os
import re
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import unquote
from typing import Optional
import shutil
# Optional dependency: unidecode transliterates non-Latin scripts
# (Hebrew, Arabic, Chinese, ...) to ASCII for PPID name tokens.
try:
    from unidecode import unidecode as _unidecode
    HAS_UNIDECODE = True  # checked by normalize_name() before use
except ImportError:
    # Degrade gracefully: Latin diacritics are still folded via NFD,
    # but non-Latin names will pass through without transliteration.
    HAS_UNIDECODE = False
    _unidecode = None
    print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
# Dutch tussenvoegsels (particles) to skip in last name token.
# NOTE(review): currently unused — extract_name_tokens() consults
# INTERNATIONAL_PARTICLES instead. Also, names are split per word, so
# the multi-word entries ('van de', ...) could never match a single
# token; confirm whether this set should be removed or merged.
DUTCH_PARTICLES = {
    'van', 'de', 'den', 'der', 'het', "'t", 'te', 'ter', 'ten',
    'van de', 'van den', 'van der', 'van het', "van 't",
    'in de', 'in den', 'in het', "in 't",
    'op de', 'op den', 'op het', "op 't",
    'aan de', 'aan den', 'aan het',
}
# Name particles across several languages, skipped when choosing the
# last-name token in extract_name_tokens(). Matching is done against a
# single lowercased word, so multi-word entries ('de la', 'de los', ...)
# never match in practice — TODO confirm whether compound particles
# should be collapsed before tokenizing.
INTERNATIONAL_PARTICLES = {
    # Dutch
    'van', 'de', 'den', 'der', 'het', "'t", 'te', 'ter', 'ten',
    # German
    'von', 'vom', 'zu', 'zum', 'zur',
    # French
    'de', 'du', 'des', 'la', 'le', 'les', "l'", "d'",
    # Spanish/Portuguese
    'da', 'das', 'do', 'dos', 'del', 'de la', 'de los', 'de las',
    # Italian
    'di', 'della', 'dello', 'dei', 'degli', 'delle',
    # Arabic
    'al', 'el', 'bin', 'ibn', 'abu',
}
def normalize_name(name: str) -> str:
    """Normalize a person name to its ASCII equivalent.

    Latin text with diacritics is folded via NFD decomposition (strip
    combining marks); names containing non-Latin letters (Hebrew,
    Arabic, Chinese, ...) are transliterated with unidecode when that
    optional package is available.

    Args:
        name: Raw name string (may be empty or None-ish).

    Returns:
        ASCII-folded name; empty string for falsy input.
    """
    if not name:
        return ""
    # Letters above U+024F (end of Latin Extended-B) cannot be folded by
    # stripping combining marks alone — they need transliteration.
    has_non_latin = any(
        ord(c) > 0x024F and unicodedata.category(c).startswith('L')
        for c in name
    )
    if has_non_latin and HAS_UNIDECODE:
        # BUG FIX: the module imports the function as `_unidecode`; the
        # original call to the bare name `unidecode` raised NameError
        # whenever this branch was taken.
        ascii_name = _unidecode(name)
    else:
        # NFD splits base character + combining marks; drop the marks (Mn).
        normalized = unicodedata.normalize('NFD', name)
        ascii_name = ''.join(
            c for c in normalized if unicodedata.category(c) != 'Mn'
        )
    return ascii_name
def extract_name_tokens(full_name: str) -> tuple[str, str]:
    """Derive the (FIRST, LAST) uppercase name tokens used in a PPID.

    Particles/tussenvoegsels (van, de, von, al, ...) are skipped when
    picking the last-name token; diacritics are folded to ASCII and any
    non-alphabetic characters are stripped. Tokens that end up empty
    become "UNKNOWN".

    Examples:
        "Jan van den Berg" -> ("JAN", "BERG")
        "Maria de la Cruz" -> ("MARIA", "CRUZ")
        "Vincent van Gogh" -> ("VINCENT", "GOGH")
    """
    if not full_name:
        return ("UNKNOWN", "UNKNOWN")

    words = normalize_name(full_name).split()
    if not words:
        return ("UNKNOWN", "UNKNOWN")

    def _clean(token: str) -> str:
        # Uppercase, then keep alphabetic characters only.
        return re.sub(r'[^A-Z]', '', token.upper())

    # First token is always the first word (particles are not skipped here).
    first_token = _clean(words[0]) or "UNKNOWN"

    # Walk from the end so the surname wins; skip known particles and any
    # word that strips down to nothing.
    last_token = "UNKNOWN"
    for word in reversed(words):
        if word.lower() in INTERNATIONAL_PARTICLES:
            continue
        candidate = _clean(word)
        if candidate:
            last_token = candidate
            break

    return (first_token, last_token)
def extract_slug_and_timestamp(filename: str) -> tuple[str, str]:
    """Split an entity filename into (linkedin-slug, timestamp).

    Filenames follow ``{linkedin-slug}_{ISO-timestamp}.json``, e.g.
    ``iris-van-meer-34329131_20251211T000000Z.json``.

    Returns:
        (url-decoded slug, timestamp); timestamp is '' when the name
        contains no underscore.
    """
    # BUG FIX: str.replace removed *every* '.json' occurrence, mangling
    # slugs such as 'a.json.backup_...'; only strip the extension.
    base = filename.removesuffix('.json')
    # The timestamp is always after the last underscore.
    parts = base.rsplit('_', 1)
    if len(parts) == 2:
        slug = unquote(parts[0])  # URL-decode the slug
        timestamp = parts[1]
        return slug, timestamp
    return unquote(base), ''
def parse_timestamp(ts: str) -> datetime:
    """Parse a compact ISO timestamp such as '20251211T000000Z'.

    Returns ``datetime.min`` for malformed input so bad timestamps sort
    before every valid one.
    """
    try:
        parsed = datetime.strptime(ts, '%Y%m%dT%H%M%SZ')
    except ValueError:
        return datetime.min
    return parsed
def generate_ppid(
    name: str,
    birth_location: Optional[str] = None,
    birth_date: Optional[str] = None,
    death_location: Optional[str] = None,
    death_date: Optional[str] = None,
    is_living: bool = True
) -> str:
    """Build a PPID string: {TYPE}_{FL}_{FD}_{LL}_{LD}_{NT}.

    A living person with fully unknown data yields:
    ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_FIRSTNAME-LASTNAME

    Unknown locations fall back to 'XX-XX-XXX' and unknown dates to the
    EDTF placeholder 'XXXX'.
    """
    # PID is reserved for deceased+verified persons; everyone else is ID.
    id_type = "PID" if not is_living else "ID"

    # Substitute placeholders for any component that was not supplied.
    first_location = birth_location or "XX-XX-XXX"
    first_date = birth_date or "XXXX"
    last_location = death_location or "XX-XX-XXX"
    last_date = death_date or "XXXX"

    first_token, last_token = extract_name_tokens(name)

    segments = [
        id_type,
        first_location,
        first_date,
        last_location,
        last_date,
        f"{first_token}-{last_token}",
    ]
    return "_".join(segments)
def load_person_entity(filepath: Path) -> Optional[dict]:
    """Load and parse a person entity JSON file.

    Returns None (after logging) on parse errors or unreadable/missing
    files, so a single bad file cannot abort a batch run.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        # OSError added: a vanished or unreadable file previously
        # crashed the whole run instead of being counted as an error.
        print(f" ERROR: Failed to parse {filepath.name}: {e}")
        return None
def get_person_name(data: dict) -> str:
    """Return the person's name, trying known sections in priority order.

    Checks profile_data, then source_staff_info, then fallback_data;
    returns the first non-empty name, stripped, or '' if none is found.
    """
    for section in ('profile_data', 'source_staff_info', 'fallback_data'):
        candidate = data.get(section, {}).get('name')
        if candidate:
            return candidate.strip()
    return ''
def is_heritage_relevant(data: dict) -> bool:
    """Check if person is heritage-relevant.

    Prefers the nested ``heritage_relevance.is_heritage_relevant`` flag;
    falls back to the flat ``heritage_relevant`` field when the nested
    structure is absent or not a dict. Only the literal True counts.
    """
    # BUG FIX: the original defaulted the lookup to {}, which is a dict,
    # so the flat-field fallback below was unreachable whenever the
    # nested key was simply missing.
    hr = data.get('heritage_relevance')
    if isinstance(hr, dict):
        return hr.get('is_heritage_relevant', False) is True
    # Fallback: flat boolean field.
    return data.get('heritage_relevant', False) is True
def get_current_location(data: dict) -> Optional[str]:
    """Try to extract a current work location from affiliations.

    Currently always returns None: raw location strings still need a
    GeoNames lookup to be converted into CC-RR-PPP codes (see TODO).
    """
    for affiliation in data.get('affiliations', []):
        if not isinstance(affiliation, dict):
            continue
        if affiliation.get('location'):
            # TODO: Convert location string to CC-RR-PPP format
            # For now, fall through (needs GeoNames lookup).
            pass
    return None
def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
    """Assemble a new PPID entity record from source entity data.

    Every generated entity is an ID-class living person with unknown
    birth/death dates and locations (EDTF 'XXXX' / 'XX-XX-XXX'
    placeholders); enrichment is deferred to a later manual step.
    """
    name = get_person_name(data)
    tokens = extract_name_tokens(name)
    extraction = data.get('extraction_metadata', {})

    return {
        "ppid": ppid,
        "ppid_type": "ID",  # All living persons are ID class
        "ppid_components": {
            "type": "ID",
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": tokens
        },
        "name": {
            "full_name": name,
            "name_tokens": tokens,
            "source": "linkedin_profile"
        },
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown"
        },
        "is_living": True,
        "heritage_relevance": data.get('heritage_relevance', {}),
        "affiliations": data.get('affiliations', []),
        "profile_data": data.get('profile_data', {}),
        "web_claims": data.get('web_claims', []),
        "source_observations": [
            {
                "source_file": source_file,
                "observed_on": extraction.get('extraction_date'),
                "extraction_agent": extraction.get('extraction_agent')
            }
        ],
        "enrichment_metadata": {
            "birth_date_search": {
                "attempted": False,
                "notes": "Not yet searched - requires manual enrichment"
            }
        },
        "provenance": {
            "created_at": datetime.now(timezone.utc).isoformat(),
            "created_by": "generate_ppids.py",
            "source_files": [source_file]
        }
    }
def main(
    entity_dir: Optional[Path] = None,
    output_dir: Optional[Path] = None,
) -> None:
    """Run the full PPID generation pipeline.

    Steps:
      1. Load all person entity JSON files from ``entity_dir``.
      2. Deduplicate by LinkedIn slug, keeping the latest timestamp.
      3. Keep heritage-relevant persons only.
      4. Generate ID-class PPIDs (living persons, unknown dates/places),
         resolving collisions with a LinkedIn-slug suffix.
      5. Write one JSON file per PPID plus a ``_manifest.json``.

    Args:
        entity_dir: Source directory of person entity files. Defaults to
            the original hard-coded project path (kept for backward
            compatibility).
        output_dir: Destination directory, created if missing; same
            default behavior as ``entity_dir``.
    """
    # Paths are parameters now so the script is usable outside the
    # original author's machine; the defaults preserve prior behavior.
    if entity_dir is None:
        entity_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
    if output_dir is None:
        output_dir = Path('/Users/kempersc/apps/glam/data/person')
    if not entity_dir.exists():
        print(f"ERROR: Entity directory not found: {entity_dir}")
        return
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    print("="*60)
    print("PPID GENERATION FOR PERSON ENTITIES")
    print("="*60)
    # Collect all JSON files
    json_files = list(entity_dir.glob('*.json'))
    print(f"\nFound {len(json_files)} JSON files")
    # Group by LinkedIn slug, keeping only the latest timestamp.
    slug_to_latest = {}  # slug -> (filepath, timestamp, data)
    errors = []
    print("\n📂 STEP 1: Loading and deduplicating files...")
    for i, filepath in enumerate(json_files):
        if i % 1000 == 0 and i > 0:
            print(f" Processing {i}/{len(json_files)}...")
        slug, timestamp = extract_slug_and_timestamp(filepath.name)
        data = load_person_entity(filepath)
        if data is None:
            errors.append(str(filepath))
            continue
        # Lexicographic comparison is valid here: timestamps are
        # fixed-width YYYYMMDDTHHMMSSZ strings.
        if slug not in slug_to_latest or timestamp > slug_to_latest[slug][1]:
            slug_to_latest[slug] = (filepath, timestamp, data)
    print(f" Loaded: {len(slug_to_latest)} unique persons")
    print(f" Errors: {len(errors)}")
    # Filter to heritage-relevant only
    print("\n🏛️ STEP 2: Filtering to heritage-relevant persons...")
    heritage_relevant = {}
    non_heritage = 0
    unknown_heritage = 0
    for slug, (filepath, timestamp, data) in slug_to_latest.items():
        if is_heritage_relevant(data):
            heritage_relevant[slug] = (filepath, timestamp, data)
        elif data.get('heritage_relevance', {}).get('is_heritage_relevant') is False:
            non_heritage += 1
        else:
            unknown_heritage += 1
    print(f" Heritage relevant: {len(heritage_relevant)}")
    print(f" Non-heritage: {non_heritage}")
    print(f" Unknown: {unknown_heritage}")
    # Generate PPIDs
    print("\n🆔 STEP 3: Generating PPIDs...")
    ppid_entities = []
    ppid_collisions = defaultdict(list)  # ppid -> list of slugs
    for slug, (filepath, timestamp, data) in heritage_relevant.items():
        name = get_person_name(data)
        if not name:
            print(f" WARNING: No name found for {slug}, skipping")
            continue
        # All persons are treated as living with unknown dates/locations.
        ppid = generate_ppid(name=name, is_living=True)
        # Track collisions
        ppid_collisions[ppid].append(slug)
        # Create entity
        entity = create_ppid_entity(data, ppid, str(filepath))
        entity['linkedin_slug'] = slug
        ppid_entities.append(entity)
    print(f" Generated {len(ppid_entities)} PPIDs")
    # Handle collisions
    collision_count = sum(1 for slugs in ppid_collisions.values() if len(slugs) > 1)
    print(f" Collisions detected: {collision_count}")
    if collision_count > 0:
        print("\n⚠️ STEP 3b: Resolving collisions with LinkedIn slug suffix...")
        # Disambiguate colliding PPIDs with a sanitized slug suffix.
        for entity in ppid_entities:
            base_ppid = entity['ppid']
            if len(ppid_collisions[base_ppid]) > 1:
                slug = entity['linkedin_slug']
                # Replace anything outside [a-z0-9] for a filename-safe suffix.
                safe_slug = re.sub(r'[^a-z0-9]', '_', slug.lower())
                entity['ppid'] = f"{base_ppid}-{safe_slug}"
                entity['ppid_collision_suffix'] = safe_slug
    # Save entities
    print(f"\n💾 STEP 4: Saving {len(ppid_entities)} entities to {output_dir}...")
    saved = 0
    save_errors = 0
    for entity in ppid_entities:
        ppid = entity['ppid']
        # Create a safe filename (replace path separators).
        safe_filename = ppid.replace('/', '_').replace('\\', '_')
        output_path = output_dir / f"{safe_filename}.json"
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(entity, f, indent=2, ensure_ascii=False)
            saved += 1
        except Exception as e:
            print(f" ERROR saving {ppid}: {e}")
            save_errors += 1
    print(f" Saved: {saved}")
    print(f" Errors: {save_errors}")
    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f" Input files: {len(json_files)}")
    print(f" Unique persons: {len(slug_to_latest)}")
    print(f" Heritage relevant: {len(heritage_relevant)}")
    print(f" PPIDs generated: {len(ppid_entities)}")
    print(f" Collisions resolved: {collision_count}")
    print(f" Files saved: {saved}")
    print(f" Output directory: {output_dir}")
    # Save manifest with run statistics and unresolved-collision detail.
    manifest = {
        "generation_timestamp": datetime.now(timezone.utc).isoformat(),
        "input_directory": str(entity_dir),
        "output_directory": str(output_dir),
        "statistics": {
            "input_files": len(json_files),
            "unique_persons": len(slug_to_latest),
            "heritage_relevant": len(heritage_relevant),
            "ppids_generated": len(ppid_entities),
            "collisions_resolved": collision_count,
            "files_saved": saved
        },
        "collisions": {
            ppid: slugs for ppid, slugs in ppid_collisions.items()
            if len(slugs) > 1
        }
    }
    manifest_path = output_dir / "_manifest.json"
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    print(f"\n Manifest saved to: {manifest_path}")
# Script entry point: run the full PPID generation pipeline.
if __name__ == '__main__':
    main()