# glam/scripts/analyze_person_entities.py
# 2026-01-09 18:26:58 +01:00
#
# 284 lines
# 11 KiB
# Python
#
#!/usr/bin/env python3
"""
Analyze person entity files in data/custodian/person/entity/
Tasks:
1. Count unique LinkedIn slugs vs total files
2. Identify duplicates (same person, multiple timestamps)
3. Count heritage_relevant: true vs false
4. Analyze data completeness (birth date, location, affiliations)
5. Generate statistics for PPID implementation planning
"""
import json
import os
import re
from collections import defaultdict
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime
def extract_slug_and_timestamp(filename: str) -> tuple[str, str]:
    """Extract the LinkedIn slug and capture timestamp from an entity filename.

    Expected format: {linkedin-slug}_{ISO-timestamp}.json
    Example: iris-van-meer-34329131_20251211T000000Z.json

    Args:
        filename: Bare filename (not a full path), possibly URL-encoded.

    Returns:
        ``(slug, timestamp)`` where the slug is URL-decoded and the
        timestamp is ``''`` when no underscore-separated suffix exists.
    """
    # removesuffix strips only a trailing '.json'; the previous
    # str.replace('.json', '') would also have mangled a slug that
    # happened to contain '.json' in the middle.
    base = filename.removesuffix('.json')
    # The timestamp is always the last underscore-separated component.
    slug, sep, timestamp = base.rpartition('_')
    if sep:
        return unquote(slug), timestamp
    return unquote(base), ''
def parse_timestamp(ts: str) -> datetime:
    """Parse a compact ISO timestamp such as ``20251211T000000Z``.

    Returns ``datetime.min`` for malformed input, so unparseable
    timestamps sort before every valid one.
    """
    fmt = '%Y%m%dT%H%M%SZ'
    try:
        parsed = datetime.strptime(ts, fmt)
    except ValueError:
        parsed = datetime.min
    return parsed
def analyze_person_file(filepath: Path) -> dict:
    """Analyze a single person entity file.

    Args:
        filepath: Path to a ``{slug}_{timestamp}.json`` entity file.

    Returns:
        A flat stats dict (slug, timestamp, heritage flags, completeness
        booleans, counts), or ``{'error': ..., 'path': ...}`` when the
        file cannot be parsed or is not a JSON object.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return {'error': str(e), 'path': str(filepath)}
    if not isinstance(data, dict):
        # Valid JSON whose top level is a list/scalar would otherwise
        # crash the whole run with an uncaught AttributeError below.
        return {'error': 'top-level JSON is not an object', 'path': str(filepath)}

    def section(key: str) -> dict:
        """Return a nested dict section, tolerating explicit null values.

        data.get(key, {}) is not enough: an explicit ``"key": null`` in
        the JSON yields None, and None.get(...) raises AttributeError.
        """
        value = data.get(key)
        return value if isinstance(value, dict) else {}

    profile = section('profile_data')
    fallback = section('fallback_data')
    staff = section('source_staff_info')
    relevance = section('heritage_relevance')
    meta = section('extraction_metadata')
    affiliations = data.get('affiliations') or []
    web_claims = data.get('web_claims') or []

    # Parse the filename once instead of twice.
    slug, timestamp = extract_slug_and_timestamp(filepath.name)

    result = {
        'path': str(filepath),
        'slug': slug,
        'timestamp': timestamp,
        # Check multiple locations for the heritage_relevant flag; the
        # nested flag wins unless it is falsy.
        'heritage_relevant': (
            relevance.get('is_heritage_relevant') or
            data.get('heritage_relevant')
        ),
        'heritage_types': relevance.get('heritage_types', []),
        'has_name': bool(profile.get('name') or
                         fallback.get('name') or
                         staff.get('name')),
        'has_headline': bool(profile.get('headline') or
                             fallback.get('headline') or
                             staff.get('headline')),
        'has_affiliations': len(affiliations) > 0,
        'affiliation_count': len(affiliations),
        'has_web_claims': len(web_claims) > 0,
        'web_claim_count': len(web_claims),
        'has_extraction_metadata': bool(data.get('extraction_metadata')),
        'extraction_agent': meta.get('extraction_agent'),
        'extraction_method': meta.get('extraction_method'),
        'has_birth_date': False,  # set below from web claims
        'has_birth_place': False,
        'has_current_location': False,
        'name': (profile.get('name') or staff.get('name') or ''),
        'custodian': staff.get('custodian', ''),
    }

    # Check web claims for birth/location data.
    for claim in web_claims:
        if not isinstance(claim, dict):
            continue
        # Lowercase once; the original recomputed .lower() up to 6x per claim.
        claim_type = claim.get('claim_type', '').lower()
        if 'birth' in claim_type and 'date' in claim_type:
            result['has_birth_date'] = True
        if 'birth' in claim_type and 'place' in claim_type:
            result['has_birth_place'] = True
        if 'location' in claim_type or 'address' in claim_type:
            result['has_current_location'] = True

    # Check affiliations for location data.
    for aff in affiliations:
        if isinstance(aff, dict) and aff.get('location'):
            result['has_current_location'] = True
            break
    return result
def main(entity_dir='/Users/kempersc/apps/glam/data/custodian/person/entity',
         report_path='/Users/kempersc/apps/glam/data/custodian/person/entity_analysis_report.json'):
    """Analyze all person entity files and print/save a summary report.

    Args:
        entity_dir: Directory holding ``{slug}_{timestamp}.json`` entity
            files. Defaults to the original hard-coded location so the
            CLI entry point is unaffected.
        report_path: Destination for the machine-readable JSON report.
    """
    entity_dir = Path(entity_dir)
    report_path = Path(report_path)
    if not entity_dir.exists():
        print(f"ERROR: Directory not found: {entity_dir}")
        return

    # Collect all JSON files
    json_files = list(entity_dir.glob('*.json'))
    print(f"Found {len(json_files)} JSON files")

    # Group by LinkedIn slug
    slug_to_files = defaultdict(list)
    all_results = []
    errors = []
    for i, filepath in enumerate(json_files):
        if i % 1000 == 0:
            print(f"Processing {i}/{len(json_files)}...")
        result = analyze_person_file(filepath)
        if 'error' in result:
            errors.append(result)
            continue
        all_results.append(result)
        slug_to_files[result['slug']].append({
            'path': result['path'],
            'timestamp': result['timestamp']
        })

    def pct(count):
        """Format count as a percentage of parsed files.

        Guard: len(all_results) can be 0 (empty directory or every file
        failing to parse), which previously raised ZeroDivisionError.
        """
        total = len(all_results)
        return f"{100 * count / total:.1f}%" if total else "n/a"

    # Calculate statistics
    print("\n" + "=" * 60)
    print("PERSON ENTITY ANALYSIS REPORT")
    print("=" * 60)
    print(f"\n📁 FILE COUNTS:")
    print(f" Total JSON files: {len(json_files)}")
    print(f" Successfully parsed: {len(all_results)}")
    print(f" Parse errors: {len(errors)}")
    print(f"\n👤 UNIQUE PERSONS:")
    print(f" Unique LinkedIn slugs: {len(slug_to_files)}")

    # Duplicates: same slug captured at multiple timestamps.
    duplicates = {k: v for k, v in slug_to_files.items() if len(v) > 1}
    print(f" Slugs with duplicates: {len(duplicates)}")
    print(f" Total duplicate files: {sum(len(v) - 1 for v in duplicates.values())}")

    # Heritage relevance (tri-state: True / False / null-or-missing)
    heritage_true = sum(1 for r in all_results if r['heritage_relevant'] is True)
    heritage_false = sum(1 for r in all_results if r['heritage_relevant'] is False)
    heritage_none = sum(1 for r in all_results if r['heritage_relevant'] is None)
    print(f"\n🏛️ HERITAGE RELEVANCE:")
    print(f" heritage_relevant: true = {heritage_true}")
    print(f" heritage_relevant: false = {heritage_false}")
    print(f" heritage_relevant: null/missing = {heritage_none}")

    # Data completeness
    has_affiliations = sum(1 for r in all_results if r['has_affiliations'])
    has_web_claims = sum(1 for r in all_results if r['has_web_claims'])
    has_birth_date = sum(1 for r in all_results if r['has_birth_date'])
    has_birth_place = sum(1 for r in all_results if r['has_birth_place'])
    has_location = sum(1 for r in all_results if r['has_current_location'])
    print(f"\n📊 DATA COMPLETENESS:")
    print(f" Has affiliations: {has_affiliations} ({pct(has_affiliations)})")
    print(f" Has web claims: {has_web_claims} ({pct(has_web_claims)})")
    print(f" Has birth date: {has_birth_date} ({pct(has_birth_date)})")
    print(f" Has birth place: {has_birth_place} ({pct(has_birth_place)})")
    print(f" Has any location: {has_location} ({pct(has_location)})")

    # Extraction agents
    agents = defaultdict(int)
    for r in all_results:
        agents[r.get('extraction_agent') or 'unknown'] += 1
    print(f"\n🤖 EXTRACTION AGENTS:")
    for agent, count in sorted(agents.items(), key=lambda x: -x[1]):
        print(f" {agent}: {count}")

    # Extraction methods
    methods = defaultdict(int)
    for r in all_results:
        methods[r.get('extraction_method') or 'unknown'] += 1
    print(f"\n📝 EXTRACTION METHODS:")
    for method, count in sorted(methods.items(), key=lambda x: -x[1]):
        print(f" {method}: {count}")

    # Heritage types breakdown
    heritage_types = defaultdict(int)
    for r in all_results:
        for ht in r.get('heritage_types', []):
            heritage_types[ht] += 1
    print(f"\n🏛️ HERITAGE TYPES (GLAMORCUBESFIXPHDNT):")
    type_labels = {
        'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum',
        'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown',
        'B': 'Botanical/Zoo', 'E': 'Education', 'S': 'Society', 'F': 'Feature',
        'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy',
        'D': 'Digital', 'N': 'NGO', 'T': 'Taste/Smell'
    }
    for ht, count in sorted(heritage_types.items(), key=lambda x: -x[1]):
        label = type_labels.get(ht, 'Unknown')
        print(f" {ht} ({label}): {count}")

    # Show some duplicate examples
    print(f"\n📋 DUPLICATE EXAMPLES (first 5):")
    for slug, files in list(duplicates.items())[:5]:
        print(f" {slug}:")
        for entry in sorted(files, key=lambda x: x['timestamp']):
            print(f" - {entry['timestamp']}")

    # Unique persons after deduplication (keeping latest)
    print(f"\n✅ AFTER DEDUPLICATION:")
    print(f" Unique persons (keeping latest per slug): {len(slug_to_files)}")

    # Keep only the latest record per slug to decide heritage relevance.
    # Timestamps are fixed-width YYYYMMDDTHHMMSSZ, so lexical '>' is
    # chronological here.
    slug_heritage = {}
    for r in all_results:
        slug, ts = r['slug'], r['timestamp']
        if slug not in slug_heritage or ts > slug_heritage[slug]['timestamp']:
            slug_heritage[slug] = {
                'timestamp': ts,
                'heritage_relevant': r['heritage_relevant']
            }
    unique_heritage_true = sum(1 for v in slug_heritage.values() if v['heritage_relevant'] is True)
    unique_heritage_false = sum(1 for v in slug_heritage.values() if v['heritage_relevant'] is False)
    print(f" Heritage relevant (latest per slug): {unique_heritage_true}")
    print(f" Non-heritage (latest per slug): {unique_heritage_false}")
    print(f" Unknown (latest per slug): {len(slug_heritage) - unique_heritage_true - unique_heritage_false}")

    # PPID-eligible count
    print(f"\n🆔 PPID CANDIDATES:")
    print(f" Total unique heritage-relevant persons: {unique_heritage_true}")
    print(f" ID-class (living, no birth/death): {unique_heritage_true}")
    print(f" PID-class eligible (has birth+death): 0 (all living)")

    # Save detailed report
    report = {
        'analysis_timestamp': datetime.now().isoformat(),
        'total_files': len(json_files),
        'unique_slugs': len(slug_to_files),
        'duplicates_count': len(duplicates),
        'heritage_true': heritage_true,
        'heritage_false': heritage_false,
        'unique_heritage_true': unique_heritage_true,
        'unique_heritage_false': unique_heritage_false,
        'has_affiliations': has_affiliations,
        'has_web_claims': has_web_claims,
        'has_birth_date': has_birth_date,
        'has_location': has_location,
        'agents': dict(agents),
        'duplicate_slugs': {k: v for k, v in list(duplicates.items())[:50]},  # First 50
        'errors': errors[:20]  # First 20 errors
    }
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\n💾 Detailed report saved to: {report_path}")
# Allow importing this module (e.g. for testing) without running the analysis.
if __name__ == '__main__':
    main()