#!/usr/bin/env python3
|
|
"""
|
|
Analyze person entity files in data/custodian/person/entity/
|
|
|
|
Tasks:
|
|
1. Count unique LinkedIn slugs vs total files
|
|
2. Identify duplicates (same person, multiple timestamps)
|
|
3. Count heritage_relevant: true vs false
|
|
4. Analyze data completeness (birth date, location, affiliations)
|
|
5. Generate statistics for PPID implementation planning
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from urllib.parse import unquote
|
|
from datetime import datetime
|
|
|
|
|
|
def extract_slug_and_timestamp(filename: str) -> tuple[str, str]:
    """Extract LinkedIn slug and timestamp from a person-entity filename.

    Format: {linkedin-slug}_{ISO-timestamp}.json
    Example: iris-van-meer-34329131_20251211T000000Z.json

    Args:
        filename: Bare filename (not a path), normally ending in ``.json``.

    Returns:
        ``(slug, timestamp)`` with the slug URL-decoded. When no underscore
        separator is present the whole base name is returned as the slug and
        the timestamp is ``''``.
    """
    # Strip only a TRAILING '.json'. The previous str.replace('.json', '')
    # removed every occurrence, mangling any slug containing '.json'.
    base = filename.removesuffix('.json')

    # The timestamp is always the final underscore-separated component.
    slug_part, sep, timestamp = base.rpartition('_')
    if sep:
        return unquote(slug_part), timestamp  # URL-decode the slug
    return unquote(base), ''
|
|
|
|
|
|
def parse_timestamp(ts: str) -> datetime:
    """Parse a compact ISO timestamp such as ``20251211T000000Z``.

    Strings that do not match the ``%Y%m%dT%H%M%SZ`` layout yield
    ``datetime.min`` so malformed timestamps sort before valid ones.
    """
    try:
        parsed = datetime.strptime(ts, '%Y%m%dT%H%M%SZ')
    except ValueError:
        parsed = datetime.min
    return parsed
|
|
|
|
|
|
def analyze_person_file(filepath: Path) -> dict:
    """Analyze a single person entity JSON file.

    Args:
        filepath: Path to a ``{slug}_{timestamp}.json`` entity file.

    Returns:
        A flat dict of slug/timestamp, heritage flags and data-completeness
        fields, or ``{'error': ..., 'path': ...}`` when the file cannot be
        read or parsed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        # OSError added: an unreadable file (permissions, deleted mid-scan)
        # should be reported like a corrupt one, not abort the whole run.
        return {'error': str(e), 'path': str(filepath)}

    # Parse the filename once (the original called this twice).
    slug, timestamp = extract_slug_and_timestamp(filepath.name)

    # The same fields live in several historical locations in these files;
    # hoist the sub-dicts so each is looked up once.
    profile = data.get('profile_data', {})
    fallback = data.get('fallback_data', {})
    staff = data.get('source_staff_info', {})
    heritage = data.get('heritage_relevance', {})
    meta = data.get('extraction_metadata', {})
    affiliations = data.get('affiliations', [])
    web_claims = data.get('web_claims', [])

    result = {
        'path': str(filepath),
        'slug': slug,
        'timestamp': timestamp,
        # Check multiple locations for the heritage_relevant flag: the newer
        # nested schema first, then the legacy top-level key.
        'heritage_relevant': (
            heritage.get('is_heritage_relevant') or
            data.get('heritage_relevant')
        ),
        'heritage_types': heritage.get('heritage_types', []),
        'has_name': bool(profile.get('name') or
                         fallback.get('name') or
                         staff.get('name')),
        'has_headline': bool(profile.get('headline') or
                             fallback.get('headline') or
                             staff.get('headline')),
        'has_affiliations': len(affiliations) > 0,
        'affiliation_count': len(affiliations),
        'has_web_claims': len(web_claims) > 0,
        'web_claim_count': len(web_claims),
        'has_extraction_metadata': bool(data.get('extraction_metadata')),
        'extraction_agent': meta.get('extraction_agent'),
        'extraction_method': meta.get('extraction_method'),
        'has_birth_date': False,   # filled in from web claims below
        'has_birth_place': False,
        'has_current_location': False,
        'name': (profile.get('name') or staff.get('name') or ''),
        'custodian': staff.get('custodian', ''),
    }

    # Check web claims for birth/location data; lowercase each claim_type
    # once instead of per comparison.
    for claim in web_claims:
        claim_type = claim.get('claim_type', '').lower()
        if 'birth' in claim_type and 'date' in claim_type:
            result['has_birth_date'] = True
        if 'birth' in claim_type and 'place' in claim_type:
            result['has_birth_place'] = True
        if 'location' in claim_type or 'address' in claim_type:
            result['has_current_location'] = True

    # An affiliation carrying a location also counts as a current location.
    for aff in affiliations:
        if isinstance(aff, dict) and aff.get('location'):
            result['has_current_location'] = True
            break

    return result
|
|
|
|
|
|
def main():
    """Analyze all person entity files and report statistics for PPID planning.

    Prints a human-readable report to stdout and writes a JSON summary next to
    the entity directory. Returns None.
    """
    # NOTE(review): hard-coded absolute path -- assumes this exact machine;
    # consider a CLI argument or environment override.
    entity_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')

    if not entity_dir.exists():
        print(f"ERROR: Directory not found: {entity_dir}")
        return

    # Collect all JSON files
    json_files = list(entity_dir.glob('*.json'))
    print(f"Found {len(json_files)} JSON files")

    # Group results by LinkedIn slug; collect parse failures separately.
    slug_to_files = defaultdict(list)
    all_results = []
    errors = []

    for i, filepath in enumerate(json_files):
        if i % 1000 == 0:
            print(f"Processing {i}/{len(json_files)}...")

        result = analyze_person_file(filepath)

        if 'error' in result:
            errors.append(result)
            continue

        all_results.append(result)
        slug_to_files[result['slug']].append({
            'path': result['path'],
            'timestamp': result['timestamp']
        })

    def pct(count: int) -> float:
        # Guard against ZeroDivisionError when no file parsed successfully.
        return 100 * count / len(all_results) if all_results else 0.0

    # Calculate statistics
    print("\n" + "=" * 60)
    print("PERSON ENTITY ANALYSIS REPORT")
    print("=" * 60)

    print(f"\n📁 FILE COUNTS:")
    print(f" Total JSON files: {len(json_files)}")
    print(f" Successfully parsed: {len(all_results)}")
    print(f" Parse errors: {len(errors)}")

    print(f"\n👤 UNIQUE PERSONS:")
    print(f" Unique LinkedIn slugs: {len(slug_to_files)}")

    # Count duplicates (same slug, multiple timestamped snapshots)
    duplicates = {k: v for k, v in slug_to_files.items() if len(v) > 1}
    print(f" Slugs with duplicates: {len(duplicates)}")
    print(f" Total duplicate files: {sum(len(v) - 1 for v in duplicates.values())}")

    # Heritage relevance ('is True'/'is False' deliberately exclude truthy
    # non-boolean values and None)
    heritage_true = sum(1 for r in all_results if r['heritage_relevant'] is True)
    heritage_false = sum(1 for r in all_results if r['heritage_relevant'] is False)
    heritage_none = sum(1 for r in all_results if r['heritage_relevant'] is None)

    print(f"\n🏛️ HERITAGE RELEVANCE:")
    print(f" heritage_relevant: true = {heritage_true}")
    print(f" heritage_relevant: false = {heritage_false}")
    print(f" heritage_relevant: null/missing = {heritage_none}")

    # Data completeness
    has_affiliations = sum(1 for r in all_results if r['has_affiliations'])
    has_web_claims = sum(1 for r in all_results if r['has_web_claims'])
    has_birth_date = sum(1 for r in all_results if r['has_birth_date'])
    has_birth_place = sum(1 for r in all_results if r['has_birth_place'])
    has_location = sum(1 for r in all_results if r['has_current_location'])

    print(f"\n📊 DATA COMPLETENESS:")
    print(f" Has affiliations: {has_affiliations} ({pct(has_affiliations):.1f}%)")
    print(f" Has web claims: {has_web_claims} ({pct(has_web_claims):.1f}%)")
    print(f" Has birth date: {has_birth_date} ({pct(has_birth_date):.1f}%)")
    print(f" Has birth place: {has_birth_place} ({pct(has_birth_place):.1f}%)")
    print(f" Has any location: {has_location} ({pct(has_location):.1f}%)")

    # Extraction agents
    agents = defaultdict(int)
    for r in all_results:
        agents[r.get('extraction_agent') or 'unknown'] += 1

    print(f"\n🤖 EXTRACTION AGENTS:")
    for agent, count in sorted(agents.items(), key=lambda x: -x[1]):
        print(f" {agent}: {count}")

    # Extraction methods
    methods = defaultdict(int)
    for r in all_results:
        methods[r.get('extraction_method') or 'unknown'] += 1

    print(f"\n📝 EXTRACTION METHODS:")
    for method, count in sorted(methods.items(), key=lambda x: -x[1]):
        print(f" {method}: {count}")

    # Heritage types breakdown
    heritage_types = defaultdict(int)
    for r in all_results:
        for ht in r.get('heritage_types', []):
            heritage_types[ht] += 1

    print(f"\n🏛️ HERITAGE TYPES (GLAMORCUBESFIXPHDNT):")
    type_labels = {
        'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum',
        'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown',
        'B': 'Botanical/Zoo', 'E': 'Education', 'S': 'Society', 'F': 'Feature',
        'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy',
        'D': 'Digital', 'N': 'NGO', 'T': 'Taste/Smell'
    }
    for ht, count in sorted(heritage_types.items(), key=lambda x: -x[1]):
        label = type_labels.get(ht, 'Unknown')
        print(f" {ht} ({label}): {count}")

    # Show some duplicate examples
    print(f"\n📋 DUPLICATE EXAMPLES (first 5):")
    for slug, files in list(duplicates.items())[:5]:
        print(f" {slug}:")
        for f in sorted(files, key=lambda x: x['timestamp']):
            print(f" - {f['timestamp']}")

    # Unique persons after deduplication (keeping latest)
    print(f"\n✅ AFTER DEDUPLICATION:")
    print(f" Unique persons (keeping latest per slug): {len(slug_to_files)}")

    # Keep the latest record per slug. String comparison of the timestamps is
    # safe because they share the fixed-width %Y%m%dT%H%M%SZ layout.
    slug_heritage = {}
    for r in all_results:
        slug = r['slug']
        ts = r['timestamp']
        if slug not in slug_heritage or ts > slug_heritage[slug]['timestamp']:
            slug_heritage[slug] = {
                'timestamp': ts,
                'heritage_relevant': r['heritage_relevant']
            }

    unique_heritage_true = sum(1 for v in slug_heritage.values() if v['heritage_relevant'] is True)
    unique_heritage_false = sum(1 for v in slug_heritage.values() if v['heritage_relevant'] is False)

    print(f" Heritage relevant (latest per slug): {unique_heritage_true}")
    print(f" Non-heritage (latest per slug): {unique_heritage_false}")
    print(f" Unknown (latest per slug): {len(slug_heritage) - unique_heritage_true - unique_heritage_false}")

    # PPID-eligible count
    print(f"\n🆔 PPID CANDIDATES:")
    print(f" Total unique heritage-relevant persons: {unique_heritage_true}")
    print(f" ID-class (living, no birth/death): {unique_heritage_true}")
    print(f" PID-class eligible (has birth+death): 0 (all living)")

    # Save detailed report
    report = {
        'analysis_timestamp': datetime.now().isoformat(),
        'total_files': len(json_files),
        'unique_slugs': len(slug_to_files),
        'duplicates_count': len(duplicates),
        'heritage_true': heritage_true,
        'heritage_false': heritage_false,
        'unique_heritage_true': unique_heritage_true,
        'unique_heritage_false': unique_heritage_false,
        'has_affiliations': has_affiliations,
        'has_web_claims': has_web_claims,
        'has_birth_date': has_birth_date,
        'has_location': has_location,
        'agents': dict(agents),
        'duplicate_slugs': dict(list(duplicates.items())[:50]),  # First 50
        'errors': errors[:20]  # First 20 errors
    }

    report_path = Path('/Users/kempersc/apps/glam/data/custodian/person/entity_analysis_report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Detailed report saved to: {report_path}")
|
|
|
|
|
|
# Script entry point: run the full analysis when executed directly.
if __name__ == '__main__':
    main()
|