#!/usr/bin/env python3
"""
Analyze person entity files in data/custodian/person/entity/

Tasks:
1. Count unique LinkedIn slugs vs total files
2. Identify duplicates (same person, multiple timestamps)
3. Count heritage_relevant: true vs false
4. Analyze data completeness (birth date, location, affiliations)
5. Generate statistics for PPID implementation planning
"""

import json
import os
import re
from collections import defaultdict
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime


def extract_slug_and_timestamp(filename: str) -> tuple[str, str]:
    """Extract LinkedIn slug and timestamp from filename.

    Format: {linkedin-slug}_{ISO-timestamp}.json
    Example: iris-van-meer-34329131_20251211T000000Z.json

    Returns:
        (slug, timestamp) where the slug is URL-decoded and the timestamp
        is '' when the filename has no trailing `_timestamp` component.
    """
    # Strip only a *trailing* '.json' — replace() would also remove a
    # '.json' embedded in the middle of the name.
    base = filename.removesuffix('.json')
    # Split on last underscore (timestamp is always last); slugs may
    # themselves contain underscores, hence rsplit with maxsplit=1.
    parts = base.rsplit('_', 1)
    if len(parts) == 2:
        slug = unquote(parts[0])  # URL-decode the slug
        timestamp = parts[1]
        return slug, timestamp
    return unquote(base), ''


def parse_timestamp(ts: str) -> datetime:
    """Parse ISO timestamp like 20251211T000000Z.

    Returns datetime.min for empty/malformed input, so malformed
    timestamps always sort before any well-formed one.
    """
    try:
        return datetime.strptime(ts, '%Y%m%dT%H%M%SZ')
    except ValueError:
        return datetime.min


def analyze_person_file(filepath: Path) -> dict:
    """Analyze a single person entity file.

    Returns a flat dict of presence flags and key fields, or
    {'error': ..., 'path': ...} when the file cannot be parsed as JSON.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return {'error': str(e), 'path': str(filepath)}

    # Parse the filename once (previously parsed twice, once per field).
    slug, timestamp = extract_slug_and_timestamp(filepath.name)

    # `or {}` guards against keys that are present but explicitly null,
    # which would otherwise break the chained .get() calls.
    heritage_relevance = data.get('heritage_relevance') or {}
    profile = data.get('profile_data') or {}
    fallback = data.get('fallback_data') or {}
    staff_info = data.get('source_staff_info') or {}
    extraction_meta = data.get('extraction_metadata') or {}
    affiliations = data.get('affiliations', [])
    web_claims = data.get('web_claims', [])

    # Extract key fields
    result = {
        'path': str(filepath),
        'slug': slug,
        'timestamp': timestamp,
        # Check multiple locations for heritage_relevant flag
        'heritage_relevant': (
            heritage_relevance.get('is_heritage_relevant')
            or data.get('heritage_relevant')
        ),
        'heritage_types': heritage_relevance.get('heritage_types', []),
        'has_name': bool(profile.get('name') or fallback.get('name')
                         or staff_info.get('name')),
        'has_headline': bool(profile.get('headline') or fallback.get('headline')
                             or staff_info.get('headline')),
        'has_affiliations': len(affiliations) > 0,
        'affiliation_count': len(affiliations),
        'has_web_claims': len(web_claims) > 0,
        'web_claim_count': len(web_claims),
        'has_extraction_metadata': bool(data.get('extraction_metadata')),
        'extraction_agent': extraction_meta.get('extraction_agent'),
        'extraction_method': extraction_meta.get('extraction_method'),
        'has_birth_date': False,  # Check if any claims have birth date
        'has_birth_place': False,
        'has_current_location': False,
        # NOTE: 'name' intentionally omits fallback_data (matches prior behavior).
        'name': (profile.get('name') or staff_info.get('name') or ''),
        'custodian': staff_info.get('custodian', ''),
    }

    # Check web claims for birth/location data
    for claim in web_claims:
        claim_type = claim.get('claim_type', '').lower()  # lowercase once
        if 'birth' in claim_type and 'date' in claim_type:
            result['has_birth_date'] = True
        if 'birth' in claim_type and 'place' in claim_type:
            result['has_birth_place'] = True
        if 'location' in claim_type or 'address' in claim_type:
            result['has_current_location'] = True

    # Check affiliations for location data
    for aff in affiliations:
        if isinstance(aff, dict) and aff.get('location'):
            result['has_current_location'] = True
            break

    return result


def main():
    """Scan the person entity directory, print statistics, save a JSON report."""
    entity_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
    if not entity_dir.exists():
        print(f"ERROR: Directory not found: {entity_dir}")
        return

    # Collect all JSON files
    json_files = list(entity_dir.glob('*.json'))
    print(f"Found {len(json_files)} JSON files")

    # Group by LinkedIn slug
    slug_to_files = defaultdict(list)
    all_results = []
    errors = []

    for i, filepath in enumerate(json_files):
        if i % 1000 == 0:
            print(f"Processing {i}/{len(json_files)}...")
        result = analyze_person_file(filepath)
        if 'error' in result:
            errors.append(result)
            continue
        all_results.append(result)
        slug = result['slug']
        slug_to_files[slug].append({
            'path': result['path'],
            'timestamp': result['timestamp']
        })

    # Calculate statistics
    print("\n" + "=" * 60)
    print("PERSON ENTITY ANALYSIS REPORT")
    print("=" * 60)

    print("\nšŸ“ FILE COUNTS:")
    print(f" Total JSON files: {len(json_files)}")
    print(f" Successfully parsed: {len(all_results)}")
    print(f" Parse errors: {len(errors)}")

    print("\nšŸ‘¤ UNIQUE PERSONS:")
    print(f" Unique LinkedIn slugs: {len(slug_to_files)}")

    # Count duplicates
    duplicates = {k: v for k, v in slug_to_files.items() if len(v) > 1}
    print(f" Slugs with duplicates: {len(duplicates)}")
    print(f" Total duplicate files: {sum(len(v) - 1 for v in duplicates.values())}")

    # Heritage relevance (explicit `is True/is False` so null/missing is a
    # third bucket, not folded into false)
    heritage_true = sum(1 for r in all_results if r['heritage_relevant'] is True)
    heritage_false = sum(1 for r in all_results if r['heritage_relevant'] is False)
    heritage_none = sum(1 for r in all_results if r['heritage_relevant'] is None)
    print("\nšŸ›ļø HERITAGE RELEVANCE:")
    print(f" heritage_relevant: true = {heritage_true}")
    print(f" heritage_relevant: false = {heritage_false}")
    print(f" heritage_relevant: null/missing = {heritage_none}")

    # Data completeness
    has_affiliations = sum(1 for r in all_results if r['has_affiliations'])
    has_web_claims = sum(1 for r in all_results if r['has_web_claims'])
    has_birth_date = sum(1 for r in all_results if r['has_birth_date'])
    has_birth_place = sum(1 for r in all_results if r['has_birth_place'])
    has_location = sum(1 for r in all_results if r['has_current_location'])

    # Guard against ZeroDivisionError when nothing parsed successfully
    # (empty directory or all files malformed) — counts are all 0 then.
    parsed_count = len(all_results) or 1

    print("\nšŸ“Š DATA COMPLETENESS:")
    print(f" Has affiliations: {has_affiliations} ({100*has_affiliations/parsed_count:.1f}%)")
    print(f" Has web claims: {has_web_claims} ({100*has_web_claims/parsed_count:.1f}%)")
    print(f" Has birth date: {has_birth_date} ({100*has_birth_date/parsed_count:.1f}%)")
    print(f" Has birth place: {has_birth_place} ({100*has_birth_place/parsed_count:.1f}%)")
    print(f" Has any location: {has_location} ({100*has_location/parsed_count:.1f}%)")

    # Extraction agents
    agents = defaultdict(int)
    for r in all_results:
        agent = r.get('extraction_agent') or 'unknown'
        agents[agent] += 1
    print("\nšŸ¤– EXTRACTION AGENTS:")
    for agent, count in sorted(agents.items(), key=lambda x: -x[1]):
        print(f" {agent}: {count}")

    # Extraction methods
    methods = defaultdict(int)
    for r in all_results:
        method = r.get('extraction_method') or 'unknown'
        methods[method] += 1
    print("\nšŸ“ EXTRACTION METHODS:")
    for method, count in sorted(methods.items(), key=lambda x: -x[1]):
        print(f" {method}: {count}")

    # Heritage types breakdown
    heritage_types = defaultdict(int)
    for r in all_results:
        for ht in r.get('heritage_types', []):
            heritage_types[ht] += 1
    print("\nšŸ›ļø HERITAGE TYPES (GLAMORCUBESFIXPHDNT):")
    type_labels = {
        'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum',
        'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown',
        'B': 'Botanical/Zoo', 'E': 'Education', 'S': 'Society', 'F': 'Feature',
        'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy',
        'D': 'Digital', 'N': 'NGO', 'T': 'Taste/Smell'
    }
    for ht, count in sorted(heritage_types.items(), key=lambda x: -x[1]):
        label = type_labels.get(ht, 'Unknown')
        print(f" {ht} ({label}): {count}")

    # Show some duplicate examples
    print("\nšŸ“‹ DUPLICATE EXAMPLES (first 5):")
    for slug, files in list(duplicates.items())[:5]:
        print(f" {slug}:")
        for f in sorted(files, key=lambda x: x['timestamp']):
            print(f" - {f['timestamp']}")

    # Unique persons after deduplication (keeping latest)
    print("\nāœ… AFTER DEDUPLICATION:")
    print(f" Unique persons (keeping latest per slug): {len(slug_to_files)}")

    # Heritage relevant after dedup: keep the record with the latest
    # timestamp per slug. Compare *parsed* datetimes (parse_timestamp was
    # previously dead code) so malformed/empty timestamps never win over
    # well-formed ones.
    slug_heritage = {}
    for r in all_results:
        slug = r['slug']
        ts = r['timestamp']
        if (slug not in slug_heritage
                or parse_timestamp(ts) > parse_timestamp(slug_heritage[slug]['timestamp'])):
            slug_heritage[slug] = {
                'timestamp': ts,
                'heritage_relevant': r['heritage_relevant']
            }
    unique_heritage_true = sum(1 for v in slug_heritage.values()
                               if v['heritage_relevant'] is True)
    unique_heritage_false = sum(1 for v in slug_heritage.values()
                                if v['heritage_relevant'] is False)
    print(f" Heritage relevant (latest per slug): {unique_heritage_true}")
    print(f" Non-heritage (latest per slug): {unique_heritage_false}")
    print(f" Unknown (latest per slug): {len(slug_heritage) - unique_heritage_true - unique_heritage_false}")

    # PPID-eligible count
    print("\nšŸ†” PPID CANDIDATES:")
    print(f" Total unique heritage-relevant persons: {unique_heritage_true}")
    print(f" ID-class (living, no birth/death): {unique_heritage_true}")
    print(" PID-class eligible (has birth+death): 0 (all living)")

    # Save detailed report
    report = {
        'analysis_timestamp': datetime.now().isoformat(),
        'total_files': len(json_files),
        'unique_slugs': len(slug_to_files),
        'duplicates_count': len(duplicates),
        'heritage_true': heritage_true,
        'heritage_false': heritage_false,
        'unique_heritage_true': unique_heritage_true,
        'unique_heritage_false': unique_heritage_false,
        'has_affiliations': has_affiliations,
        'has_web_claims': has_web_claims,
        'has_birth_date': has_birth_date,
        'has_location': has_location,
        'agents': dict(agents),
        'duplicate_slugs': {k: v for k, v in list(duplicates.items())[:50]},  # First 50
        'errors': errors[:20]  # First 20 errors
    }
    report_path = Path('/Users/kempersc/apps/glam/data/custodian/person/entity_analysis_report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\nšŸ’¾ Detailed report saved to: {report_path}")


if __name__ == '__main__':
    main()