# glam/scripts/analyze_person_entities.py
# 2026-01-09 18:26:58 +01:00
#
# 284 lines
# 11 KiB
# Python
#
#!/usr/bin/env python3
"""
Analyze person entity files in data/custodian/person/entity/
Tasks:
1. Count unique LinkedIn slugs vs total files
2. Identify duplicates (same person, multiple timestamps)
3. Count heritage_relevant: true vs false
4. Analyze data completeness (birth date, location, affiliations)
5. Generate statistics for PPID implementation planning
"""
import json
import os
import re
from collections import defaultdict
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime
def extract_slug_and_timestamp(filename: str) -> tuple[str, str]:
    """Extract the LinkedIn slug and capture timestamp from an entity filename.

    Expected format: {linkedin-slug}_{ISO-timestamp}.json
    Example: iris-van-meer-34329131_20251211T000000Z.json

    Args:
        filename: Bare filename (not a full path), possibly URL-encoded.

    Returns:
        ``(slug, timestamp)`` where the slug is URL-decoded and the
        timestamp is ``''`` when no underscore-separated suffix exists.
    """
    # removesuffix strips only a trailing '.json'; the previous
    # str.replace('.json', '') would also have mangled a slug that
    # happened to contain '.json' in the middle.
    base = filename.removesuffix('.json')
    # The timestamp is always the last underscore-separated component.
    slug, sep, timestamp = base.rpartition('_')
    if sep:
        return unquote(slug), timestamp
    return unquote(base), ''
def parse_timestamp(ts: str) -> datetime:
    """Parse a compact ISO timestamp such as ``20251211T000000Z``.

    Returns ``datetime.min`` for malformed input, so unparseable
    timestamps sort before every valid one.
    """
    fmt = '%Y%m%dT%H%M%SZ'
    try:
        parsed = datetime.strptime(ts, fmt)
    except ValueError:
        parsed = datetime.min
    return parsed
def analyze_person_file(filepath: Path) -> dict:
    """Analyze a single person entity file.

    Args:
        filepath: Path to a ``{slug}_{timestamp}.json`` entity file.

    Returns:
        A flat stats dict (slug, timestamp, heritage flags, completeness
        booleans, counts), or ``{'error': ..., 'path': ...}`` when the
        file cannot be parsed or is not a JSON object.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return {'error': str(e), 'path': str(filepath)}
    if not isinstance(data, dict):
        # Valid JSON whose top level is a list/scalar would otherwise
        # crash the whole run with an uncaught AttributeError below.
        return {'error': 'top-level JSON is not an object', 'path': str(filepath)}

    def section(key: str) -> dict:
        """Return a nested dict section, tolerating explicit null values.

        data.get(key, {}) is not enough: an explicit ``"key": null`` in
        the JSON yields None, and None.get(...) raises AttributeError.
        """
        value = data.get(key)
        return value if isinstance(value, dict) else {}

    profile = section('profile_data')
    fallback = section('fallback_data')
    staff = section('source_staff_info')
    relevance = section('heritage_relevance')
    meta = section('extraction_metadata')
    affiliations = data.get('affiliations') or []
    web_claims = data.get('web_claims') or []

    # Parse the filename once instead of twice.
    slug, timestamp = extract_slug_and_timestamp(filepath.name)

    result = {
        'path': str(filepath),
        'slug': slug,
        'timestamp': timestamp,
        # Check multiple locations for the heritage_relevant flag; the
        # nested flag wins unless it is falsy.
        'heritage_relevant': (
            relevance.get('is_heritage_relevant') or
            data.get('heritage_relevant')
        ),
        'heritage_types': relevance.get('heritage_types', []),
        'has_name': bool(profile.get('name') or
                         fallback.get('name') or
                         staff.get('name')),
        'has_headline': bool(profile.get('headline') or
                             fallback.get('headline') or
                             staff.get('headline')),
        'has_affiliations': len(affiliations) > 0,
        'affiliation_count': len(affiliations),
        'has_web_claims': len(web_claims) > 0,
        'web_claim_count': len(web_claims),
        'has_extraction_metadata': bool(data.get('extraction_metadata')),
        'extraction_agent': meta.get('extraction_agent'),
        'extraction_method': meta.get('extraction_method'),
        'has_birth_date': False,  # set below from web claims
        'has_birth_place': False,
        'has_current_location': False,
        'name': (profile.get('name') or staff.get('name') or ''),
        'custodian': staff.get('custodian', ''),
    }

    # Check web claims for birth/location data.
    for claim in web_claims:
        if not isinstance(claim, dict):
            continue
        # Lowercase once; the original recomputed .lower() up to 6x per claim.
        claim_type = claim.get('claim_type', '').lower()
        if 'birth' in claim_type and 'date' in claim_type:
            result['has_birth_date'] = True
        if 'birth' in claim_type and 'place' in claim_type:
            result['has_birth_place'] = True
        if 'location' in claim_type or 'address' in claim_type:
            result['has_current_location'] = True

    # Check affiliations for location data.
    for aff in affiliations:
        if isinstance(aff, dict) and aff.get('location'):
            result['has_current_location'] = True
            break
    return result
def main(entity_dir='/Users/kempersc/apps/glam/data/custodian/person/entity',
         report_path='/Users/kempersc/apps/glam/data/custodian/person/entity_analysis_report.json'):
    """Analyze all person entity files and print/save a summary report.

    Args:
        entity_dir: Directory holding ``{slug}_{timestamp}.json`` entity
            files. Defaults to the original hard-coded location so the
            CLI entry point is unaffected.
        report_path: Destination for the machine-readable JSON report.
    """
    entity_dir = Path(entity_dir)
    report_path = Path(report_path)
    if not entity_dir.exists():
        print(f"ERROR: Directory not found: {entity_dir}")
        return

    # Collect all JSON files
    json_files = list(entity_dir.glob('*.json'))
    print(f"Found {len(json_files)} JSON files")

    # Group by LinkedIn slug
    slug_to_files = defaultdict(list)
    all_results = []
    errors = []
    for i, filepath in enumerate(json_files):
        if i % 1000 == 0:
            print(f"Processing {i}/{len(json_files)}...")
        result = analyze_person_file(filepath)
        if 'error' in result:
            errors.append(result)
            continue
        all_results.append(result)
        slug_to_files[result['slug']].append({
            'path': result['path'],
            'timestamp': result['timestamp']
        })

    def pct(count):
        """Format count as a percentage of parsed files.

        Guard: len(all_results) can be 0 (empty directory or every file
        failing to parse), which previously raised ZeroDivisionError.
        """
        total = len(all_results)
        return f"{100 * count / total:.1f}%" if total else "n/a"

    # Calculate statistics
    print("\n" + "=" * 60)
    print("PERSON ENTITY ANALYSIS REPORT")
    print("=" * 60)
    print(f"\n📁 FILE COUNTS:")
    print(f" Total JSON files: {len(json_files)}")
    print(f" Successfully parsed: {len(all_results)}")
    print(f" Parse errors: {len(errors)}")
    print(f"\n👤 UNIQUE PERSONS:")
    print(f" Unique LinkedIn slugs: {len(slug_to_files)}")

    # Duplicates: same slug captured at multiple timestamps.
    duplicates = {k: v for k, v in slug_to_files.items() if len(v) > 1}
    print(f" Slugs with duplicates: {len(duplicates)}")
    print(f" Total duplicate files: {sum(len(v) - 1 for v in duplicates.values())}")

    # Heritage relevance (tri-state: True / False / null-or-missing)
    heritage_true = sum(1 for r in all_results if r['heritage_relevant'] is True)
    heritage_false = sum(1 for r in all_results if r['heritage_relevant'] is False)
    heritage_none = sum(1 for r in all_results if r['heritage_relevant'] is None)
    print(f"\n🏛️ HERITAGE RELEVANCE:")
    print(f" heritage_relevant: true = {heritage_true}")
    print(f" heritage_relevant: false = {heritage_false}")
    print(f" heritage_relevant: null/missing = {heritage_none}")

    # Data completeness
    has_affiliations = sum(1 for r in all_results if r['has_affiliations'])
    has_web_claims = sum(1 for r in all_results if r['has_web_claims'])
    has_birth_date = sum(1 for r in all_results if r['has_birth_date'])
    has_birth_place = sum(1 for r in all_results if r['has_birth_place'])
    has_location = sum(1 for r in all_results if r['has_current_location'])
    print(f"\n📊 DATA COMPLETENESS:")
    print(f" Has affiliations: {has_affiliations} ({pct(has_affiliations)})")
    print(f" Has web claims: {has_web_claims} ({pct(has_web_claims)})")
    print(f" Has birth date: {has_birth_date} ({pct(has_birth_date)})")
    print(f" Has birth place: {has_birth_place} ({pct(has_birth_place)})")
    print(f" Has any location: {has_location} ({pct(has_location)})")

    # Extraction agents
    agents = defaultdict(int)
    for r in all_results:
        agents[r.get('extraction_agent') or 'unknown'] += 1
    print(f"\n🤖 EXTRACTION AGENTS:")
    for agent, count in sorted(agents.items(), key=lambda x: -x[1]):
        print(f" {agent}: {count}")

    # Extraction methods
    methods = defaultdict(int)
    for r in all_results:
        methods[r.get('extraction_method') or 'unknown'] += 1
    print(f"\n📝 EXTRACTION METHODS:")
    for method, count in sorted(methods.items(), key=lambda x: -x[1]):
        print(f" {method}: {count}")

    # Heritage types breakdown
    heritage_types = defaultdict(int)
    for r in all_results:
        for ht in r.get('heritage_types', []):
            heritage_types[ht] += 1
    print(f"\n🏛️ HERITAGE TYPES (GLAMORCUBESFIXPHDNT):")
    type_labels = {
        'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum',
        'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown',
        'B': 'Botanical/Zoo', 'E': 'Education', 'S': 'Society', 'F': 'Feature',
        'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy',
        'D': 'Digital', 'N': 'NGO', 'T': 'Taste/Smell'
    }
    for ht, count in sorted(heritage_types.items(), key=lambda x: -x[1]):
        label = type_labels.get(ht, 'Unknown')
        print(f" {ht} ({label}): {count}")

    # Show some duplicate examples
    print(f"\n📋 DUPLICATE EXAMPLES (first 5):")
    for slug, files in list(duplicates.items())[:5]:
        print(f" {slug}:")
        for entry in sorted(files, key=lambda x: x['timestamp']):
            print(f" - {entry['timestamp']}")

    # Unique persons after deduplication (keeping latest)
    print(f"\n✅ AFTER DEDUPLICATION:")
    print(f" Unique persons (keeping latest per slug): {len(slug_to_files)}")

    # Keep only the latest record per slug to decide heritage relevance.
    # Timestamps are fixed-width YYYYMMDDTHHMMSSZ, so lexical '>' is
    # chronological here.
    slug_heritage = {}
    for r in all_results:
        slug, ts = r['slug'], r['timestamp']
        if slug not in slug_heritage or ts > slug_heritage[slug]['timestamp']:
            slug_heritage[slug] = {
                'timestamp': ts,
                'heritage_relevant': r['heritage_relevant']
            }
    unique_heritage_true = sum(1 for v in slug_heritage.values() if v['heritage_relevant'] is True)
    unique_heritage_false = sum(1 for v in slug_heritage.values() if v['heritage_relevant'] is False)
    print(f" Heritage relevant (latest per slug): {unique_heritage_true}")
    print(f" Non-heritage (latest per slug): {unique_heritage_false}")
    print(f" Unknown (latest per slug): {len(slug_heritage) - unique_heritage_true - unique_heritage_false}")

    # PPID-eligible count
    print(f"\n🆔 PPID CANDIDATES:")
    print(f" Total unique heritage-relevant persons: {unique_heritage_true}")
    print(f" ID-class (living, no birth/death): {unique_heritage_true}")
    print(f" PID-class eligible (has birth+death): 0 (all living)")

    # Save detailed report
    report = {
        'analysis_timestamp': datetime.now().isoformat(),
        'total_files': len(json_files),
        'unique_slugs': len(slug_to_files),
        'duplicates_count': len(duplicates),
        'heritage_true': heritage_true,
        'heritage_false': heritage_false,
        'unique_heritage_true': unique_heritage_true,
        'unique_heritage_false': unique_heritage_false,
        'has_affiliations': has_affiliations,
        'has_web_claims': has_web_claims,
        'has_birth_date': has_birth_date,
        'has_location': has_location,
        'agents': dict(agents),
        'duplicate_slugs': {k: v for k, v in list(duplicates.items())[:50]},  # First 50
        'errors': errors[:20]  # First 20 errors
    }
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\n💾 Detailed report saved to: {report_path}")
# Allow importing this module (e.g. for testing) without running the analysis.
if __name__ == '__main__':
    main()