glam/scripts/add_person_observations_to_custodians.py
2025-12-15 22:31:41 +01:00

198 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
Add person_observations sections to matched custodian YAML files.
This script reads the matching results from custodian_yaml_matches_final.json
and adds person_observations sections to custodian YAML files that don't have them.
Per Rule 27 (Person-Custodian Data Architecture):
- Custodian YAML files store only references and affiliation provenance
- Person entity files (data/custodian/person/entity/) store full profile data
"""
import json
import yaml
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
def load_staff_json(slug: str, parsed_dir: Path) -> dict | None:
"""Load staff JSON file for a given custodian slug."""
# Try different file patterns
patterns = [
f"{slug}_staff_*.json",
f"{slug.replace('-', '_')}_staff_*.json",
]
for pattern in patterns:
matches = list(parsed_dir.glob(pattern))
if matches:
# Get the most recent file
latest = max(matches, key=lambda p: p.stat().st_mtime)
with open(latest, 'r', encoding='utf-8') as f:
return json.load(f)
return None
def build_person_observations(staff_data: dict, custodian_slug: str) -> dict:
    """Build a person_observations structure from parsed staff JSON data.

    Per Rule 27 the custodian YAML keeps only abbreviated per-person
    records plus affiliation provenance; full profile data lives in the
    person entity files.

    Args:
        staff_data: Parsed staff JSON; 'custodian_metadata', 'staff' and
            'staff_analysis' keys are all optional.
        custodian_slug: LinkedIn company slug, used to build source URLs
            and fallback person IDs.

    Returns:
        Dict with 'observation_metadata' and 'staff' keys.
    """
    metadata = staff_data.get('custodian_metadata', {})
    staff_list = staff_data.get('staff', [])
    analysis = staff_data.get('staff_analysis', {})

    source_url = f"https://www.linkedin.com/company/{custodian_slug}/people/"
    # Take a single timestamp for the whole observation set so per-person
    # provenance can never disagree with the metadata timestamp (the
    # original called datetime.now() per entry and could straddle a second).
    retrieved_at = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Count staff entries that carry a LinkedIn profile URL.
    staff_with_linkedin = sum(1 for s in staff_list if s.get('linkedin_url'))

    observation_metadata = {
        'retrieval_agent': 'linkedin_html_parser',
        'retrieval_timestamp': retrieved_at,
        'source_url': source_url,
        'html_file': None,
        # Prefer the count reported by LinkedIn; fall back to what we parsed.
        'staff_count_total': metadata.get('associated_members', len(staff_list)),
        'staff_count_extracted': len(staff_list),
        'staff_count_with_linkedin': staff_with_linkedin,
        'staff_count_heritage_relevant': analysis.get('heritage_relevant_count', 0),
    }

    # Abbreviated staff list for the custodian YAML — full data in entity files.
    staff_observations = []
    for i, staff in enumerate(staff_list):
        person_id = staff.get('staff_id', f"{custodian_slug}_staff_{i:04d}")
        # Extract the profile slug from a .../in/<slug>/ LinkedIn URL.
        linkedin_url = staff.get('linkedin_url', '')
        linkedin_slug = ''
        if linkedin_url and '/in/' in linkedin_url:
            linkedin_slug = linkedin_url.split('/in/')[-1].rstrip('/')
        observation = {
            'person_id': person_id,
            'person_name': staff.get('name', 'Unknown'),
            'role_title': staff.get('headline', ''),
            'heritage_relevant': staff.get('heritage_relevant', False),
            'heritage_type': staff.get('heritage_type'),
            'current': True,
            'affiliation_provenance': {
                'source_url': source_url,
                'retrieved_on': retrieved_at,
                'retrieval_agent': 'linkedin_html_parser',
            },
        }
        if linkedin_url:
            observation['linkedin_profile_url'] = linkedin_url
        if linkedin_slug:
            # Entity files may not exist yet; the path is resolved in a
            # later pass, so only the placeholder key is written here.
            observation['linkedin_profile_path'] = None
        staff_observations.append(observation)

    return {
        'observation_metadata': observation_metadata,
        'staff': staff_observations,
    }
def add_person_observations_to_yaml(yaml_path: Path, person_observations: dict) -> bool:
    """Append a person_observations section to a custodian YAML file.

    Args:
        yaml_path: Path to the custodian YAML file (modified in place).
        person_observations: Structure built by build_person_observations.

    Returns:
        True when the section was written, False when the file already
        contained a person_observations key and was left untouched.
    """
    raw = yaml_path.read_text(encoding='utf-8')
    # Cheap text scan avoids re-serializing files that are already done.
    if 'person_observations:' in raw:
        print(" ⚠️ Already has person_observations, skipping")
        return False
    document = yaml.safe_load(raw)
    document['person_observations'] = person_observations
    with open(yaml_path, 'w', encoding='utf-8') as out:
        yaml.dump(document, out, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return True
def main():
    """Add person_observations sections to all matched custodian YAMLs.

    Reads match results from custodian_yaml_matches_final.json, then for
    each matched custodian loads its staff JSON, builds the observation
    structure, and writes it into the custodian YAML. Prints a per-file
    status line and a final processed/skipped/errors summary.
    """
    # Allow the repo root to be overridden so the script isn't tied to one
    # developer's machine; the default preserves the original behavior.
    base_dir = Path(os.environ.get('GLAM_BASE_DIR', '/Users/kempersc/apps/glam'))
    parsed_dir = base_dir / 'data/custodian/person/affiliated/parsed'
    matches_file = parsed_dir / 'custodian_yaml_matches_final.json'

    # Load matching results
    with open(matches_file, 'r', encoding='utf-8') as f:
        matches = json.load(f)
    matched = matches['matched']
    print(f"Found {len(matched)} matched custodians")
    print("=" * 60)

    processed = 0
    skipped = 0
    errors = 0
    for m in matched:
        custodian = m['custodian']
        yaml_file = Path(m['yaml_file'])
        slug = custodian['slug']
        name = custodian['name']
        staff_count = custodian['staff_count']
        heritage_count = custodian['heritage_count']
        print(f"\n{name} ({staff_count} staff, {heritage_count} heritage)")
        print(f" YAML: {yaml_file.name}")

        if not yaml_file.exists():
            print(f" ❌ YAML file not found")
            errors += 1
            continue

        # Pre-check here to count skips without touching the YAML writer
        # (add_person_observations_to_yaml repeats the check defensively).
        with open(yaml_file, 'r', encoding='utf-8') as f:
            if 'person_observations:' in f.read():
                print(f" ⚠️ Already has person_observations")
                skipped += 1
                continue

        staff_data = load_staff_json(slug, parsed_dir)
        if not staff_data:
            print(f" ❌ Staff JSON not found for slug: {slug}")
            errors += 1
            continue
        print(f" Staff JSON: {slug}_staff_*.json ({len(staff_data.get('staff', []))} entries)")

        person_observations = build_person_observations(staff_data, slug)
        if add_person_observations_to_yaml(yaml_file, person_observations):
            print(f" ✅ Added person_observations")
            processed += 1
        else:
            skipped += 1

    print("\n" + "=" * 60)
    print(f"Summary:")
    print(f" Processed: {processed}")
    print(f" Skipped (already done): {skipped}")
    print(f" Errors: {errors}")


if __name__ == '__main__':
    main()