glam/scripts/link_person_observations.py
2025-12-14 17:09:55 +01:00

376 lines
14 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Link Person Observations to Custodian YAML Files
This script reads parsed staff files and links them to custodian YAML files
by adding person_observations sections with references to person entity files.
Web claims are stored in person entity files (single source of truth for person data).
Custodian files only store affiliation provenance (when/how person was associated).
Usage:
python scripts/link_person_observations.py \
--staff-file data/custodian/person/affiliated/parsed/nationaal-archief_staff_*.json \
--custodian-file data/custodian/NL-ZH-DHA-A-NA.yaml \
--dry-run
Rules Applied:
- Rule 5: Additive only - never delete enriched data
- Rule 12: Person data reference pattern (file paths, not inline duplication)
- Rule 20: Person entity profiles stored individually
- Rule 26: Person Data Provenance - web claims stored in entity files
"""
import argparse
import json
import glob
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Custom YAML representer so multiline strings dump as literal blocks ('|').
def str_representer(dumper, data):
    """Represent strings via YAML literal block style when they contain newlines."""
    block_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)


yaml.add_representer(str, str_representer)
def find_entity_file(linkedin_slug: str, entity_dir: Path) -> tuple[str | None, Path | None]:
"""Find the entity file for a LinkedIn slug.
Returns tuple of (relative path from project root, absolute path) or (None, None) if not found.
"""
pattern = str(entity_dir / f"{linkedin_slug}_*.json")
matches = glob.glob(pattern)
if not matches:
return None, None
# If multiple matches, take the most recent (sorted by filename which includes timestamp)
matches.sort(reverse=True)
abs_path = Path(matches[0])
# Return path relative to project root
rel_path = os.path.relpath(matches[0], entity_dir.parent.parent.parent.parent)
return rel_path, abs_path
def load_staff_file(staff_file: Path) -> dict:
    """Read a parsed staff JSON file and return its contents as a dict."""
    return json.loads(Path(staff_file).read_text(encoding='utf-8'))
def load_custodian_file(custodian_file: Path) -> dict:
    """Parse a custodian YAML file and return the resulting mapping (safe loader)."""
    with open(custodian_file, encoding='utf-8') as fh:
        return yaml.safe_load(fh)
def _append_web_claims(entity_data: dict, staff_member: dict, timestamp: str) -> None:
    """Append full_name / role_title web claims for a new LinkedIn source.

    Idempotent: a source URL that already contributed claims is skipped
    (Rule 5: additive only). Mutates entity_data in place.
    """
    if 'web_claims' not in entity_data:
        entity_data['web_claims'] = []
    source_url = staff_member.get('linkedin_profile_url', '')
    existing_sources = {c.get('source_url') for c in entity_data['web_claims']}
    if not source_url or source_url in existing_sources:
        return
    # A name claim is always recorded for a new source.
    entity_data['web_claims'].append({
        'claim_type': 'full_name',
        'claim_value': staff_member['name'],
        'source_url': source_url,
        'retrieved_on': timestamp,
        'retrieval_agent': 'linkedin_html_parser',
    })
    # The role/headline claim only when the profile exposes a headline.
    if staff_member.get('headline'):
        entity_data['web_claims'].append({
            'claim_type': 'role_title',
            'claim_value': staff_member['headline'],
            'source_url': source_url,
            'retrieved_on': timestamp,
            'retrieval_agent': 'linkedin_html_parser',
        })


def _append_affiliation(
    entity_data: dict,
    staff_member: dict,
    custodian_name: str,
    custodian_slug: str,
    timestamp: str,
) -> None:
    """Append an affiliation record unless the (custodian, role) pair already exists.

    Mutates entity_data in place.
    """
    if 'affiliations' not in entity_data:
        entity_data['affiliations'] = []
    # Dedupe key: same custodian AND same role title.
    existing_affiliations = {
        (a.get('custodian_slug'), a.get('role_title'))
        for a in entity_data['affiliations']
    }
    if (custodian_slug, staff_member.get('headline', '')) in existing_affiliations:
        return
    entity_data['affiliations'].append({
        'custodian_name': custodian_name,
        'custodian_slug': custodian_slug,
        'role_title': staff_member.get('headline', ''),
        'heritage_relevant': staff_member.get('heritage_relevant', False),
        'heritage_type': staff_member.get('heritage_type'),
        'current': True,
        'observed_on': timestamp,
        'source_url': f"https://www.linkedin.com/company/{custodian_slug}/people/",
    })


def update_entity_file_with_claims(
    entity_path: Path,
    staff_member: dict,
    custodian_name: str,
    custodian_slug: str,
    timestamp: str,
    dry_run: bool = True
) -> bool:
    """Add web_claims and an affiliation to a person entity JSON file.

    Args:
        entity_path: Absolute path to the entity file; may be None/missing.
        staff_member: Parsed staff record (expects 'name'; 'headline' and
            'linkedin_profile_url' are optional).
        custodian_name: Display name recorded in the affiliation.
        custodian_slug: Slug used for the LinkedIn URL and the dedupe key.
        timestamp: ISO-8601 UTC timestamp for provenance fields.
        dry_run: When True, compute the update but do not write to disk.

    Returns:
        True if the update succeeded (or would have, in dry-run mode).
    """
    if not entity_path or not entity_path.exists():
        return False
    try:
        with open(entity_path, 'r', encoding='utf-8') as f:
            entity_data = json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        print(f" WARNING: Could not read entity file {entity_path}: {e}")
        return False
    _append_web_claims(entity_data, staff_member, timestamp)
    _append_affiliation(entity_data, staff_member, custodian_name, custodian_slug, timestamp)
    if dry_run:
        # Report success without touching the file.
        return True
    try:
        with open(entity_path, 'w', encoding='utf-8') as f:
            json.dump(entity_data, f, indent=2, ensure_ascii=False)
        return True
    except OSError as e:
        print(f" WARNING: Could not write entity file {entity_path}: {e}")
        return False
def generate_person_observations(
    staff_data: dict,
    entity_dir: Path,
    custodian_slug: str,
    custodian_name: str,
    dry_run: bool = True
) -> dict:
    """Generate the person_observations section from parsed staff data.

    Web claims are stored in person entity files (single source of truth,
    Rule 26); the custodian file receives only affiliation provenance.

    Args:
        staff_data: Parsed staff JSON with 'custodian_metadata' and 'staff'.
        entity_dir: Directory containing person entity JSON files.
        custodian_slug: Slug used to build LinkedIn company URLs.
        custodian_name: Human-readable custodian name for affiliations.
        dry_run: Passed through to entity-file updates; no disk writes when True.

    Returns:
        Dict with 'observation_metadata' and a 'staff' list.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    metadata = staff_data['custodian_metadata']
    staff_list = staff_data['staff']
    observations = {
        'observation_metadata': {
            'retrieval_agent': 'linkedin_html_parser',
            'retrieval_timestamp': timestamp,
            'source_url': f"https://www.linkedin.com/company/{metadata.get('custodian_slug', custodian_slug)}/people/",
            'html_file': None,  # Not archived for this extraction
            'staff_count_total': metadata.get('associated_members', len(staff_list)),
            'staff_count_extracted': len(staff_list),
            'staff_count_with_linkedin': sum(1 for s in staff_list if s.get('linkedin_slug')),
            'staff_count_with_entity_file': 0,  # Updated after the loop below
        },
        'staff': []
    }
    entity_file_count = 0
    entity_files_updated = 0
    for staff_member in staff_list:
        # BUGFIX: check anonymous/missing names FIRST. Entries without a
        # 'name' key previously raised KeyError on the company-name check.
        if staff_member.get('name_type') == 'anonymous' or not staff_member.get('name'):
            continue
        # Skip if name looks like the company name (first entry often is)
        if staff_member['name'] == metadata.get('custodian_name'):
            continue
        # Basic person entry with affiliation provenance only (Rule 12:
        # entity files are referenced by path, never duplicated inline).
        person_entry = {
            'person_id': staff_member['staff_id'],
            'person_name': staff_member['name'],
            'role_title': staff_member.get('headline', ''),
            'heritage_relevant': staff_member.get('heritage_relevant', False),
            'heritage_type': staff_member.get('heritage_type'),
            'current': True,  # From current LinkedIn data
            # Affiliation provenance
            'affiliation_provenance': {
                'source_url': f"https://www.linkedin.com/company/{custodian_slug}/people/",
                'retrieved_on': timestamp,
                'retrieval_agent': 'linkedin_html_parser',
            }
        }
        # Add LinkedIn URL if available
        if staff_member.get('linkedin_profile_url'):
            person_entry['linkedin_profile_url'] = staff_member['linkedin_profile_url']
        # Find and link entity file if a LinkedIn slug exists
        if staff_member.get('linkedin_slug'):
            rel_path, abs_path = find_entity_file(staff_member['linkedin_slug'], entity_dir)
            if rel_path and abs_path:
                person_entry['linkedin_profile_path'] = rel_path
                entity_file_count += 1
                # Push web_claims and the affiliation into the entity file
                if update_entity_file_with_claims(
                    abs_path,
                    staff_member,
                    custodian_name,
                    custodian_slug,
                    timestamp,
                    dry_run
                ):
                    entity_files_updated += 1
        observations['staff'].append(person_entry)
    observations['observation_metadata']['staff_count_with_entity_file'] = entity_file_count
    observations['observation_metadata']['entity_files_updated'] = entity_files_updated
    return observations
def update_custodian_file(
    custodian_file: Path,
    person_observations: dict,
    dry_run: bool = True
) -> bool:
    """Update the custodian YAML file with a person_observations section.

    Prompts for confirmation before overwriting an existing
    person_observations section (Rule 5: additive only).

    Args:
        custodian_file: Path to the custodian YAML file.
        person_observations: Section produced by generate_person_observations.
        dry_run: When True, print the intended change without writing.

    Returns:
        True if the file was updated (or would be, in dry-run mode);
        False when the user declines to overwrite.
    """
    # Parse straight from the stream (the raw text was previously read into
    # an unused local variable).
    with open(custodian_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # Guard against clobbering an existing section without confirmation.
    if 'person_observations' in data:
        print(f" WARNING: person_observations already exists in {custodian_file}")
        print(f" Existing staff count: {len(data['person_observations'].get('staff', []))}")
        print(f" New staff count: {len(person_observations['staff'])}")
        if not dry_run:
            response = input(" Overwrite? [y/N]: ").strip().lower()
            if response != 'y':
                print(" Skipping.")
                return False
    data['person_observations'] = person_observations
    if dry_run:
        print(f"\n DRY RUN - Would update {custodian_file}")
        print(f" Staff entries: {len(person_observations['staff'])}")
        print(f" With entity files: {person_observations['observation_metadata']['staff_count_with_entity_file']}")
        print(f" Entity files would be updated: {person_observations['observation_metadata'].get('entity_files_updated', 0)}")
        return True
    # Write the merged document back (multiline strings use the '|' style
    # via the module-level representer).
    with open(custodian_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120)
    print(f" Updated {custodian_file}")
    print(f" Staff entries: {len(person_observations['staff'])}")
    print(f" With entity files: {person_observations['observation_metadata']['staff_count_with_entity_file']}")
    print(f" Entity files updated: {person_observations['observation_metadata'].get('entity_files_updated', 0)}")
    return True
def main():
    """CLI entry point: link parsed staff observations into a custodian YAML file."""
    parser = argparse.ArgumentParser(
        description='Link person observations to custodian YAML files'
    )
    parser.add_argument(
        '--staff-file',
        required=True,
        help='Path to parsed staff JSON file (supports glob patterns)'
    )
    parser.add_argument(
        '--custodian-file',
        required=True,
        help='Path to custodian YAML file to update'
    )
    parser.add_argument(
        '--entity-dir',
        default='data/custodian/person/entity',
        help='Directory containing person entity files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    args = parser.parse_args()

    # --entity-dir is resolved relative to the project root (the parent of
    # the scripts/ directory), not the current working directory.
    project_root = Path(__file__).parent.parent

    # Find the staff file (supports glob patterns).
    staff_files = glob.glob(args.staff_file)
    if not staff_files:
        print(f"ERROR: No staff file found matching: {args.staff_file}")
        sys.exit(1)
    # BUGFIX: glob order is OS-dependent. Sort descending so the most recent
    # timestamped file is chosen deterministically (same convention as
    # find_entity_file).
    staff_files.sort(reverse=True)
    staff_file = Path(staff_files[0])
    if len(staff_files) > 1:
        print(f" Note: Multiple staff files found, using: {staff_file}")
    # NOTE(review): --custodian-file is resolved relative to the CWD while
    # --entity-dir is project-root-relative; preserved for compatibility.
    custodian_file = Path(args.custodian_file)
    entity_dir = project_root / args.entity_dir

    # Validate all inputs before doing any work.
    if not staff_file.exists():
        print(f"ERROR: Staff file not found: {staff_file}")
        sys.exit(1)
    if not custodian_file.exists():
        print(f"ERROR: Custodian file not found: {custodian_file}")
        sys.exit(1)
    if not entity_dir.exists():
        print(f"ERROR: Entity directory not found: {entity_dir}")
        sys.exit(1)

    print(f"Processing: {staff_file.name}")
    print(f"Target: {custodian_file.name}")
    print(f"Entity dir: {entity_dir}")

    # Load staff data and report basic stats.
    staff_data = load_staff_file(staff_file)
    custodian_slug = staff_data['custodian_metadata'].get('custodian_slug', '')
    custodian_name = staff_data['custodian_metadata'].get('custodian_name', '')
    print(f"\nStaff file stats:")
    print(f" Total staff: {len(staff_data['staff'])}")
    print(f" With LinkedIn: {sum(1 for s in staff_data['staff'] if s.get('linkedin_slug'))}")

    # Generate person_observations (also updates entity files on disk).
    observations = generate_person_observations(
        staff_data, entity_dir, custodian_slug, custodian_name, dry_run=args.dry_run
    )
    # Finally, merge the section into the custodian YAML file.
    update_custodian_file(custodian_file, observations, dry_run=args.dry_run)


if __name__ == '__main__':
    main()