glam/scripts/generate_linkedin_custodian_yaml.py
2025-12-16 20:27:39 +01:00

471 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Generate custodian YAML files from LinkedIn parsed data.
Merges data from:
- Parsed staff JSON files (data/custodian/person/affiliated/parsed/*.json)
- About page data (website URLs, industry, employee count)
- URL verification results (alive/dead/redirect status)
- URL corrections (resolved dead links)
Output: data/custodian/linkedin/{slug}.yaml
Usage:
python scripts/generate_linkedin_custodian_yaml.py
python scripts/generate_linkedin_custodian_yaml.py --dry-run
python scripts/generate_linkedin_custodian_yaml.py --limit 10
"""
import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
from typing import Any
import argparse
# Use ruamel.yaml for better YAML output with preserved order;
# fall back to PyYAML when ruamel is not installed. USE_RUAMEL tells
# write_yaml()/main() which dumper is available.
try:
    from ruamel.yaml import YAML
    yaml = YAML()
    yaml.default_flow_style = False  # block style, not inline mappings
    yaml.allow_unicode = True        # keep non-ASCII characters readable
    yaml.width = 120                 # wrap long scalars at 120 columns
    USE_RUAMEL = True
except ImportError:
    import yaml as pyyaml
    USE_RUAMEL = False
# Repository root (this script lives one level below it, in scripts/).
BASE_DIR = Path(__file__).parent.parent
# Input locations: parsed staff JSON plus timestamped snapshot files for
# about-page data, URL verification results, and manual URL corrections.
DATA_DIR = BASE_DIR / "data/custodian/person/affiliated"
PARSED_DIR = DATA_DIR / "parsed"
ABOUT_DATA_FILE = DATA_DIR / "about_data/about_data_20251216T152238Z.json"
VERIFICATION_FILE = DATA_DIR / "verified_links/verification_results_20251216T152538Z.json"
CORRECTIONS_FILE = DATA_DIR / "verified_links/corrected_urls_20251216T160000Z.json"
# Output: one YAML file per custodian slug.
OUTPUT_DIR = BASE_DIR / "data/custodian/linkedin"
# Industry to institution type mapping: LinkedIn industry label ->
# single-letter institution type code(s). "M" is museum (used as the
# default in infer_institution_type); the remaining codes are presumably
# L=library, A=archive, etc. — TODO confirm against the type schema.
INDUSTRY_TO_TYPE = {
    "Museums, Historical Sites, and Zoos": ["M"],
    "Museums and Institutions": ["M"],
    "Libraries": ["L"],
    "Archives": ["A"],
    "Research Services": ["R"],
    "Higher Education": ["E"],
    "Non-profit Organizations": ["F"],
    "Government Administration": ["O"],
    "Performing Arts": ["C"],
    "Fine Art": ["G"],
    "Civic and Social Organizations": ["S"],
    "Environmental Services": ["B"],
    "Religious Institutions": ["H"],
}
def load_parsed_staff() -> dict[str, dict]:
    """Load the most recent parsed staff JSON per custodian slug.

    Scans PARSED_DIR for files named ``{slug}_staff_{timestamp}.json``
    and, for each slug, loads only the file with the latest timestamp.

    Returns:
        Dict keyed by custodian slug with the parsed JSON payload of the
        most recent staff file. Unreadable or missing files are skipped
        with a warning on stderr.
    """
    parsed_files: dict[str, list[tuple[str, Path]]] = defaultdict(list)
    # Extract slug from filename: {slug}_staff_{timestamp}.json
    # fullmatch (rather than the original match) rejects filenames with
    # trailing characters after the timestamped .json suffix.
    pattern = re.compile(r"(.+)_staff_(\d{8}T\d{6}Z)\.json")
    for json_file in PARSED_DIR.glob("*.json"):
        match = pattern.fullmatch(json_file.name)
        if match:
            slug, timestamp = match.group(1), match.group(2)
            parsed_files[slug].append((timestamp, json_file))
    result = {}
    for slug, files in parsed_files.items():
        # Timestamps are zero-padded ISO-basic strings, so lexicographic
        # max is chronological max — no need to sort the whole list.
        _, most_recent = max(files)
        try:
            with open(most_recent, 'r', encoding='utf-8') as f:
                result[slug] = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError) as e:
            print(f"WARNING: Failed to load {most_recent}: {e}", file=sys.stderr)
    return result
def load_about_data() -> dict[str, dict]:
    """Load about-page data and return it indexed by custodian_slug.

    Handles three payload shapes: a bare list of entries, a dict with
    the entries nested under a 'data' key, or a dict that is already
    keyed by slug (returned unchanged). A missing snapshot file yields
    an empty dict after a stderr warning.
    """
    if not ABOUT_DATA_FILE.exists():
        print(f"WARNING: About data file not found: {ABOUT_DATA_FILE}", file=sys.stderr)
        return {}
    with open(ABOUT_DATA_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    if isinstance(payload, dict):
        if 'data' not in payload:
            # Assumed to be keyed by slug already — pass through as-is.
            return payload
        records = payload['data']
    elif isinstance(payload, list):
        records = payload
    else:
        records = []
    # Index by slug, dropping entries without one.
    return {
        record['custodian_slug']: record
        for record in records
        if record.get('custodian_slug')
    }
def load_verification_results() -> dict[str, dict]:
    """Load URL verification results, indexed by custodian_name.

    Returns an empty dict (with a stderr warning) when the snapshot
    file is absent.
    """
    if not VERIFICATION_FILE.exists():
        print(f"WARNING: Verification file not found: {VERIFICATION_FILE}", file=sys.stderr)
        return {}
    with open(VERIFICATION_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    indexed = {}
    for record in payload.get('results', []):
        indexed[record['custodian_name']] = record
    return indexed
def load_corrections() -> tuple[dict[str, dict], set[str], set[str]]:
    """Load URL corrections and the special-case custodian lists.

    Returns:
        A 3-tuple of:
        - corrections: custodian_name -> correction record
        - permanently_closed: names of custodians that no longer operate
        - no_website: names with no usable website, including those whose
          only listing is a defunct Google Business page
    """
    if not CORRECTIONS_FILE.exists():
        print(f"WARNING: Corrections file not found: {CORRECTIONS_FILE}", file=sys.stderr)
        return {}, set(), set()
    with open(CORRECTIONS_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    corrections = {item['custodian_name']: item for item in payload.get('corrections', [])}
    permanently_closed = {item['custodian_name'] for item in payload.get('permanently_closed', [])}
    no_website = {item['custodian_name'] for item in payload.get('no_website', [])}
    # Defunct Google Business listings count as "no usable website" too.
    no_website.update(
        item['custodian_name'] for item in payload.get('google_business_defunct', [])
    )
    return corrections, permanently_closed, no_website
def infer_institution_type(staff_data: dict, about_data: dict) -> list[str]:
    """Infer institution type code(s) from industry and staff heritage types.

    Combines two signals:
    - the LinkedIn industry label (about page first, then staff
      metadata), mapped through INDUSTRY_TO_TYPE;
    - the single most common heritage type among the custodian's staff.

    Args:
        staff_data: Parsed staff JSON (may carry 'custodian_metadata'
            and 'staff_analysis' keys).
        about_data: About-page entry (may carry an 'industry' key).

    Returns:
        Sorted list of single-letter type codes; ["M"] (museum) when no
        signal is available.
    """
    types_found: set[str] = set()
    # Industry label: about page takes precedence over staff metadata.
    industry = (about_data.get('industry') or
                staff_data.get('custodian_metadata', {}).get('industry'))
    if industry and industry in INDUSTRY_TO_TYPE:
        types_found.update(INDUSTRY_TO_TYPE[industry])
    # Staff signal: take the most common heritage type. max() replaces
    # the original sort-and-take-first; ties resolve to the
    # first-inserted key either way.
    heritage_types = staff_data.get('staff_analysis', {}).get('staff_by_heritage_type', {})
    if heritage_types:
        top_type, _ = max(heritage_types.items(), key=lambda item: item[1])
        types_found.add(top_type)
    # Default to M (Museum) if nothing found.
    if not types_found:
        types_found.add("M")
    return sorted(types_found)
def determine_website_status(
custodian_name: str,
about_data: dict,
verification: dict,
corrections: dict,
no_website: set
) -> tuple[str | None, str, str | None]:
"""Determine final website URL and status.
Returns:
- website_url: Final URL to use (may be corrected)
- status: 'verified', 'corrected', 'unverified', 'dead', 'none'
- original_url: Original URL if corrected, else None
"""
original_url = about_data.get('website_url')
# Check if custodian has no website
if custodian_name in no_website:
return None, 'none', original_url
# Check corrections first
if custodian_name in corrections:
correction = corrections[custodian_name]
return correction['corrected_url'], 'corrected', correction.get('original_url', original_url)
# No original URL from about page
if not original_url:
return None, 'none', None
# Check verification status
if custodian_name in verification:
v = verification[custodian_name]
if v.get('is_alive'):
final_url = v.get('final_url') or original_url
return final_url, 'verified', None if final_url == original_url else original_url
else:
# Dead link not in corrections
return original_url, 'dead', None
# Not verified
return original_url, 'unverified', None
def build_heritage_staff_list(staff_data: dict) -> list[dict]:
    """Collect heritage-relevant staff members as serializable dicts.

    Each entry carries name and headline, plus a LinkedIn URL (from the
    explicit profile URL, or rebuilt from the slug) and the heritage
    type when available. Staff not flagged heritage_relevant are
    skipped.
    """
    entries = []
    for member in staff_data.get('staff', []):
        if not member.get('heritage_relevant'):
            continue
        record = {
            'name': member.get('name'),
            'headline': member.get('headline'),
        }
        profile_url = member.get('linkedin_profile_url')
        if profile_url:
            record['linkedin_url'] = profile_url
        elif member.get('linkedin_slug'):
            # Reconstruct the profile URL from the slug.
            record['linkedin_url'] = f"https://www.linkedin.com/in/{member['linkedin_slug']}"
        heritage_type = member.get('heritage_type')
        if heritage_type:
            record['heritage_type'] = heritage_type
        entries.append(record)
    return entries
def generate_custodian_yaml(
    slug: str,
    staff_data: dict,
    about_data: dict,
    verification: dict,
    corrections: dict,
    no_website: set
) -> dict[str, Any]:
    """Generate YAML structure for a single custodian.

    Merges parsed staff data, about-page data, URL verification results
    and manual corrections into one dict; insertion order determines the
    key order of the emitted YAML.

    Args:
        slug: LinkedIn company slug for the custodian.
        staff_data: Parsed staff JSON for this custodian.
        about_data: About-page entry (may be empty).
        verification: Verification results keyed by custodian name.
        corrections: Manual URL corrections keyed by custodian name.
        no_website: Names of custodians known to have no website.

    Returns:
        Dict ready to be dumped as YAML.
    """
    metadata = staff_data.get('custodian_metadata', {})
    source_meta = staff_data.get('source_metadata', {})
    staff_analysis = staff_data.get('staff_analysis', {})
    # Fall back from custodian_name to name to the slug itself.
    custodian_name = metadata.get('custodian_name') or metadata.get('name', slug)
    # Determine website (may be corrected, dead, or absent)
    website_url, website_status, original_url = determine_website_status(
        custodian_name, about_data, verification, corrections, no_website
    )
    # Build YAML structure
    result = {
        # Temporary identifier — presumably pending assignment of a
        # permanent GHCID; TODO confirm the "_temp" convention.
        'ghcid_temp': f"ghcid:linkedin:{slug}",
        'name': custodian_name,
        'linkedin_slug': slug,
    }
    # Website info (omitted entirely when no usable URL was resolved)
    if website_url:
        result['website'] = website_url
        result['website_status'] = website_status
        if original_url:
            # Pre-correction / pre-redirect URL, kept for traceability.
            result['website_original'] = original_url
    # LinkedIn URL
    result['linkedin_url'] = f"https://www.linkedin.com/company/{slug}"
    # Industry: about page takes precedence over staff metadata
    industry = about_data.get('industry') or metadata.get('industry')
    if industry:
        result['industry'] = industry
    # Location (only when present as a structured dict)
    location = metadata.get('location')
    if location and isinstance(location, dict):
        result['location'] = location
    # Institution type
    result['institution_type'] = infer_institution_type(staff_data, about_data)
    # Staff counts; fall back to counting the raw staff list when the
    # analysis block lacks a total.
    if metadata.get('follower_count'):
        result['follower_count'] = metadata['follower_count']
    result['staff_count'] = staff_analysis.get('total_staff_extracted', len(staff_data.get('staff', [])))
    result['heritage_staff_count'] = staff_analysis.get('heritage_relevant_count', 0)
    # Heritage staff list
    heritage_staff = build_heritage_staff_list(staff_data)
    if heritage_staff:
        result['heritage_staff'] = heritage_staff
    # Provenance: record which source each claim came from
    now = datetime.now(timezone.utc).isoformat()
    result['provenance'] = {
        'schema_version': '1.0.0',
        'generated_at': now,
        'sources': {
            'linkedin_people_page': [{
                'source_type': source_meta.get('source_type', 'linkedin_company_people_page_html'),
                'source_file': source_meta.get('source_file'),
                'extraction_timestamp': source_meta.get('registered_timestamp'),
                'claims_extracted': ['staff', 'industry', 'location', 'follower_count'],
            }]
        }
    }
    # Add about page source if available
    if about_data and about_data.get('source_metadata'):
        about_source = about_data['source_metadata']
        result['provenance']['sources']['linkedin_about_page'] = [{
            'source_type': about_source.get('source_type', 'linkedin_company_about_page_html'),
            'source_file': about_source.get('source_file'),
            'extraction_timestamp': about_source.get('extraction_timestamp'),
            'claims_extracted': ['website_url', 'employee_count'],
        }]
    # Add verification source if website was verified or corrected.
    # The timestamp is the generation time, not the original check time.
    if website_status in ('verified', 'corrected'):
        result['provenance']['sources']['url_verification'] = [{
            'source_type': 'http_verification' if website_status == 'verified' else 'manual_correction',
            'verification_timestamp': now,
            'claims_extracted': ['website_status', 'final_url'],
        }]
    return result
def write_yaml(data: dict, path: Path):
    """Serialize *data* to *path* as YAML.

    Dispatches to ruamel.yaml when it was importable at module load,
    otherwise falls back to PyYAML with key sorting disabled.
    """
    with open(path, 'w', encoding='utf-8') as f:
        if USE_RUAMEL:
            yaml.dump(data, f)
        else:
            pyyaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def main():
    """CLI entry point: merge all data sources and emit per-custodian YAML.

    Supports --dry-run (print instead of write), --limit N, and --slug
    to restrict processing to one custodian. Prints a summary of
    processed/skipped counts at the end.
    """
    parser = argparse.ArgumentParser(description='Generate custodian YAML from LinkedIn data')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without writing files')
    parser.add_argument('--limit', type=int, help='Limit number of custodians to process')
    parser.add_argument('--slug', type=str, help='Process only a specific custodian slug')
    args = parser.parse_args()
    print("Loading data sources...")
    # Load all data sources up front; each loader degrades to empty data
    # (with a stderr warning) when its snapshot file is missing.
    staff_by_slug = load_parsed_staff()
    print(f" Loaded {len(staff_by_slug)} parsed staff files")
    about_by_slug = load_about_data()
    print(f" Loaded {len(about_by_slug)} about page entries")
    verification_by_name = load_verification_results()
    print(f" Loaded {len(verification_by_name)} verification results")
    corrections, permanently_closed, no_website = load_corrections()
    print(f" Loaded {len(corrections)} URL corrections")
    print(f" Found {len(permanently_closed)} permanently closed")
    print(f" Found {len(no_website)} with no website")
    # Create output directory (skipped on --dry-run)
    if not args.dry_run:
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Process custodians
    processed = 0
    skipped_closed = 0
    skipped_errors = 0
    slugs_to_process = [args.slug] if args.slug else sorted(staff_by_slug.keys())
    for slug in slugs_to_process:
        # NOTE: --limit 0 behaves as "no limit" (falsy check).
        if args.limit and processed >= args.limit:
            break
        if slug not in staff_by_slug:
            print(f"WARNING: Slug not found: {slug}", file=sys.stderr)
            continue
        staff_data = staff_by_slug[slug]
        metadata = staff_data.get('custodian_metadata', {})
        name = metadata.get('custodian_name') or metadata.get('name', slug)
        # Skip permanently closed institutions entirely
        if name in permanently_closed:
            print(f"SKIPPED (permanently closed): {name}")
            skipped_closed += 1
            continue
        try:
            # Get matching about data (may be absent for this slug)
            about_data = about_by_slug.get(slug, {})
            # Generate YAML
            yaml_data = generate_custodian_yaml(
                slug,
                staff_data,
                about_data,
                verification_by_name,
                corrections,
                no_website
            )
            if args.dry_run:
                print(f"Would generate: {slug}.yaml ({name})")
                if processed < 3:  # Show first 3 examples
                    if USE_RUAMEL:
                        # ruamel's dump() needs a stream, so buffer the
                        # output in memory to truncate it for display.
                        from io import StringIO
                        stream = StringIO()
                        yaml.dump(yaml_data, stream)
                        print(stream.getvalue()[:500])
                    else:
                        print(pyyaml.dump(yaml_data, default_flow_style=False)[:500])
                    print("---")
            else:
                output_path = OUTPUT_DIR / f"{slug}.yaml"
                write_yaml(yaml_data, output_path)
                print(f"Generated: {output_path.name}")
            processed += 1
        except Exception as e:
            # Best-effort batch run: log the failure and keep going.
            print(f"ERROR processing {slug}: {e}", file=sys.stderr)
            skipped_errors += 1
            continue
    print("\n" + "="*60)
    print(f"SUMMARY:")
    print(f" Processed: {processed}")
    print(f" Skipped (closed): {skipped_closed}")
    print(f" Skipped (errors): {skipped_errors}")
    if not args.dry_run:
        print(f" Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()