# glam/scripts/batch_parse_linkedin_manual.py
# Export metadata: 2025-12-30 03:43:31 +01:00 — 501 lines, 19 KiB, Python.
#!/usr/bin/env python3
"""
Batch process all LinkedIn company People HTML files from manual directory.
This script:
1. Scans manual directory for all HTML files
2. Extracts institution names from filenames
3. Runs parse_linkedin_html.py for each file
4. Creates person entity files for each staff member
5. Creates or updates custodian YAML files
Usage:
python scripts/batch_parse_linkedin_manual.py [--limit N]
Options:
--limit N Only process first N files (for testing)
"""
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
try:
import yaml
except ImportError:
yaml = None
# Directory paths
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
PERSON_ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
def extract_institution_name_from_filename(filename: str) -> Optional[str]:
"""Extract institution name from LinkedIn People HTML filename."""
name = Path(filename).name
name = name.replace('.html', '')
name = re.sub(r'_?People _ LinkedIn$', '', name)
name = re.sub(r'^\(\d+\)\s*', '', name)
name = re.sub(r'\s+', ' ', name).strip()
name = name.rstrip('_')
return name if name else None
def generate_slug_from_name(name: str) -> str:
"""Generate URL-friendly slug from institution name."""
slug = name.lower()
slug = re.sub(r'[^a-z0-9\s-]', '', slug)
slug = re.sub(r'[\s-]+', '-', slug)
slug = slug.strip('-')
return slug
def parse_html_file(html_path: Path, institution_name: str, slug: str) -> Optional[dict]:
"""Parse a single HTML file using parse_linkedin_html.py script."""
output_path = Path(f"/tmp/{slug}_staff_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json")
try:
result = subprocess.run(
[
sys.executable,
"/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
str(html_path),
"--custodian-name", institution_name,
"--custodian-slug", slug,
"--output", str(output_path)
],
capture_output=True,
text=True,
timeout=60
)
if result.returncode != 0:
print(f"ERROR parsing {html_path.name}: {result.stderr}", file=sys.stderr)
return None
with open(output_path, 'r', encoding='utf-8') as f:
return json.load(f)
except subprocess.TimeoutExpired:
print(f"TIMEOUT parsing {html_path.name}", file=sys.stderr)
return None
except Exception as e:
print(f"ERROR parsing {html_path.name}: {e}", file=sys.stderr)
return None
def find_existing_custodian(institution_name: str) -> Optional[Path]:
"""Find existing custodian YAML file by institution name."""
if not yaml:
return None
for yaml_file in CUSTODIAN_DIR.glob("*.yaml"):
try:
with open(yaml_file, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if isinstance(data, list) and len(data) > 0:
data = data[0]
if data and isinstance(data, dict) and 'name' in data:
name = data.get('name')
if name and name.lower() == institution_name.lower():
return yaml_file
alt_names = data.get('alternative_names', [])
if isinstance(alt_names, list):
for alt in alt_names:
alt_str = str(alt) if not isinstance(alt, str) else alt
if alt_str.lower() == institution_name.lower():
return yaml_file
except Exception:
continue
return None
def create_person_entity(staff_member: dict, custodian_slug: str, custodian_name: str, institution_type: str) -> Optional[str]:
"""
Create a person entity file following Rule 20 (Person Entity Profiles).
Returns path to created file or None on error.
"""
person_id = staff_member.get('staff_id')
if not person_id:
return None
timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
filename = f"{person_id}_{timestamp}.json"
output_path = PERSON_ENTITY_DIR / filename
person_entity = {
'person_id': person_id,
'extraction_metadata': {
'extraction_agent': 'claude-opus-4.5',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_source': f'LinkedIn company page: {custodian_name}',
'source_url': staff_member.get('linkedin_profile_url'),
},
'profile_data': {
'full_name': staff_member.get('name'),
'name_type': staff_member.get('name_type'),
'headline': staff_member.get('headline', ''),
'linkedin_slug': staff_member.get('linkedin_slug'),
'linkedin_profile_url': staff_member.get('linkedin_profile_url'),
'connection_degree': staff_member.get('degree'),
'mutual_connections': staff_member.get('mutual_connections', ''),
},
'heritage_relevance': {
'is_heritage_relevant': staff_member.get('heritage_relevant', False),
'heritage_type': staff_member.get('heritage_type'),
'custodian_name': custodian_name,
'institution_type': institution_type,
},
'affiliations': [
{
'custodian_name': custodian_name,
'custodian_slug': custodian_slug,
'role_title': staff_member.get('headline', ''),
'affiliation_type': 'employment',
'affiliation_source': 'LinkedIn company page',
'affiliation_source_url': staff_member.get('linkedin_profile_url', ''),
}
]
}
try:
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(person_entity, f, indent=2, ensure_ascii=False)
return str(output_path)
except Exception as e:
print(f"ERROR creating person entity: {e}", file=sys.stderr)
return None
def create_or_update_custodian(institution_name: str, slug: str, parse_result: dict, html_file: str) -> Optional[Path]:
"""
Create new custodian YAML file or update existing one.
Returns path to custodian file.
"""
existing_file = find_existing_custodian(institution_name)
custodian_metadata = parse_result.get('custodian_metadata', {})
staff_list = parse_result.get('staff', [])
source_metadata = parse_result.get('source_metadata', {})
staff_count = len([s for s in staff_list if s.get('name_type') != 'organization'])
institution_type = 'UNKNOWN'
staff_analysis = parse_result.get('staff_analysis', {})
heritage_types = staff_analysis.get('staff_by_heritage_type', {})
if heritage_types:
# Map to GLAMORCUBESFIXPHDNT type
type_mapping = {
'G': 'GALLERY',
'L': 'LIBRARY',
'A': 'ARCHIVE',
'M': 'MUSEUM',
'O': 'OFFICIAL_INSTITUTION',
'R': 'RESEARCH_CENTER',
'C': 'CORPORATION',
'E': 'EDUCATION_PROVIDER',
'S': 'COLLECTING_SOCIETY',
'D': 'DIGITAL_PLATFORM',
'I': 'INTANGIBLE_HERITAGE_GROUP',
'T': 'TASTE_SMELL',
'B': 'BOTANICAL_ZOO',
'H': 'HOLY_SITES',
'F': 'FEATURES',
'N': 'NGO',
'X': 'MIXED',
'P': 'PERSONAL_COLLECTION',
'U': 'UNKNOWN'
}
for htype in heritage_types.keys():
if heritage_types[htype] > 0:
institution_type = type_mapping.get(htype, 'UNKNOWN')
break
if existing_file:
print(f" UPDATING: {existing_file.name}")
with open(existing_file, 'r', encoding='utf-8') as f:
if yaml:
custodian_data = yaml.safe_load(f)
else:
custodian_data = json.load(f)
if isinstance(custodian_data, list) and len(custodian_data) > 0:
custodian_data = custodian_data[0]
# Add or update staff section
staff_section = {
'staff_count': staff_count,
'staff_source': {
'source_type': 'linkedin_company_people_page_html',
'source_file': html_file,
'registered_timestamp': source_metadata.get('registered_timestamp'),
'registration_method': 'html_parsing',
},
'staff': [
{
'person_id': s.get('staff_id'),
'person_name': s.get('name'),
'role_title': s.get('headline', ''),
'linkedin_profile_path': f"data/custodian/person/entity/{s.get('staff_id')}_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json",
'affiliation_provenance': {
'source': 'LinkedIn company page',
'source_url': s.get('linkedin_profile_url', ''),
'retrieved_on': datetime.now(timezone.utc).isoformat(),
}
}
for s in staff_list if s.get('name_type') != 'organization'
]
}
custodian_data['staff'] = staff_section
custodian_data['provenance'] = custodian_data.get('provenance', {})
custodian_data['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()
with open(existing_file, 'w', encoding='utf-8') as f:
if yaml:
yaml.dump([custodian_data], f, allow_unicode=True, sort_keys=False, default_flow_style=False)
else:
json.dump([custodian_data], f, indent=2, ensure_ascii=False)
return existing_file
else:
# Create new custodian file
print(f" CREATING: NL-XX-XXX-?-{slug}.yaml (placeholder GHCID)")
custodian_data = {
'name': institution_name,
'institution_type': institution_type,
'description': f"Institution profile extracted from LinkedIn company page. Industry: {custodian_metadata.get('industry', 'Unknown')}",
'ghcid': {
'ghcid_current': 'NL-XX-XXX-PENDING', # Placeholder - needs research
'location_resolution': {
'method': 'PENDING',
'notes': 'GHCID not assigned - requires geographic research'
}
},
'staff': {
'staff_count': staff_count,
'staff_source': {
'source_type': 'linkedin_company_people_page_html',
'source_file': html_file,
'registered_timestamp': source_metadata.get('registered_timestamp'),
'registration_method': 'html_parsing',
},
'staff': [
{
'person_id': s.get('staff_id'),
'person_name': s.get('name'),
'role_title': s.get('headline', ''),
'linkedin_profile_path': f"data/custodian/person/entity/{s.get('staff_id')}_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json",
'affiliation_provenance': {
'source': 'LinkedIn company page',
'source_url': s.get('linkedin_profile_url', ''),
'retrieved_on': datetime.now(timezone.utc).isoformat(),
}
}
for s in staff_list if s.get('name_type') != 'organization'
]
},
'provenance': {
'data_source': 'LINKEDIN_COMPANY_PAGE',
'data_tier': 'TIER_3_CROWD_SOURCED',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_method': 'Batch HTML parsing from manual directory',
'confidence_score': 0.70,
'source_metadata': {
'linkedin_page_type': 'company_people_page',
'source_file': html_file,
'staff_extracted': staff_count,
}
}
}
# Generate output filename
output_filename = f"NL-XX-UNKNOWN-{institution_type[0:3]}-{slug}.yaml"
output_path = CUSTODIAN_DIR / output_filename
with open(output_path, 'w', encoding='utf-8') as f:
if yaml:
yaml.dump([custodian_data], f, allow_unicode=True, sort_keys=False, default_flow_style=False)
else:
json.dump([custodian_data], f, indent=2, ensure_ascii=False)
print(f" → Created: {output_filename}")
return output_path
def main():
"""Main batch processing function."""
# Parse command line arguments
limit = None
if '--limit' in sys.argv:
idx = sys.argv.index('--limit')
if idx + 1 < len(sys.argv):
limit = int(sys.argv[idx + 1])
# Ensure output directories exist
PERSON_ENTITY_DIR.mkdir(parents=True, exist_ok=True)
CUSTODIAN_DIR.mkdir(parents=True, exist_ok=True)
# Get all HTML files
html_files = sorted(MANUAL_DIR.glob("*.html"))
if limit:
html_files = html_files[:limit]
print(f"LIMIT MODE: Processing first {limit} files (of {len(sorted(MANUAL_DIR.glob('*.html')))} total)")
print(f"\nFound {len(html_files)} HTML files to process")
print(f"Input directory: {MANUAL_DIR}")
print(f"Output directories:")
print(f" - Person entities: {PERSON_ENTITY_DIR}")
print(f" - Custodian files: {CUSTODIAN_DIR}")
print(f"\nStarting batch processing...")
print()
# Track statistics
stats = {
'total_files': len(html_files),
'processed': 0,
'errors': 0,
'new_custodians': 0,
'existing_custodians': 0,
'total_staff': 0,
'person_entities_created': 0,
'anonymous_members': 0,
'heritage_relevant_staff': 0,
'custodians_by_type': defaultdict(int),
'errors_list': [],
}
# Process each HTML file
for i, html_file in enumerate(html_files, 1):
try:
print(f"[{i}/{len(html_files)}] Processing: {html_file.name}")
# Extract institution name from filename
institution_name = extract_institution_name_from_filename(html_file.name)
if not institution_name:
print(f" SKIP: Could not extract name from filename")
stats['errors'] += 1
stats['errors_list'].append(html_file.name)
continue
# Generate slug
slug = generate_slug_from_name(institution_name)
# Parse HTML file
parse_result = parse_html_file(html_file, institution_name, slug)
if not parse_result:
stats['errors'] += 1
stats['errors_list'].append(html_file.name)
continue
stats['processed'] += 1
staff_list = parse_result.get('staff', [])
staff_analysis = parse_result.get('staff_analysis', {})
stats['total_staff'] += staff_analysis.get('total_staff_extracted', 0)
stats['anonymous_members'] += staff_analysis.get('anonymous_members', 0)
stats['heritage_relevant_staff'] += staff_analysis.get('heritage_relevant_count', 0)
# Create or update custodian
custodian_file = create_or_update_custodian(institution_name, slug, parse_result, html_file.name)
if custodian_file:
# Check if new or existing
existing = find_existing_custodian(institution_name)
if existing:
stats['existing_custodians'] += 1
else:
stats['new_custodians'] += 1
# Track institution type
staff_by_type = staff_analysis.get('staff_by_heritage_type', {})
if staff_by_type:
for htype in staff_by_type.keys():
if staff_by_type[htype] > 0:
# Map to GH type
type_map = {'M': 'MUSEUM', 'L': 'LIBRARY', 'A': 'ARCHIVE'}
stats['custodians_by_type'][type_map.get(htype, 'UNKNOWN')] += 1
# Create person entity files for each staff member
staff_count = 0
for staff_member in staff_list:
if staff_member.get('name_type') != 'organization':
staff_count += 1
# Only create person entity if heritage-relevant or has LinkedIn URL
if staff_member.get('heritage_relevant') or staff_member.get('linkedin_profile_url'):
person_file = create_person_entity(
staff_member, slug, institution_name,
parse_result.get('custodian_metadata', {}).get('institution_type', 'UNKNOWN')
)
if person_file:
stats['person_entities_created'] += 1
if i % 50 == 0 or i == len(html_files):
print()
print(f"Progress: {i}/{len(html_files)} files processed")
print(f" New custodians: {stats['new_custodians']}")
print(f" Existing custodians: {stats['existing_custodians']}")
print(f" Total staff extracted: {stats['total_staff']}")
print(f" Person entities created: {stats['person_entities_created']}")
print()
except Exception as e:
print(f"ERROR processing {html_file.name}: {e}", file=sys.stderr)
stats['errors'] += 1
stats['errors_list'].append(f"{html_file.name}: {e}")
continue
# Print final statistics
print("\n" + "="*60)
print("BATCH PROCESSING COMPLETE")
print("="*60)
print(f"Total files: {stats['total_files']}")
print(f"Successfully processed: {stats['processed']}")
print(f"Errors: {stats['errors']}")
if stats['errors'] > 0 and stats['errors_list']:
print(f"\nError details:")
for err in stats['errors_list'][:10]:
print(f" - {err}")
if len(stats['errors_list']) > 10:
print(f" ... and {len(stats['errors_list']) - 10} more errors")
print()
print(f"New custodians: {stats['new_custodians']}")
print(f"Existing custodians: {stats['existing_custodians']}")
print()
print(f"Total staff extracted: {stats['total_staff']}")
print(f"Heritage-relevant staff: {stats['heritage_relevant_staff']}")
print(f"Anonymous members: {stats['anonymous_members']}")
print(f"Person entity files created: {stats['person_entities_created']}")
print()
print("Custodians by type:")
for ctype, count in sorted(stats['custodians_by_type'].items()):
print(f" {ctype}: {count}")
print("="*60)
return 0
if __name__ == '__main__':
sys.exit(main())