# glam/scripts/linkedin_batch_complete.py

#!/usr/bin/env python3
"""
Comprehensive LinkedIn Batch Processing - ALL Data Extraction

This script extracts ALL data from LinkedIn HTML files:
1. Full institution names from HTML H1 tags (fixes name extraction bug)
2. Complete staff data (person names, LinkedIn URLs, job titles, etc.)
3. Properly cleans filenames (removes macOS resource forks, periods, parentheses)

Usage:
    python scripts/linkedin_batch_complete.py \
        --input-dir /path/to/html/files \
        --output-dir data/custodian/person/bu_complete \
        --custodian-dir data/custodian/
"""

import argparse
import json
import os
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

try:
    from bs4 import BeautifulSoup
except ImportError:
    print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
    sys.exit(1)

try:
    import yaml
except ImportError:
    print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
    sys.exit(1)

# Put this script's own directory first on the import path so that modules
# stored next to it resolve (presumably parse_linkedin_html, which is imported
# lazily in process_single_file -- confirm its location).
sys.path.insert(0, str(Path(__file__).parent))


def clean_filename_to_slug(filename: str) -> str:
    """
    Turn a saved LinkedIn "People" page filename into a URL-safe slug.

    Applied in order:
    - drop the " People _ LinkedIn.html" suffix and every remaining ".html"
    - drop a macOS resource fork prefix ("._")
    - drop a leading parenthesised number such as "(15)", ".(15)" or "_(15)"
    - trim surrounding spaces and underscores
    - lowercase, then collapse each run of characters outside a-z/0-9 to "-"
    """
    stem = filename.replace(' People _ LinkedIn.html', '').replace('.html', '')

    # macOS resource fork prefix
    if stem[:2] == '._':
        stem = stem[2:]

    # Leading "(N)" marker, optionally preceded by a period and/or underscores
    for leading_number in (r'^\.?\_?\(\d+\)\s*', r'^\._*\(\d+\)\s*'):
        stem = re.sub(leading_number, '', stem)

    stem = stem.strip('_ ')

    # Anything that is not a lowercase ASCII letter or digit becomes a dash
    dashed = re.sub(r'[^a-z0-9]+', '-', stem.lower())
    return re.sub(r'-+', '-', dashed).strip('-')


def extract_institution_name_from_html(html_content: str) -> Optional[str]:
    """
    Extract full institution name from the first HTML H1 tag.

    LinkedIn H1 format: "Organization Name | LinkedIn"
    We keep the part before the first " | ", replace any leftover pipe with a
    single space and collapse runs of whitespace.

    Returns None if there is no H1 or if nothing but whitespace/pipes is left
    after cleanup, so callers can fall back to another name source.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    h1 = soup.find('h1')
    if h1 is None:
        return None

    h1_text = h1.get_text().strip()

    # Part before " | LinkedIn" (split returns the whole text when there is
    # no " | " separator).
    name = h1_text.split(' | ')[0]

    # Clean up extra pipes or separators
    name = re.sub(r'\s*\|\s*', ' ', name)
    # Strip last: replacing a leading/trailing pipe leaves a stray space that
    # would otherwise be returned (or make a blank name look non-empty).
    name = re.sub(r'\s+', ' ', name).strip()

    return name or None


def extract_basic_metadata(html_content: str) -> dict[str, Any]:
    """
    Extract basic metadata from HTML (followers, members, profile cards).

    Returns a dict with:
    - 'follower_count': the count exactly as written before the word
      "follower(s)" (e.g. "512", "1,234", "12K", "1.5M"), or '' if absent
    - 'associated_members': integer before "associated member(s)", with
      thousands commas removed, or 0 if absent
    - 'profile_cards_detected': number of occurrences of the substring
      'org-people-profile-card' in the raw HTML
    """
    follower_count = ''
    associated_members = 0

    # Accept comma-grouped digits, an optional decimal part and an optional
    # K/M/B suffix; a plain \d+ would capture only the tail of "1,234".
    follower_match = re.search(
        r'(\d[\d,]*(?:\.\d+)?[KMB]?)\s+followers?', html_content, re.IGNORECASE
    )
    if follower_match:
        follower_count = follower_match.group(1)

    # Same truncation problem for member counts such as "2,045".
    member_match = re.search(
        r'(\d[\d,]*)\s+associated\s+members?', html_content, re.IGNORECASE
    )
    if member_match:
        associated_members = int(member_match.group(1).replace(',', ''))

    # Count profile cards
    profile_count = html_content.count('org-people-profile-card')

    return {
        'follower_count': follower_count,
        'associated_members': associated_members,
        'profile_cards_detected': profile_count,
    }


def find_existing_custodian(custodian_name: str, custodian_dir: Path) -> Optional[Path]:
    """
    Find existing custodian YAML file by name (case-insensitive).

    Scans ``*.yaml`` in ``custodian_dir`` in sorted order and returns the first
    file whose ``custodian_name`` equals ``custodian_name`` ignoring case.
    ``custodian_name`` in a file may be either a plain string or a mapping
    carrying the name under ``emic_name`` (the shape create_custodian_yaml
    writes for new files). Returns None when nothing matches.
    """
    custodian_lower = custodian_name.lower()

    for custodian_file in sorted(custodian_dir.glob('*.yaml')):
        try:
            with open(custodian_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception:
            # Deliberate best-effort: an unreadable or malformed file must
            # not abort the lookup, it simply cannot match.
            continue

        if not isinstance(data, dict):
            continue

        stored_name = data.get('custodian_name')
        if isinstance(stored_name, dict):
            # Mapping form: the name itself lives under 'emic_name'.
            stored_name = stored_name.get('emic_name')

        if isinstance(stored_name, str) and stored_name.lower() == custodian_lower:
            return custodian_file

    return None


def process_single_file(html_path: Path, output_dir: Path, custodian_dir: Path, use_existing_parser: bool) -> dict[str, Any]:
    """
    Process a single HTML file and extract ALL data.

    Args:
        html_path: Saved LinkedIn "People" page to read.
        output_dir: Directory that receives ``<slug>_staff_<timestamp>.json``.
        custodian_dir: Accepted for interface compatibility; not used here.
        use_existing_parser: When True, staff extraction is delegated to
            ``parse_linkedin_html.parse_html_file``. When False, or when that
            import fails, an empty staff record flagged with
            ``_parser_unavailable`` is produced instead.

    Returns processing result dictionary with:
    - 'status', 'slug', 'filename'
    - 'custodian_name': H1 institution name, or a filename-derived fallback
    - 'name_source': 'html_h1' or 'filename'
    - 'staff_count': ``staff_analysis.total_staff_extracted`` of the record
    - 'basic_metadata': output of extract_basic_metadata
    - 'result': the complete staff record that was written to disk
    """
    # Read HTML
    with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
        html_content = f.read()

    # Extract name from H1
    html_name = extract_institution_name_from_html(html_content)
    name_source = 'html_h1'

    if not html_name:
        # Fallback: derive a display name from the filename. Same cleanup as
        # the slug, but casing and inner punctuation are preserved.
        name_source = 'filename'
        filename_clean = html_path.name.replace(' People _ LinkedIn.html', '')
        filename_clean = filename_clean.replace('.html', '')
        if filename_clean.startswith('._'):
            filename_clean = filename_clean[2:]
        filename_clean = re.sub(r'^\.?\_?\(\d+\)\s*', '', filename_clean)
        filename_clean = re.sub(r'^\._*\(\d+\)\s*', '', filename_clean)
        filename_clean = re.sub(r'\s+', ' ', filename_clean).strip()
        html_name = filename_clean

    # Generate slug
    slug = clean_filename_to_slug(html_path.name)

    # Extract basic metadata
    basic_metadata = extract_basic_metadata(html_content)

    # Try to use existing parser for complete staff extraction
    # This gives us person names, LinkedIn URLs, job titles, heritage analysis
    staff_result = None
    if use_existing_parser:
        try:
            from parse_linkedin_html import parse_html_file
            staff_result = parse_html_file(html_path, html_name, slug)
        except ImportError:
            # parse_linkedin_html.py not available: fall through to the
            # empty record below.
            staff_result = None

    if staff_result is None:
        staff_result = {
            'custodian_metadata': {
                'custodian_name': html_name,
                'custodian_slug': slug,
                'name': html_name,
            },
            'source_metadata': {
                'source_type': 'linkedin_company_people_page_html',
                'source_file': html_path.name,
                'registered_timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
                'registration_method': 'html_parsing_simple_regex',
                'staff_extracted': 0,
            },
            'staff': [],
            'staff_analysis': {
                'total_staff_extracted': 0,
                'with_linkedin_url': 0,
                'with_alternate_profiles': 0,
                'anonymous_members': 0,
                'heritage_relevant_count': 0,
                'staff_by_heritage_type': {},
            },
        }
        staff_result['_parser_unavailable'] = True

    # Merge basic metadata into staff result (tolerate a parser record that
    # lacks the custodian_metadata section).
    custodian_metadata = staff_result.setdefault('custodian_metadata', {})
    custodian_metadata['follower_count'] = basic_metadata.get('follower_count', '')
    custodian_metadata['associated_members'] = basic_metadata.get('associated_members', 0)
    # The previous guard tested a key that extract_basic_metadata never
    # emits, so the card count was silently dropped.
    custodian_metadata['profile_cards_detected'] = basic_metadata.get('profile_cards_detected', 0)

    # Save staff JSON
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    staff_filename = output_dir / f"{slug}_staff_{timestamp}.json"

    with open(staff_filename, 'w', encoding='utf-8') as f:
        json.dump(staff_result, f, indent=2, ensure_ascii=False)

    return {
        'status': 'success',
        'slug': slug,
        'filename': html_path.name,
        'custodian_name': html_name,
        'name_source': name_source,
        'staff_count': staff_result.get('staff_analysis', {}).get('total_staff_extracted', 0),
        'basic_metadata': basic_metadata,
        'result': staff_result,
    }


def create_custodian_yaml(custodian_name: str, result: dict, custodian_file: Optional[Path], is_new: bool) -> None:
    """
    Create or update custodian YAML file with staff data.

    Args:
        custodian_name: Institution name to store.
        result: Either the dict returned by process_single_file (parsed staff
            record nested under 'result', page metadata under
            'basic_metadata') or a bare staff record with 'staff',
            'source_metadata' and 'staff_analysis' at the top level.
        custodian_file: YAML file to write (new) or to read and rewrite
            (existing).
        is_new: True to create a fresh file, False to update ``custodian_file``.

    Nothing is written when no staff member has a 'linkedin_profile_url'.

    Raises:
        ValueError: if staff data is present but ``custodian_file`` is None.
    """
    # The staff record sits one level down in process_single_file's return
    # value; reading it from the top level there always yields no staff.
    nested = result.get('result')
    parsed = nested if isinstance(nested, dict) else result

    staff_list = parsed.get('staff') or []
    staff_with_profiles = [s for s in staff_list if s.get('linkedin_profile_url')]

    if not staff_with_profiles:
        return

    if custodian_file is None:
        raise ValueError('custodian_file is required to write custodian YAML')

    source_metadata = parsed.get('source_metadata') or {}
    basic_metadata = result.get('basic_metadata') or {}
    registered_timestamp = source_metadata.get('registered_timestamp', '')

    # Provenance data
    provenance = {
        'source_type': 'linkedin_company_people_page_html',
        'registered_timestamp': registered_timestamp,
        'registration_method': 'html_parsing_with_full_staff_data',
        'total_staff_extracted': len(staff_with_profiles),
    }

    # Staff list with references to entity files (will be created later);
    # person_profile_path is a glob-style pattern, not a concrete file.
    staff_list_data = [
        {
            'staff_id': s.get('staff_id'),
            'person_name': s.get('name'),
            'person_profile_path': f"data/custodian/person/entity/{s.get('linkedin_slug', '')}_*.json",
            'role_title': s.get('headline', ''),
            'heritage_relevant': s.get('heritage_relevant', False),
            'heritage_type': s.get('heritage_type'),
            'linkedin_profile_url': s.get('linkedin_profile_url'),
            'linkedin_slug': s.get('linkedin_slug'),
            'degree': s.get('degree', 'unknown'),
            'mutual_connections': s.get('mutual_connections', ''),
        }
        for s in staff_with_profiles
    ]
    staff_section = {
        'provenance': provenance,
        'staff_list': staff_list_data,
    }

    # Enrichment fields refreshed on every run ...
    enrichment_refreshed = {
        'source_file': source_metadata.get('source_file', ''),
        'extraction_date': registered_timestamp,
        'follower_count': basic_metadata.get('follower_count', ''),
        'associated_members': basic_metadata.get('associated_members', 0),
        'profile_cards_detected': basic_metadata.get('profile_cards_detected', 0),
    }
    # ... and fields only set when the enrichment section is first created.
    enrichment_static = {
        'source_type': 'linkedin_company_people_page_html',
        'extraction_method': 'html_parsing_with_full_staff_data',
    }

    if is_new:
        # Determine institution type from the most frequent staff heritage
        # type; default to MUSEUM when there is no analysis or the code is
        # not in the map.
        type_map = {
            'M': 'MUSEUM',
            'L': 'LIBRARY',
            'A': 'ARCHIVE',
            'G': 'GALLERY',
            'R': 'RESEARCH_CENTER',
            'E': 'EDUCATION_PROVIDER',
            'S': 'COLLECTING_SOCIETY',
            'D': 'DIGITAL_PLATFORM',
        }
        institution_type = 'MUSEUM'
        heritage_types = (parsed.get('staff_analysis') or {}).get('staff_by_heritage_type') or {}
        if heritage_types:
            type_code = Counter(heritage_types).most_common(1)[0][0]
            institution_type = type_map.get(type_code, 'MUSEUM')

        # Generate placeholder GHCID
        slug = clean_filename_to_slug(f"{custodian_name}.html")
        placeholder_ghcid = f"NL-XX-XXX-PENDING-{slug.upper()}"

        # 'custodian_name' used to appear twice in this literal (string, then
        # mapping); the mapping always won, so only that form is kept.
        custodian_data = {
            'ghcid_current': placeholder_ghcid,
            'custodian_name': {
                'emic_name': custodian_name,
                'english_name': None,
                'name_verified': True,
                'name_source': 'linkedin_html_h1',
            },
            'institution_type': institution_type,
            'linkedin_enrichment': {**enrichment_refreshed, **enrichment_static},
            'staff': staff_section,
            'provenance': {
                'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
                'data_tier': 'TIER_4_INFERRED',
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'extraction_method': 'Comprehensive batch processing with H1 name extraction and full staff data',
                'confidence_score': 0.90,
                'notes': f'Staff extracted from LinkedIn company People page. H1 name used: {custodian_name}. Total staff with profiles: {len(staff_with_profiles)}. Location research needed for GHCID.',
            }
        }
    else:
        # Update existing file
        with open(custodian_file, 'r', encoding='utf-8') as f:
            custodian_data = yaml.safe_load(f) or {}

        # Update staff section
        custodian_data['staff'] = staff_section

        # Update LinkedIn enrichment; a missing or null section is created
        # in full, an existing one only has its per-run fields refreshed.
        existing_enrichment = custodian_data.get('linkedin_enrichment')
        if isinstance(existing_enrichment, dict):
            existing_enrichment.update(enrichment_refreshed)
        else:
            custodian_data['linkedin_enrichment'] = {**enrichment_refreshed, **enrichment_static}

        # Update custodian name without flattening a mapping-style entry
        # (and losing its other fields) into a bare string.
        existing_name = custodian_data.get('custodian_name')
        if isinstance(existing_name, dict):
            existing_name['emic_name'] = custodian_name
        else:
            custodian_data['custodian_name'] = custodian_name

    with open(custodian_file, 'w', encoding='utf-8') as f:
        yaml.dump(custodian_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


def main() -> int:
    """
    Command-line entry point: process every ``*.html`` file in --input-dir.

    For each file a staff JSON is written to --output-dir and, when staff was
    extracted, a custodian YAML in --custodian-dir is created or updated.
    A summary is printed and also saved to
    ``reports/linkedin_batch_complete_report.json`` (relative to the current
    working directory).

    Returns 0; exits with status 1 if the input directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Comprehensive LinkedIn batch processing - extracts ALL data (H1 names + staff data)'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                    help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                    help='Output directory for staff JSON files')
    parser.add_argument('--custodian-dir', type=Path, required=True,
                    help='Directory containing custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                    help='Limit processing to first N files (0 = all)')

    args = parser.parse_args()

    # is_dir() also rejects a path that exists but is a regular file.
    if not args.input_dir.is_dir():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    # Create output directories
    args.output_dir.mkdir(parents=True, exist_ok=True)
    args.custodian_dir.mkdir(parents=True, exist_ok=True)

    # Get all HTML files
    html_files = sorted(args.input_dir.glob('*.html'))

    if args.limit > 0:
        html_files = html_files[:args.limit]

    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"Custodian directory: {args.custodian_dir}")
    print("Extracting: H1 institution names + Complete staff data (names, URLs, job titles)")

    # Statistics
    stats = {
        'total': len(html_files),
        'success': 0,
        'errors': 0,
        'with_staff': 0,
        'total_staff': 0,
        'custodians_created': 0,
        'custodians_updated': 0,
        'name_from_h1': 0,
        'name_from_filename': 0,
        'parser_unavailable': 0,
    }

    # Process files. Any exception raised for a file is reported and counted
    # as an error so one bad file does not stop the batch.
    for i, html_path in enumerate(html_files, 1):
        try:
            if i % 50 == 0:
                print(f"Progress: [{i}/{len(html_files)}]", end='\r')

            result = process_single_file(html_path, args.output_dir, args.custodian_dir, use_existing_parser=True)

            if result['status'] == 'error':
                stats['errors'] += 1
                print(f"Error: {result['filename']}: {result.get('error')}", file=sys.stderr)
                continue
            if result['status'] != 'success':
                continue

            stats['success'] += 1
            staff_count = result.get('staff_count', 0)
            stats['total_staff'] += staff_count
            if staff_count > 0:
                stats['with_staff'] += 1

            # Count the parser as unavailable only when the record says so,
            # and only once per file; a page with zero staff is not the same
            # thing as a missing parser.
            if (result.get('result') or {}).get('_parser_unavailable'):
                stats['parser_unavailable'] += 1

            # Track name source. Prefer an explicit 'name_source' entry when
            # the result carries one; otherwise keep the historical rule that
            # any non-empty name is attributed to the H1.
            custodian_name = result.get('custodian_name')
            name_source = result.get('name_source')
            if name_source is None:
                name_source = 'html_h1' if custodian_name else 'filename'

            if name_source == 'html_h1':
                stats['name_from_h1'] += 1
            else:
                stats['name_from_filename'] += 1

            # Find or create custodian YAML. Counters are bumped after the
            # call so a failed write is not reported as created/updated.
            if custodian_name and staff_count > 0:
                existing_file = find_existing_custodian(custodian_name, args.custodian_dir)

                if existing_file:
                    create_custodian_yaml(custodian_name, result, existing_file, is_new=False)
                    stats['custodians_updated'] += 1
                else:
                    slug = result.get('slug', '')
                    placeholder_ghcid = f"NL-XX-XXX-PENDING-{slug.upper()}"
                    custodian_file = args.custodian_dir / f"{placeholder_ghcid}.yaml"
                    create_custodian_yaml(custodian_name, result, custodian_file, is_new=True)
                    stats['custodians_created'] += 1

        except Exception as e:
            stats['errors'] += 1
            print(f"Exception: {html_path.name}: {e}", file=sys.stderr)

    print("\nProcessing complete!")

    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print("\nStatistics:")
    print(f"  Total HTML files: {stats['total']}")
    print(f"  Successfully processed: {stats['success']}")
    print(f"  Errors: {stats['errors']}")
    print(f"  Institutions with staff data: {stats['with_staff']}")
    print(f"  Institutions with no staff: {stats['success'] - stats['with_staff']}")
    print(f"  Files processed without staff parser: {stats['parser_unavailable']}")
    print(f"  Total staff extracted: {stats['total_staff']}")
    print(f"  Custodians created: {stats['custodians_created']}")
    print(f"  Custodians updated: {stats['custodians_updated']}")
    print(f"  Names from H1: {stats['name_from_h1']}")
    print(f"  Names from filename: {stats['name_from_filename']}")
    print("\nOutput directories:")
    print(f"  Staff JSON files: {args.output_dir}")
    print(f"  Custodian YAML files: {args.custodian_dir}")

    # Save processing report
    report = {
        'processing_date': datetime.now(timezone.utc).isoformat(),
        'input_directory': str(args.input_dir),
        'output_directory': str(args.output_dir),
        'custodian_directory': str(args.custodian_dir),
        'statistics': stats,
    }

    report_file = Path('reports/linkedin_batch_complete_report.json')
    report_file.parent.mkdir(parents=True, exist_ok=True)
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {report_file}")

    return 0


# Run the batch job only when executed as a script; main()'s return value is
# used as the process exit status.
if __name__ == '__main__':
    sys.exit(main())