glam/scripts/linkedin_batch_comprehensive.py
2025-12-30 03:43:31 +01:00

494 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive LinkedIn Batch Processing - Fix All Issues
This script fixes all identified issues with the previous batch processing:
1. Properly cleans filenames (removes macOS resource forks, periods, spaces, parentheses)
2. Extracts full institution name from HTML H1 tag (not from filename)
3. Re-processes all HTML files to extract correct staff data
4. Creates person entity files from staff JSON
5. Creates/updates custodian YAML files
Usage:
python scripts/linkedin_batch_comprehensive.py \
--input-dir /path/to/html/files \
--output-dir data/custodian/person/bu_fixed \
--entity-dir data/custodian/person/entity \
--custodian-dir data/custodian/
"""
import argparse
import json
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
try:
from bs4 import BeautifulSoup
except ImportError:
print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
sys.exit(1)
try:
import yaml
except ImportError:
print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
sys.exit(1)
# Import existing parser (we'll enhance it)
sys.path.insert(0, str(Path(__file__).parent))
from parse_linkedin_html import parse_html_file, generate_staff_id
def clean_filename_to_name(filename: str) -> str:
    """
    Clean HTML filename to extract institution name.

    Handles:
    - macOS resource fork prefixes (._)
    - Periods before numbers (._(15))
    - Numbers in parentheses (15), (7)
    - Leading bare numeric prefixes (15-)
    - Underscores/hyphens used as word separators
    - Extra spaces
    - " People _ LinkedIn.html" suffix

    Examples:
        "._(15) Gemeente Enkhuizen_ People _ LinkedIn.html"
            -> "Gemeente Enkhuizen"
        "(7) ADVN _ archief voor nationale bewegingen_ People _ LinkedIn.html"
            -> "ADVN archief voor nationale bewegingen"
        "15-arabian-oud_ People _ LinkedIn.html"
            -> "arabian oud"
    """
    # Strip the LinkedIn suffix first, while its literal underscores are
    # still intact, then any remaining .html extension.
    name = filename.replace(' People _ LinkedIn.html', '')
    name = name.replace('.html', '')
    # Remove macOS resource fork prefix (._)
    if name.startswith('._'):
        name = name[2:]
    # Remove a leading "(15) " style counter, optionally preceded by any
    # mix of '.'/'_' characters: "._(15)", ".(15)", "_(15)", "(15)".
    # (Previously two overlapping regexes handled subsets of this.)
    name = re.sub(r'^[._]*\(\d+\)\s*', '', name)
    # Remove a leading bare numeric prefix glued on with a dash: "15-foo".
    name = re.sub(r'^\d+-', '', name)
    # Underscores and hyphens act as word separators in these filenames;
    # normalizing them fixes names like "ADVN _ archief" and "arabian-oud".
    name = re.sub(r'[-_]+', ' ', name)
    # Collapse runs of whitespace into single spaces
    name = re.sub(r'\s+', ' ', name)
    return name.strip()
def extract_institution_name_from_html(html_content: str) -> Optional[str]:
    """
    Pull the full institution name out of the page's first H1 tag.

    LinkedIn renders the H1 as "Organization Name | LinkedIn"; the part
    before the pipe separator is the name we want.

    Returns None when no (non-empty) H1 is present.
    """
    heading = BeautifulSoup(html_content, 'html.parser').find('h1')
    if not heading:
        return None
    text = heading.get_text().strip()
    # Keep only what precedes the " | LinkedIn" separator, when present.
    name = text.split(' | ')[0].strip() if ' | ' in text else text
    # Fold any leftover pipes and runs of whitespace into single spaces.
    name = re.sub(r'\s*\|\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    return name or None
def process_html_file(html_path: Path, output_dir: Path) -> dict[str, Any]:
    """
    Parse one LinkedIn "People" page and return its extracted staff data.

    The institution name is taken from the page's H1 tag when available,
    falling back to the cleaned filename; the slug is always derived from
    the cleaned filename.
    """
    with open(html_path, 'r', encoding='utf-8', errors='replace') as fh:
        page = fh.read()

    # Prefer the name embedded in the page itself over the filename.
    institution = extract_institution_name_from_html(page)
    if not institution:
        institution = clean_filename_to_name(html_path.name)
        print(f"Warning: H1 not found in {html_path.name}, using filename: {institution}", file=sys.stderr)

    # Build a URL-safe slug from the cleaned filename.
    slug_base = clean_filename_to_name(html_path.name)
    slug = re.sub(r'[^a-z0-9]+', '-', slug_base.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')

    # Delegate the actual staff extraction to the existing parser.
    result = parse_html_file(html_path, institution, slug)

    # Overwrite whatever name the parser derived with the HTML-sourced one,
    # and record which file this came from.
    result['custodian_metadata']['custodian_name'] = institution
    result['custodian_metadata']['name'] = institution
    result['source_metadata']['source_file'] = html_path.name
    return result
def create_person_entity(staff_entry: dict, custodian_name: str, html_filename: str, entity_dir: Path) -> Optional[Path]:
    """
    Create a person entity JSON file from a staff entry.

    Follows Rule 20: Person Entity Profiles - Individual File Storage

    Args:
        staff_entry: Parsed staff record; needs a real 'name' and a
            'linkedin_slug' for an entity file to be produced.
        custodian_name: Institution the person is affiliated with.
        html_filename: Source HTML filename, recorded as provenance.
        entity_dir: Directory the entity JSON file is written into.

    Returns:
        Path to the new (or previously created) entity file, or None when
        the entry is anonymous or has no LinkedIn slug.
    """
    name = staff_entry.get('name', '')
    if not name or name.startswith('LinkedIn Member'):
        # Skip anonymous profiles - they don't have entity profiles
        return None
    linkedin_slug = staff_entry.get('linkedin_slug', '')
    if not linkedin_slug:
        # Without a slug every file would be named "_<timestamp>.json",
        # colliding between different people; skip instead.
        return None
    entity_dir.mkdir(parents=True, exist_ok=True)
    # Filenames embed a per-run timestamp ({linkedin-slug}_{ISO-timestamp}.json),
    # so an exact-path existence check never matches on re-runs and duplicate
    # files accumulate.  De-duplicate by slug instead: reuse any prior file.
    existing = sorted(entity_dir.glob(f"{linkedin_slug}_*.json"))
    if existing:
        return existing[0]
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    entity_path = entity_dir / f"{linkedin_slug}_{timestamp}.json"
    # Create person entity structure
    person_entity = {
        'extraction_agent': 'claude-opus-4.5',
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'profile_data': {
            'person_id': staff_entry.get('staff_id'),
            'full_name': name,
            'linkedin_slug': linkedin_slug,
            'linkedin_profile_url': staff_entry.get('linkedin_profile_url'),
            'headline': staff_entry.get('headline', ''),
            'degree': staff_entry.get('degree', 'unknown'),
            'mutual_connections': staff_entry.get('mutual_connections', ''),
        },
        'affiliations': [{
            'organization_name': custodian_name,
            'organization_slug': None,  # Will be filled during custodian matching
            'role_title': staff_entry.get('headline', ''),
            'affiliation_type': 'staff',
            'affiliation_provenance': {
                'source_type': 'linkedin_company_people_page_html',
                'source_file': html_filename,
                'registered_timestamp': timestamp,
                'registration_method': 'html_parsing',
            }
        }],
        'web_claims': [],  # Could be enhanced by scraping profile pages
        'extraction_metadata': {
            'heritage_relevant': staff_entry.get('heritage_relevant', False),
            'heritage_type': staff_entry.get('heritage_type'),
            'name_type': staff_entry.get('name_type', 'unknown'),
        }
    }
    # Add name correction if present
    if 'name_correction' in staff_entry:
        person_entity['extraction_metadata']['name_correction'] = staff_entry['name_correction']
    # Write entity file
    with open(entity_path, 'w', encoding='utf-8') as f:
        json.dump(person_entity, f, indent=2, ensure_ascii=False)
    return entity_path
def _stored_custodian_name(data: dict) -> str:
    """Return a custodian YAML's name as a string.

    Tolerates legacy files where 'custodian_name' was accidentally written
    as a mapping (a duplicate-key bug in an earlier version of this
    script); in that case the mapping's 'emic_name' is used.
    """
    value = data.get('custodian_name', '')
    if isinstance(value, dict):
        value = value.get('emic_name') or ''
    return value if isinstance(value, str) else ''


def _build_staff_section(staff_data: dict) -> dict:
    """Build the 'staff' mapping written into custodian YAML files."""
    return {
        'provenance': {
            'source_type': 'linkedin_company_people_page_html',
            'registered_timestamp': staff_data['source_metadata']['registered_timestamp'],
            'registration_method': 'html_parsing',
            'total_staff_extracted': len(staff_data['staff']),
        },
        'staff_list': [
            {
                'staff_id': s.get('staff_id'),
                'person_name': s.get('name'),
                'person_profile_path': f"data/custodian/person/entity/{s.get('linkedin_slug', '')}_*.json",
                'role_title': s.get('headline', ''),
                'heritage_relevant': s.get('heritage_relevant', False),
                'heritage_type': s.get('heritage_type'),
            }
            for s in staff_data['staff']
            if s.get('linkedin_slug')  # Only include staff with profiles
        ]
    }


def find_or_create_custodian(custodian_name: str, custodian_dir: Path, staff_data: dict) -> tuple[Path, bool]:
    """
    Find existing custodian YAML file or create new one.

    Matching is case-insensitive on the stored 'custodian_name'.
    Returns (file_path, is_new)
    """
    # Try to find existing custodian by name (case-insensitive)
    existing_file = None
    for custodian_file in custodian_dir.glob('*.yaml'):
        try:
            with open(custodian_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception:  # unreadable/invalid YAML: skip it, keep scanning
            continue
        if data and _stored_custodian_name(data).lower() == custodian_name.lower():
            existing_file = custodian_file
            break

    if existing_file is not None:
        # Update existing file: refresh staff section and canonical name.
        with open(existing_file, 'r', encoding='utf-8') as f:
            custodian_data = yaml.safe_load(f) or {}
        custodian_data['staff'] = _build_staff_section(staff_data)
        custodian_data['custodian_name'] = custodian_name
        with open(existing_file, 'w', encoding='utf-8') as f:
            yaml.dump(custodian_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        return (existing_file, False)

    # Create new custodian file.
    # Generate placeholder GHCID (requires geographic research).
    slug = re.sub(r'[^a-z0-9]+', '-', custodian_name.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')
    slug = slug[:30]  # Limit length
    placeholder_ghcid = f"NL-XX-XXX-PENDING-{slug.upper()}"
    custodian_data = {
        'ghcid_current': placeholder_ghcid,
        # BUG FIX: a second 'custodian_name' key used to follow below with a
        # mapping value, silently overwriting this string.  That made the
        # case-insensitive lookup above raise on re-runs (swallowed by the
        # old bare except), so duplicate custodian files were created.
        'custodian_name': custodian_name,
        'institution_type': 'MUSEUM',  # Default, will be refined based on staff
        'custodian_name_detail': {
            'emic_name': custodian_name,
            'english_name': None,
            'name_verified': True,
            'name_source': 'linkedin_html_h1',
        },
        'staff': _build_staff_section(staff_data),
        'provenance': {
            'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Comprehensive batch processing with HTML H1 name extraction',
            'confidence_score': 0.85,
            'notes': f'Staff extracted from LinkedIn company People page. Location research needed for GHCID. Total staff: {len(staff_data["staff"])}',
        }
    }
    # Determine institution type based on staff heritage analysis
    heritage_types = staff_data['staff_analysis'].get('staff_by_heritage_type', {})
    if heritage_types:
        # Find most common heritage type (Counter treats the dict as
        # type-code -> count, so most_common picks the dominant code).
        most_common = Counter(heritage_types).most_common(1)
        if most_common:
            type_code = most_common[0][0]
            type_map = {
                'M': 'MUSEUM',
                'L': 'LIBRARY',
                'A': 'ARCHIVE',
                'G': 'GALLERY',
                'R': 'RESEARCH_CENTER',
                'E': 'EDUCATION_PROVIDER',
                'S': 'COLLECTING_SOCIETY',
                'D': 'DIGITAL_PLATFORM',
            }
            if type_code in type_map:
                custodian_data['institution_type'] = type_map[type_code]
    # Create new file
    custodian_file = custodian_dir / f"{placeholder_ghcid}.yaml"
    with open(custodian_file, 'w', encoding='utf-8') as f:
        yaml.dump(custodian_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return (custodian_file, True)
def main() -> int:
    """Command-line entry point for the comprehensive batch run.

    Parses arguments, processes every *.html file in the input directory,
    writes staff JSON / person entity / custodian YAML outputs, and prints
    a summary.  Always returns 0; per-file failures are only counted.
    """
    parser = argparse.ArgumentParser(
        description='Comprehensive LinkedIn batch processing - fixes name extraction and creates full dataset'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                        help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                        help='Output directory for staff JSON files')
    parser.add_argument('--entity-dir', type=Path, required=True,
                        help='Output directory for person entity files')
    parser.add_argument('--custodian-dir', type=Path, required=True,
                        help='Directory containing custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit processing to first N files (0 = all)')
    args = parser.parse_args()
    if not args.input_dir.exists():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)
    # Create output directories
    args.output_dir.mkdir(parents=True, exist_ok=True)
    args.entity_dir.mkdir(parents=True, exist_ok=True)
    args.custodian_dir.mkdir(parents=True, exist_ok=True)
    # Get all HTML files (sorted for deterministic processing order)
    html_files = sorted(args.input_dir.glob('*.html'))
    if args.limit > 0:
        html_files = html_files[:args.limit]
    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Staff output directory: {args.output_dir}")
    print(f"Entity output directory: {args.entity_dir}")
    print(f"Custodian directory: {args.custodian_dir}")
    # Statistics accumulated across all files
    stats = {
        'total_html': len(html_files),
        'processed': 0,
        'errors': 0,
        'with_staff': 0,
        'total_staff': 0,
        'entities_created': 0,
        'custodians_updated': 0,
        'custodians_created': 0,
        'name_fixes': 0,  # Files where H1 name differs from filename
    }
    for i, html_path in enumerate(html_files, 1):
        # Any failure in one file is logged and counted; the batch continues.
        try:
            print(f"[{i}/{len(html_files)}] Processing: {html_path.name}")
            # Step 1: Parse HTML and extract staff
            result = process_html_file(html_path, args.output_dir)
            # Generate staff JSON filename ({slug}_staff_{timestamp}.json)
            slug = result['custodian_metadata']['custodian_slug']
            timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
            staff_filename = args.output_dir / f"{slug}_staff_{timestamp}.json"
            # Save staff JSON
            with open(staff_filename, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            stats['processed'] += 1
            # Step 2: Create person entity files (only for entries that
            # actually have a LinkedIn profile slug)
            staff_list = result.get('staff', [])
            staff_with_profiles = [s for s in staff_list if s.get('linkedin_slug')]
            if staff_with_profiles:
                custodian_name = result['custodian_metadata'].get('custodian_name')
                for staff_entry in staff_with_profiles:
                    entity_path = create_person_entity(
                        staff_entry,
                        custodian_name,
                        html_path.name,
                        args.entity_dir
                    )
                    if entity_path:
                        stats['entities_created'] += 1
            # Step 3: Create or update custodian YAML
            if staff_with_profiles:
                custodian_file, is_new = find_or_create_custodian(
                    result['custodian_metadata'].get('custodian_name'),
                    args.custodian_dir,
                    result
                )
                if is_new:
                    stats['custodians_created'] += 1
                else:
                    stats['custodians_updated'] += 1
                stats['with_staff'] += 1
                stats['total_staff'] += len(staff_with_profiles)
            # Check if name was fixed (H1 different from filename)
            filename_name = clean_filename_to_name(html_path.name)
            html_name = result['custodian_metadata'].get('custodian_name')
            if html_name and filename_name and html_name != filename_name:
                stats['name_fixes'] += 1
                print(f" Name fixed: '{filename_name}' -> '{html_name}'")
        except Exception as e:
            print(f"Error processing {html_path.name}: {e}", file=sys.stderr)
            stats['errors'] += 1
    # Print summary
    print("\n" + "="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"\nStatistics:")
    print(f" Total HTML files: {stats['total_html']}")
    print(f" Successfully processed: {stats['processed']}")
    print(f" Errors: {stats['errors']}")
    print(f" Institutions with staff: {stats['with_staff']}")
    print(f" Total staff extracted: {stats['total_staff']}")
    print(f" Person entities created: {stats['entities_created']}")
    print(f" Custodians updated: {stats['custodians_updated']}")
    print(f" Custodians created: {stats['custodians_created']}")
    print(f" Name fixes applied: {stats['name_fixes']}")
    print(f"\nOutput directories:")
    print(f" Staff JSON files: {args.output_dir}")
    print(f" Person entity files: {args.entity_dir}")
    print(f" Custodian YAML files: {args.custodian_dir}")
    return 0
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())