# glam/scripts/batch_parse_linkedin_manual_v2.py
# 2025-12-30 03:43:31 +01:00
# 267 lines
# 9.5 KiB
# Python
#!/usr/bin/env python3
"""
Batch processor for LinkedIn HTML files - DATA COLLECTION PHASE
This script:
1. Processes all HTML files in manual directory
2. Runs parse_linkedin_html.py for each
3. Creates staff JSON files in bu/ directory
4. Generates summary report
NOTE: Name extraction refinement (using full "about" description instead
of abbreviated names) is a POST-PROCESSING step.
"""
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from collections import defaultdict
# Directory paths
# NOTE(review): input lives on an external volume while output is local —
# confirm the KINGSTON volume is mounted before running.
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# Destination for the per-institution staff JSON files produced per HTML page.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/bu")
# Machine-readable run report written at the end of main().
SUMMARY_FILE = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/batch_processing_summary.json")
def extract_institution_name_from_filename(filename: str) -> str:
    """Extract institution name from LinkedIn People HTML filename.

    Removes:
    - Leading number in parentheses: (10), (15), (16)
    - Trailing " People _ LinkedIn"
    - Trailing ".html"
    - Leading commas
    - Leading/trailing underscores

    Args:
        filename: File name (or full path) of the saved LinkedIn page.

    Returns:
        Clean institution name; may be '' if nothing remains after cleaning.
    """
    name = Path(filename).name
    # Strip only a TRAILING ".html" extension. The previous
    # str.replace('.html', '') also deleted ".html" occurring inside the
    # institution name itself.
    name = re.sub(r'\.html$', '', name)
    # Browser "save page" suffix, optionally glued on with an underscore.
    name = re.sub(r'_?People _ LinkedIn$', '', name)
    # Duplicate-download counter the browser prepends, e.g. "(10) ".
    name = re.sub(r'^\(\d+\)\s*', '', name)
    name = re.sub(r'^,\s*', '', name)
    # Collapse whitespace runs before trimming stray underscores.
    name = re.sub(r'\s+', ' ', name).strip()
    name = name.strip('_')
    return name.strip()
def generate_slug_from_name(name: str) -> str:
    """Turn an institution name into a lowercase, hyphen-separated slug.

    Characters other than ASCII lowercase letters, digits, whitespace and
    hyphens are dropped after lowercasing; runs of whitespace/hyphens
    collapse to a single hyphen, and edge hyphens are trimmed.
    """
    cleaned = re.sub(r'[^a-z0-9\s-]', '', name.lower())
    hyphenated = re.sub(r'[\s-]+', '-', cleaned)
    return hyphenated.strip('-')
def process_single_file(html_file: Path, index: int, total: int) -> dict:
    """Process a single LinkedIn HTML export and return a result summary.

    Runs parse_linkedin_html.py in a subprocess on *html_file*, writing a
    staff JSON into OUTPUT_DIR, then re-reads that JSON to extract the
    statistics reported back to the caller.

    Args:
        html_file: Path to one "<Institution> People _ LinkedIn.html" file.
        index: 1-based position of this file in the batch (currently unused;
            kept for progress-reporting callers).
        total: Total number of files in the batch (currently unused).

    Returns:
        A dict whose 'status' is one of 'success', 'skipped', 'error',
        'parsed_error', 'timeout', or 'exception'. Success results carry a
        'stats' sub-dict; failures carry a 'reason' string.
    """
    # Single source of truth for the subprocess timeout, so the timeout
    # error message below can never drift out of sync with the real limit.
    parse_timeout_s = 30
    institution_name = extract_institution_name_from_filename(html_file.name)
    if not institution_name or len(institution_name) < 3:
        # Names shorter than 3 chars are almost certainly extraction noise.
        return {
            'status': 'skipped',
            'file': html_file.name,
            'reason': f'Invalid name extracted: "{institution_name}"'
        }
    slug = generate_slug_from_name(institution_name)
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = OUTPUT_DIR / f"{slug}_staff_{timestamp}.json"
    try:
        result = subprocess.run(
            [
                sys.executable,
                "/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
                str(html_file),
                "--custodian-name", institution_name,
                "--custodian-slug", slug,
                "--output", str(output_file)
            ],
            capture_output=True,
            text=True,
            timeout=parse_timeout_s
        )
        if result.returncode != 0:
            return {
                'status': 'error',
                'file': html_file.name,
                'institution_name': institution_name,
                # Cap stderr so the summary JSON stays readable.
                'reason': result.stderr[:200] if result.stderr else 'Unknown error',
                'returncode': result.returncode
            }
        # Load the parser's output to extract statistics
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            staff_analysis = data.get('staff_analysis', {})
            custodian_metadata = data.get('custodian_metadata', {})
            source_metadata = data.get('source_metadata', {})
            return {
                'status': 'success',
                'file': html_file.name,
                'institution_name': institution_name,
                'slug': slug,
                'output_file': output_file.name,
                'stats': {
                    'total_staff': staff_analysis.get('total_staff_extracted', 0),
                    'with_linkedin_url': staff_analysis.get('with_linkedin_url', 0),
                    'with_alternate_profiles': staff_analysis.get('with_alternate_profiles', 0),
                    'anonymous_members': staff_analysis.get('anonymous_members', 0),
                    'heritage_relevant': staff_analysis.get('heritage_relevant_count', 0),
                    'heritage_types': staff_analysis.get('staff_by_heritage_type', {}),
                    'associated_members': custodian_metadata.get('associated_members', 0),
                    'pymk_filtered': source_metadata.get('pymk_cards_filtered', 0),
                    'duplicates_merged': source_metadata.get('duplicate_profiles_merged', 0),
                    'industry': custodian_metadata.get('industry', 'Unknown'),
                    'follower_count': custodian_metadata.get('follower_count', 'Unknown'),
                }
            }
        except Exception as e:
            # Broad on purpose: a missing/malformed output file (or a JSON
            # payload of an unexpected shape) is reported as a per-file
            # failure rather than aborting the whole batch.
            return {
                'status': 'parsed_error',
                'file': html_file.name,
                'institution_name': institution_name,
                'reason': f'Failed to load output JSON: {e}'
            }
    except subprocess.TimeoutExpired:
        return {
            'status': 'timeout',
            'file': html_file.name,
            'institution_name': institution_name,
            'reason': f'Processing timeout ({parse_timeout_s}s)'
        }
    except Exception as e:
        # Catch-all safety net so one bad file never kills the batch.
        return {
            'status': 'exception',
            'file': html_file.name,
            'institution_name': institution_name,
            'reason': str(e)
        }
def main():
    """Batch-process every HTML file in MANUAL_DIR and write a summary.

    Globs MANUAL_DIR for "*.html", runs process_single_file() on each file,
    aggregates counters, prints a console report, and saves the per-file
    results plus totals to SUMMARY_FILE.

    Returns:
        0 always; per-file failures are recorded in the summary rather than
        reflected in the exit code.
    """
    # Fix: mkdir was previously called twice back-to-back; once is enough.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    html_files = sorted(MANUAL_DIR.glob("*.html"))
    print("="*70)
    print("LINKEDIN MANUAL DIRECTORY - BATCH DATA COLLECTION")
    print("="*70)
    print(f"\nInput directory: {MANUAL_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Total HTML files to process: {len(html_files)}")
    print(f"\nStarting batch processing at: {datetime.now(timezone.utc).isoformat()}")
    print()
    # Track statistics
    all_results = []
    stats = {
        'total_files': len(html_files),
        'processed': 0,
        'errors': 0,
        'skipped': 0,
        'total_staff': 0,
        'with_linkedin_url': 0,
        'heritage_relevant': 0,
        'anonymous_members': 0,
        # defaultdict serializes as a plain JSON object below.
        'custodians_by_heritage_type': defaultdict(int),
        'errors_list': [],
    }
    # Process each file
    for i, html_file in enumerate(html_files, 1):
        result = process_single_file(html_file, i, len(html_files))
        all_results.append(result)
        if result['status'] == 'success':
            stats['processed'] += 1
            stats_result = result.get('stats', {})
            stats['total_staff'] += stats_result.get('total_staff', 0)
            stats['with_linkedin_url'] += stats_result.get('with_linkedin_url', 0)
            stats['heritage_relevant'] += stats_result.get('heritage_relevant', 0)
            stats['anonymous_members'] += stats_result.get('anonymous_members', 0)
            # Track heritage types
            heritage_types = stats_result.get('heritage_types', {})
            for htype, count in heritage_types.items():
                stats['custodians_by_heritage_type'][htype] += count
        elif result['status'] in ['error', 'timeout', 'exception', 'parsed_error']:
            stats['errors'] += 1
            stats['errors_list'].append({
                'file': result['file'],
                'status': result['status'],
                'reason': result.get('reason', '')
            })
        elif result['status'] == 'skipped':
            stats['skipped'] += 1
        # Progress reporting: one line every 100 files to keep output short.
        if i % 100 == 0:
            progress = (i / len(html_files)) * 100
            print(f"[{i:4d}/{len(html_files)}] {progress:5.1f}% - {result['status']:7s} - {result.get('institution_name', 'N/A')}")
    # Final report
    print()
    print("="*70)
    print("BATCH PROCESSING COMPLETE")
    print("="*70)
    print(f"\nTotal files: {stats['total_files']}")
    print(f"Successfully processed: {stats['processed']}")
    print(f"Skipped: {stats['skipped']}")
    print(f"Errors: {stats['errors']}")
    print()
    print(f"Total staff extracted: {stats['total_staff']}")
    print(f"Staff with LinkedIn URLs: {stats['with_linkedin_url']}")
    print(f"Heritage-relevant staff: {stats['heritage_relevant']}")
    print(f"Anonymous members: {stats['anonymous_members']}")
    print()
    print("Staff by heritage type:")
    for htype in sorted(stats['custodians_by_heritage_type'].keys()):
        count = stats['custodians_by_heritage_type'][htype]
        print(f"  {htype}: {count}")
    print()
    if stats['errors'] > 0:
        # Fix: dropped a needless f-prefix on this placeholder-free string.
        print("\nFirst 20 errors:")
        for err in stats['errors_list'][:20]:
            print(f"  [{err['status']:7s}] {err['file']} - {err['reason'][:80]}")
        if len(stats['errors_list']) > 20:
            print(f"  ... and {len(stats['errors_list']) - 20} more errors")
        print()
    # Save detailed summary
    summary = {
        'processing_timestamp': datetime.now(timezone.utc).isoformat(),
        'summary_stats': stats,
        'all_results': all_results,
    }
    with open(SUMMARY_FILE, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
    print(f"\nDetailed summary saved to: {SUMMARY_FILE}")
    print(f"Staff JSON files saved to: {OUTPUT_DIR}")
    print("="*70)
    return 0
# Script entry point: exit code is main()'s return value (always 0;
# per-file failures are recorded in the summary JSON, not the exit status).
if __name__ == '__main__':
    sys.exit(main())