#!/usr/bin/env python3
|
|
"""
|
|
Fast LinkedIn H1 Name Extraction
|
|
|
|
This is a FAST version that:
|
|
1. Extracts H1 institution names from HTML files
|
|
2. Cleans filenames properly (removes macOS resource forks, periods, parentheses)
|
|
3. Creates custodian YAML files with basic metadata
|
|
4. Does NOT extract detailed staff (too slow for 3335 files)
|
|
|
|
This solves the critical issues:
|
|
- Name extraction from H1 tags (not filenames)
|
|
- Proper filename cleaning
|
|
|
|
Usage:
|
|
python scripts/linkedin_h1_fast.py \
|
|
--input-dir /path/to/html/files \
|
|
--output-dir data/custodian/
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
|
|
def clean_filename_to_slug(filename: str) -> str:
    """
    Turn a raw LinkedIn HTML filename into a URL-safe slug.

    Strips, in order:
    - the " People _ LinkedIn.html" / ".html" suffixes
    - a macOS resource-fork prefix ("._")
    - a leading "(N)" download counter, optionally decorated with "." / "_"
    - surrounding underscores and spaces

    then lowercases and hyphenates whatever remains.

    Examples:
        "._(15) Gemeente Enkhuizen_ People _ LinkedIn.html"
            -> "gemeente-enkhuizen"
        "(7) ADVN _ archief voor nationale bewegingen_ People _ LinkedIn.html"
            -> "advn-archief-voor-nationale-bewegingen"
    """
    # Drop the LinkedIn page suffixes.
    base = filename.replace(' People _ LinkedIn.html', '').replace('.html', '')

    # Drop a macOS resource-fork prefix when present.
    base = base[2:] if base.startswith('._') else base

    # Drop a leading "(N)" counter, with any "."/"_" decoration before it.
    base = re.sub(r'^\.?\_?\(\d+\)\s*', '', base)
    base = re.sub(r'^\._*\(\d+\)\s*', '', base)

    # Trim stray underscores/spaces, collapse internal whitespace runs.
    base = re.sub(r'\s+', ' ', base.strip('_ '))

    # Lowercase; replace every non-alphanumeric run with a single hyphen.
    slug = re.sub(r'[^a-z0-9]+', '-', base.lower())
    return re.sub(r'-+', '-', slug).strip('-')
|
|
|
|
|
|
def extract_h1_name(html_content: str) -> Optional[str]:
    """
    Pull the institution name out of the page's first H1 tag.

    LinkedIn renders the H1 as "Organization Name | LinkedIn"; the part
    before the " | " separator is the name we want.

    Returns None when no H1 is present or its text is empty.
    """
    heading = BeautifulSoup(html_content, 'html.parser').find('h1')
    if heading is None:
        return None

    text = heading.get_text().strip()

    # Keep only the part before " | LinkedIn" when the separator exists.
    name = text.split(' | ')[0].strip() if ' | ' in text else text

    # Normalize any leftover pipes and whitespace runs into single spaces.
    name = re.sub(r'\s*\|\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)

    return name or None
|
|
|
|
|
|
def process_single_file(html_path: Path, output_dir: Path) -> dict:
    """
    Process a single LinkedIn "People" HTML page into a custodian record.

    The institution name is taken from the page's H1 tag when possible,
    otherwise derived from the (cleaned) filename. A handful of cheap
    regex probes collect follower / member / profile-card counts; no full
    DOM traversal is performed (this is the fast path).

    Args:
        html_path: Path of the saved LinkedIn HTML file to read.
        output_dir: Destination directory. Unused here — the caller writes
            the YAML file — kept for interface compatibility.

    Returns:
        On success: {'status': 'success', 'slug', 'filename',
        'custodian_name', 'custodian_data'}.
        On read failure: {'status': 'error', 'filename', 'error'}.
    """
    try:
        with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except OSError as e:
        # BUG FIX: previously a read failure raised out of this function,
        # so the caller's 'error'-result branch was unreachable. Report it
        # as a structured error result instead.
        return {'status': 'error', 'filename': html_path.name, 'error': str(e)}

    # Prefer the name embedded in the page itself.
    h1_name = extract_h1_name(html_content)
    name_from_h1 = bool(h1_name)

    if not name_from_h1:
        # Fallback: derive a readable name from the filename.
        h1_name = _fallback_name_from_filename(html_path.name)

    # URL-safe slug used by the caller for the output YAML filename.
    slug = clean_filename_to_slug(html_path.name)

    # Cheap metadata probes (plain regex; no DOM walk for speed).
    follower_count = ''
    follower_match = re.search(r'(\d+K?)\s+followers?', html_content, re.IGNORECASE)
    if follower_match:
        follower_count = follower_match.group(1)

    associated_members = 0
    member_match = re.search(r'(\d+)\s+associated\s+members?', html_content, re.IGNORECASE)
    if member_match:
        associated_members = int(member_match.group(1))

    # Rough staff estimate: count profile-card markers in the markup.
    profile_count = len(re.findall(r'org-people-profile-card', html_content))

    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # BUG FIX: the original dict literal declared 'custodian_name' twice
    # (first as a plain string, then as a mapping) — Python silently kept
    # only the mapping, so the duplicate is removed. 'name_source' and
    # 'name_verified' were also hard-coded to the H1 values even when the
    # filename fallback was used, which made the caller's
    # H1-vs-filename statistics meaningless; they now reflect reality.
    custodian_data = {
        'ghcid_current': f"NL-XX-XXX-PENDING-{slug.upper()}",
        'institution_type': 'MUSEUM',  # Default, can be refined later
        'custodian_name': {
            'emic_name': h1_name,
            'english_name': None,
            'name_verified': name_from_h1,
            'name_source': 'linkedin_html_h1' if name_from_h1 else 'filename_fallback',
        },
        'linkedin_enrichment': {
            'source_file': html_path.name,
            'extraction_date': timestamp,
            'follower_count': follower_count,
            'associated_members': associated_members,
            'profile_cards_detected': profile_count,
            'source_type': 'linkedin_company_people_page_html',
            'extraction_method': 'h1_name_extraction_only',
        },
        'provenance': {
            'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': timestamp,
            'extraction_method': 'Fast H1 name extraction',
            'confidence_score': 0.90,
            'notes': f'H1 institution name extracted from HTML. Profile cards detected: {profile_count}. Detailed staff extraction not performed due to performance constraints.',
        },
    }

    return {
        'status': 'success',
        'slug': slug,
        'filename': html_path.name,
        'custodian_name': h1_name,
        'custodian_data': custodian_data,
    }


def _fallback_name_from_filename(filename: str) -> str:
    """
    Derive a human-readable institution name from an HTML filename.

    Mirrors the prefix/suffix stripping done by clean_filename_to_slug but
    keeps the original casing and spacing (no slugification).
    """
    name = filename.replace(' People _ LinkedIn.html', '')
    name = name.replace('.html', '')
    if name.startswith('._'):
        name = name[2:]
    name = re.sub(r'^\.?\_?\(\d+\)\s*', '', name)
    name = re.sub(r'^\._*\(\d+\)\s*', '', name)
    return re.sub(r'\s+', ' ', name).strip()
|
|
|
|
|
|
def main():
    """
    CLI entry point: scan a directory of LinkedIn HTML files, extract the
    H1 institution name for each, write one custodian YAML per file into
    --output-dir, and save a JSON processing report.

    Returns 0 on completion; per-file failures are counted, not fatal.
    """
    parser = argparse.ArgumentParser(
        description='Fast LinkedIn H1 name extraction - solves name extraction issues'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                        help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                        help='Output directory for custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit processing to first N files (0 = all)')

    args = parser.parse_args()

    # Validate the input directory before doing any work.
    if not args.input_dir.exists():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Get all HTML files (sorted for a deterministic processing order).
    html_files = sorted(args.input_dir.glob('*.html'))

    if args.limit > 0:
        html_files = html_files[:args.limit]

    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"This will extract H1 names and create custodian YAMLs")
    # NOTE(review): estimate assumes ~1 second per file — confirm.
    print(f"Estimated time: ~{len(html_files)} seconds (~{len(html_files)//60} minutes)")

    # Statistics accumulated across the whole run.
    stats = {
        'total': len(html_files),
        'success': 0,
        'errors': 0,
        'name_from_h1': 0,        # name taken from the HTML H1 tag
        'name_from_filename': 0,  # name derived from the filename fallback
        'with_profiles': 0,       # NOTE(review): never incremented below — dead stat?
        'total_profiles_detected': 0,
    }

    # Process files one by one; a single bad file must not abort the batch.
    for i, html_path in enumerate(html_files, 1):
        try:
            # Lightweight progress indicator (overwrites the same line).
            if i % 100 == 0:
                print(f"Progress: [{i}/{len(html_files)}]", end='\r')

            result = process_single_file(html_path, args.output_dir)

            if result['status'] == 'success':
                stats['success'] += 1
                stats['total_profiles_detected'] += result['custodian_data'].get('linkedin_enrichment', {}).get('profile_cards_detected', 0)

                # Save custodian YAML (one file per institution, keyed by slug;
                # same-slug files overwrite each other).
                custodian_file = args.output_dir / f"{result['slug']}.yaml"
                with open(custodian_file, 'w', encoding='utf-8') as f:
                    yaml.dump(result['custodian_data'], f, allow_unicode=True, default_flow_style=False, sort_keys=False)

                # Track name source
                if 'linkedin_html_h1' in result['custodian_data'].get('custodian_name', {}).get('name_source', ''):
                    stats['name_from_h1'] += 1
                else:
                    stats['name_from_filename'] += 1

            elif result['status'] == 'error':
                stats['errors'] += 1
                print(f"Error: {result['filename']}: {result.get('error')}", file=sys.stderr)

        except Exception as e:
            # Catch-all keeps the batch running on unexpected per-file errors.
            stats['errors'] += 1
            print(f"Error: {html_path.name}: {e}", file=sys.stderr)

    print(f"\nProcessing complete!")

    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print(f"\nStatistics:")
    print(f" Total HTML files: {stats['total']}")
    print(f" Successfully processed: {stats['success']}")
    print(f" Errors: {stats['errors']}")
    print(f" Names from H1: {stats['name_from_h1']}")
    print(f" Names from filename: {stats['name_from_filename']}")
    print(f" Total profiles detected: {stats['total_profiles_detected']}")
    print(f"\nOutput directory: {args.output_dir}")

    # Save processing report (machine-readable summary of this run).
    report = {
        'processing_date': datetime.now(timezone.utc).isoformat(),
        'input_directory': str(args.input_dir),
        'output_directory': str(args.output_dir),
        'statistics': stats,
    }

    # NOTE(review): report path is relative to the current working directory,
    # not to --output-dir — confirm that is intentional.
    report_file = Path('reports/linkedin_h1_fast_report.json')
    report_file.parent.mkdir(parents=True, exist_ok=True)
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {report_file}")

    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return code as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|