glam/scripts/extract_about_page_data.py
2025-12-16 20:27:39 +01:00

277 lines
8.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Extract company metadata from LinkedIn About page HTML files.
Extracts:
- Website URL
- Industry
- Company size/employee count
- Headquarters location
- Description (placeholder — always emitted as null; extraction not yet implemented)
Usage:
python scripts/extract_about_page_data.py [--output-dir DIR]
"""
import argparse
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from html.parser import HTMLParser
def extract_custodian_name(filename: str) -> str | None:
"""Extract custodian name from LinkedIn About filename.
Filename format: "(N) Custodian Name_ About _ LinkedIn.html"
"""
match = re.match(r'^\(\d+\)\s*(.+?)_\s*About\s*_\s*LinkedIn\.html$', filename)
if match:
name = match.group(1).strip()
# Clean up underscores that LinkedIn uses instead of colons
name = name.replace('_ ', ': ').replace(' _', ':')
return name
return None
def generate_slug(name: str) -> str:
    """Return a URL-safe, ASCII-only slug derived from *name*.

    Lowercases the input, strips diacritics via NFD decomposition, then
    collapses every run of non-alphanumeric characters into a single hyphen.
    """
    decomposed = unicodedata.normalize('NFD', name.lower())
    # Drop the combining marks (Unicode category 'Mn') left by decomposition,
    # turning e.g. "é" into plain "e".
    ascii_only = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    hyphenated = re.sub(r'[^a-z0-9]+', '-', ascii_only)
    collapsed = re.sub(r'-+', '-', hyphenated)
    return collapsed.strip('-')
def is_valid_url(url: str) -> bool:
    """Return True when *url* looks like a clean, plausible http(s) URL.

    Rejects empty strings, URLs containing newline/carriage-return characters
    (real or as literal escape text), over-long URLs, and anything that is not
    a well-formed http(s) URL with a plausible TLD.
    """
    if not url:
        return False
    # Scraped HTML sometimes leaks literal "\n" escape text into URL strings,
    # so the two-character sequences are rejected alongside the real characters.
    for junk in ('\n', '\\n', '\r', '\\r'):
        if junk in url:
            return False
    # Anything this long is almost certainly tracking garbage, not a homepage.
    if len(url) > 150:
        return False
    well_formed = re.match(
        r'^https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(/[^\s<>"\']*)?$',
        url,
    )
    return well_formed is not None
def extract_website_url(html_content: str) -> str | None:
    """Extract the company website URL from About page HTML.

    Tries three strategies in order of reliability:
    1. href attributes in anchor tags pointing to external sites
    2. JSON patterns with website URLs ("websiteUrl", "website", ...)
    3. General URL extraction with strict validation (last resort)

    Returns the first acceptable URL with its trailing slash stripped,
    or None when nothing survives the filters.
    """
    # URLs to exclude (LinkedIn, CDN, social, analytics, etc.)
    exclude_patterns = [
        'linkedin', 'licdn', 'w3.org', 'bing.com', 'google.com',
        'facebook.com', 'twitter.com', 'instagram.com', 'youtube.com',
        'tiktok.com', 'pinterest.com', 'tumblr.com', 'reddit.com',
        'schema.org', 'cloudflare', 'analytics', 'tracking', 'leerob.com',
        'cdn.', '.svg', '.png', '.jpg', '.gif', '.css', '.js', '.woff',
        'fonts.googleapis', 'googletagmanager', 'doubleclick', 'adsense',
        'microsoft.com', 'apple.com', 'amazon.com', 'github.com',
        'openai.com', 'anthropic.com',
    ]

    def is_excluded(url: str) -> bool:
        url_lower = url.lower()
        return any(pattern in url_lower for pattern in exclude_patterns)

    def accept(url: str) -> str | None:
        """Return the cleaned URL when it passes all filters, else None."""
        if is_excluded(url):
            return None
        # Clean trailing slash for consistency across sources.
        url = url.rstrip('/')
        return url if is_valid_url(url) else None

    # Strategy 1: href attributes in anchor tags (most reliable).
    for url in re.findall(r'href="(https?://[^"]+)"', html_content):
        cleaned = accept(url)
        if cleaned:
            return cleaned

    # Strategy 2: JSON patterns like "websiteUrl":"..." or "website":"...".
    json_url_patterns = [
        r'"websiteUrl"\s*:\s*"(https?://[^"]+)"',
        r'"website"\s*:\s*"(https?://[^"]+)"',
        r'"companyUrl"\s*:\s*"(https?://[^"]+)"',
        r'"homepageUrl"\s*:\s*"(https?://[^"]+)"',
    ]
    for pattern in json_url_patterns:
        for url in re.findall(pattern, html_content):
            cleaned = accept(url)
            if cleaned:
                return cleaned

    # Strategy 3: general URL extraction (last resort, strict validation).
    url_pattern = re.compile(
        r'https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?'
    )
    for url in url_pattern.findall(html_content):
        # Skip direct links to static assets.
        if url.lower().endswith(
            ('.svg', '.png', '.jpg', '.jpeg', '.gif', '.css', '.js', '.woff', '.woff2')
        ):
            continue
        cleaned = accept(url)
        if cleaned:
            # NOTE: the previous version collected all candidates and then ran
            # a "first unique URL" loop over a fresh `seen` set — which always
            # returned the first candidate anyway. Returning directly here is
            # the same behavior without the dead dedup code.
            return cleaned

    return None
def extract_about_data(html_file: Path) -> dict[str, Any] | None:
    """Extract metadata from a LinkedIn About page HTML file.

    Returns a record dict with the custodian name/slug, any metadata found
    in the page's embedded JSON, and source provenance — or None when the
    filename does not parse or the file cannot be read.
    """
    custodian_name = extract_custodian_name(html_file.name)
    if custodian_name is None:
        return None

    try:
        with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()
    except Exception as e:
        print(f"Error reading {html_file}: {e}")
        return None

    def first_group(pattern: str) -> str | None:
        """First capture group of *pattern* in the page, or None."""
        m = re.search(pattern, html_content)
        return m.group(1) if m else None

    # Metadata embedded as JSON key/value pairs in the page, when present.
    industry = first_group(r'"industry"[:\s]*"([^"]+)"')
    headquarters = first_group(r'"headquarters"[:\s]*"([^"]+)"')
    raw_count = first_group(r'"employeeCount"[:\s]*"?(\d+)')
    employee_count = int(raw_count) if raw_count is not None else None

    return {
        'custodian_name': custodian_name,
        'custodian_slug': generate_slug(custodian_name),
        'website_url': extract_website_url(html_content),
        'industry': industry,
        'employee_count': employee_count,
        'headquarters': headquarters,
        # Description extraction is not implemented; key kept for schema parity.
        'description': None,
        'source_metadata': {
            'source_type': 'linkedin_company_about_page_html',
            'source_file': html_file.name,
            'extraction_timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        },
    }
def main():
    """CLI entry point: scan the manual dir, extract metadata, dump JSON.

    Returns 0 on success; exits 1 when the input directory is missing.
    """
    parser = argparse.ArgumentParser(
        description='Extract metadata from LinkedIn About page HTML files'
    )
    parser.add_argument(
        '--manual-dir',
        type=Path,
        default=Path('data/custodian/person/affiliated/manual'),
        help='Directory containing LinkedIn HTML files',
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        default=Path('data/custodian/person/affiliated/about_data'),
        help='Directory to save extracted data',
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of files to process',
    )
    args = parser.parse_args()

    if not args.manual_dir.exists():
        print(f"Error: Manual directory not found: {args.manual_dir}", file=sys.stderr)
        sys.exit(1)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Find all About page HTML files
    about_files = sorted(args.manual_dir.glob('*About*LinkedIn.html'))
    print(f"Found {len(about_files)} About page HTML files")
    if args.limit:
        about_files = about_files[:args.limit]
        print(f"Limited to {len(about_files)} files")

    results = []
    websites_found = 0
    for html_file in about_files:
        data = extract_about_data(html_file)
        if not data:
            # Filename did not parse or the file was unreadable; skip silently
            # (extract_about_data already printed any read error).
            continue
        results.append(data)
        if data['website_url']:
            websites_found += 1
            print(f"{data['custodian_name']}: {data['website_url']}")
        else:
            print(f"{data['custodian_name']}: No website found")

    # Save all results
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = args.output_dir / f'about_data_{timestamp}.json'
    payload = {
        'timestamp': timestamp,
        'total_processed': len(results),
        'websites_found': websites_found,
        'data': results,
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*60}")
    print(f"EXTRACTION COMPLETE")
    print(f" Total processed: {len(results)}")
    print(f" Websites found: {websites_found}")
    print(f" Output: {output_file}")
    return 0
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == '__main__':
    sys.exit(main())