glam/scripts/extract_linkedin_urls_from_html.py

#!/usr/bin/env python3
"""
Extract LinkedIn profile URLs from saved LinkedIn company People page HTML files.

This script parses saved HTML files to extract name → profile URL mappings,
which can then be used to enrich staff data parsed from markdown files.

The HTML contains profile cards with structure like:
    <a href="https://www.linkedin.com/in/username?miniProfileUrn=...">
        <img alt="Person Name" ...>
    </a>
    <a aria-label="View Person Name's profile" href="...">Person Name</a>

Usage:
    python scripts/extract_linkedin_urls_from_html.py <html_file> [--output json_file]
        [--staff-json staff_file] [--enrich-output enriched_file]

Example:
    python scripts/extract_linkedin_urls_from_html.py \
        "data/custodian/person/manual_hc/Rijksmuseum_ People _ LinkedIn.html" \
        --output data/custodian/person/rijksmuseum_profile_urls.json
"""

import argparse
import json
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any
from html.parser import HTMLParser
from urllib.parse import urlparse, parse_qs, unquote


class LinkedInProfileExtractor(HTMLParser):
    """
    HTML parser to extract LinkedIn profile URLs and associated names.
    """

    def __init__(self):
        super().__init__()
        self.profiles: dict[str, dict] = {}  # url_slug -> {name, full_url, ...}
        self.current_href = None
        self.current_name = None
        self.in_link = False
        self.link_text = ""

        # Track all name-url associations
        self.name_to_urls: dict[str, list[str]] = defaultdict(list)
        self.url_to_names: dict[str, list[str]] = defaultdict(list)

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)

        if tag == 'a':
            href = attrs_dict.get('href', '')
            if href and 'linkedin.com/in/' in href:
                self.in_link = True
                self.current_href = href
                self.link_text = ""

                # Extract name from aria-label if available
                aria_label = attrs_dict.get('aria-label', '')
                if aria_label:
                    # "View Kelly Davis' profile" -> "Kelly Davis"
                    match = re.match(r"View (.+?)'s profile", aria_label)
                    if match:
                        self.current_name = match.group(1)
                        self._record_association(self.current_name, href)

        elif tag == 'img':
            # Images have alt text with names
            alt = attrs_dict.get('alt', '')
            if alt and self.current_href:
                # Don't use generic alt text
                if alt.lower() not in ('profile photo', 'photo', 'image', ''):
                    self._record_association(alt, self.current_href)

    def handle_data(self, data: str) -> None:
        if self.in_link:
            self.link_text += data.strip()

    def handle_endtag(self, tag: str) -> None:
        if tag == 'a' and self.in_link:
            # Record link text as name
            if self.link_text and self.current_href:
                # Clean up the name
                name = self.link_text.strip()
                if name and len(name) > 1 and not name.isdigit():
                    self._record_association(name, self.current_href)

            self.in_link = False
            self.current_href = None
            self.link_text = ""

    def _record_association(self, name: str, url: str) -> None:
        """Record a name-URL association."""
        if not name or not url:
            return

        # Extract the clean slug from URL
        slug = extract_slug_from_url(url)
        if not slug:
            return

        # Clean name
        name = name.strip()
        if not name or len(name) < 2:
            return

        # Record both directions
        self.name_to_urls[name].append(slug)
        self.url_to_names[slug].append(name)

        # Store in profiles dict (will be deduplicated later)
        if slug not in self.profiles:
            self.profiles[slug] = {
                'slug': slug,
                'full_url': f"https://www.linkedin.com/in/{slug}",
                'names': set(),
                'is_aco_id': slug.startswith('ACo'),
            }
        self.profiles[slug]['names'].add(name)


def extract_slug_from_url(url: str) -> str | None:
    """
    Extract the profile slug from a LinkedIn URL.

    Handles:
    - https://www.linkedin.com/in/username
    - https://www.linkedin.com/in/username?miniProfileUrn=...
    - /in/username (relative URL)
    """
    # Handle relative URLs
    if url.startswith('/in/'):
        url = f"https://www.linkedin.com{url}"

    try:
        parsed = urlparse(url)
        path = parsed.path

        # Extract from /in/username
        match = re.match(r'/in/([^/?]+)', path)
        if match:
            return match.group(1)
    except Exception:
        pass

    return None


def parse_html_file(filepath: Path) -> dict[str, Any]:
    """
    Parse an HTML file and extract profile URL mappings.

    The file is read as UTF-8 (undecodable bytes are replaced), run through
    LinkedInProfileExtractor, and then scanned with a regex so that profile
    slugs appearing outside of ``<a>`` tags are still listed (without names).

    Returns a dict with:
    - profiles: dict[slug] -> {slug, full_url, names, is_aco_id}
    - name_to_slug: dict[name] -> slug (best match, non-ACo slugs preferred)
    - slug_to_names: dict[slug] -> names seen for that slug, in first-seen order
    - stats: extraction statistics

    All lists are produced in a deterministic order so repeated runs on the
    same file give identical output.
    """
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        html_content = f.read()

    # Parse with our custom parser. Parsing is best-effort: whatever was
    # collected before an error is still used.
    parser = LinkedInProfileExtractor()
    try:
        parser.feed(html_content)
        parser.close()  # flush any input still buffered by HTMLParser
    except Exception as e:
        print(f"Warning: HTML parsing error: {e}", file=sys.stderr)

    # Also do regex extraction as backup.
    # '%' is part of the slug alphabet so percent-encoded slugs are captured
    # whole instead of being cut at the first escape.
    url_pattern = r'linkedin\.com/in/([a-zA-Z0-9_%-]+)'
    regex_slugs = set(re.findall(url_pattern, html_content))

    # Add any regex-found slugs not in parser results (sorted for stable output)
    for slug in sorted(regex_slugs):
        if slug not in parser.profiles:
            parser.profiles[slug] = {
                'slug': slug,
                'full_url': f"https://www.linkedin.com/in/{slug}",
                'names': set(),
                'is_aco_id': slug.startswith('ACo'),
            }

    # Build name -> slug mapping (prefer non-ACo slugs)
    name_to_slug: dict[str, str] = {}
    for name, slugs in parser.name_to_urls.items():
        # De-duplicate while keeping first-seen order, so the chosen slug does
        # not depend on set iteration order.
        unique_slugs = list(dict.fromkeys(slugs))
        if not unique_slugs:
            continue
        non_aco = [s for s in unique_slugs if not s.startswith('ACo')]
        name_to_slug[name] = non_aco[0] if non_aco else unique_slugs[0]

    # Convert sets to (sorted) lists for JSON serialization
    profiles_serializable = {
        slug: {**data, 'names': sorted(data['names'])}
        for slug, data in parser.profiles.items()
    }

    # Compute stats
    total_profiles = len(parser.profiles)
    aco_profiles = sum(1 for s in parser.profiles if s.startswith('ACo'))
    named_profiles = sum(1 for p in parser.profiles.values() if p['names'])

    return {
        'profiles': profiles_serializable,
        'name_to_slug': name_to_slug,
        # The same name is usually recorded several times per link
        # (aria-label, image alt, link text); report each once.
        'slug_to_names': {
            slug: list(dict.fromkeys(names))
            for slug, names in parser.url_to_names.items()
        },
        'stats': {
            'total_profiles': total_profiles,
            'clean_slugs': total_profiles - aco_profiles,
            'aco_ids': aco_profiles,
            'profiles_with_names': named_profiles,
            'unique_names_found': len(name_to_slug),
        }
    }


def normalize_name_for_matching(name: str) -> str:
    """
    Return a comparison key for *name*.

    The key is the lower-cased name with combining marks (diacritics) removed
    and every run of whitespace collapsed to a single space.
    """
    import unicodedata

    # Decompose accented characters into base character + combining mark(s),
    # then keep everything that is not a nonspacing mark.
    decomposed = unicodedata.normalize('NFD', name.lower())
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']

    # split()/join trims the ends and squeezes inner whitespace in one go.
    return ' '.join(''.join(base_chars).split())


def match_staff_to_urls(staff_json_path: Path, url_data: dict) -> dict[str, Any]:
    """
    Match existing staff entries to extracted URLs.

    Each staff entry is matched on its ``name``: first against the exact names
    in ``url_data['name_to_slug']``, then against their normalized forms (see
    normalize_name_for_matching). Entries without a usable name are reported
    as unmatched.

    Args:
        staff_json_path: JSON file holding a top-level object with a ``staff`` list.
        url_data: extraction result; the ``name_to_slug`` and ``profiles`` keys are read.

    Returns enrichment data with:
    - matched: staff entries with URL matches (copies with
      ``linkedin_profile_url`` and ``linkedin_slug`` added)
    - unmatched_staff: staff entries without URL matches
    - unmatched_urls: URLs without staff matches, sorted by slug
    - match_stats: counts and the match rate
    """
    with open(staff_json_path, 'r', encoding='utf-8') as f:
        staff_data = json.load(f)

    # `or` guards against explicit nulls in the input.
    staff_list = staff_data.get('staff') or []
    name_to_slug = url_data.get('name_to_slug') or {}
    profiles = url_data.get('profiles') or {}

    # Build normalized name lookup. When two names normalize to the same key,
    # an opaque ACo ID never replaces a readable slug already stored.
    normalized_lookup: dict[str, str] = {}
    for name, slug in name_to_slug.items():
        norm_name = normalize_name_for_matching(name)
        existing = normalized_lookup.get(norm_name)
        if (existing is not None and slug.startswith('ACo')
                and not existing.startswith('ACo')):
            continue
        normalized_lookup[norm_name] = slug

    matched = []
    unmatched_staff = []
    used_slugs = set()

    for staff in staff_list:
        name = staff.get('name')
        if not isinstance(name, str) or not name.strip():
            # Missing, null or blank name: nothing to match on.
            unmatched_staff.append(staff)
            continue

        # Try exact match first, then the normalized form.
        slug = name_to_slug.get(name)
        if not slug:
            slug = normalized_lookup.get(normalize_name_for_matching(name))

        if slug:
            matched.append({
                **staff,
                'linkedin_profile_url': f"https://www.linkedin.com/in/{slug}",
                'linkedin_slug': slug,
            })
            used_slugs.add(slug)
        else:
            unmatched_staff.append(staff)

    # Find URLs without matches (sorted so the output is stable between runs)
    unmatched_urls = []
    for slug in sorted(set(profiles) - used_slugs):
        profile = profiles.get(slug, {})
        unmatched_urls.append({
            'slug': slug,
            'names': profile.get('names', []),
            'is_aco_id': profile.get('is_aco_id', False),
        })

    return {
        'matched': matched,
        'unmatched_staff': unmatched_staff,
        'unmatched_urls': unmatched_urls,
        'match_stats': {
            'total_staff': len(staff_list),
            'matched_count': len(matched),
            'unmatched_staff_count': len(unmatched_staff),
            'unmatched_url_count': len(unmatched_urls),
            'match_rate': len(matched) / len(staff_list) if staff_list else 0,
        }
    }


def main():
    parser = argparse.ArgumentParser(
        description='Extract LinkedIn profile URLs from saved HTML files'
    )
    parser.add_argument('html_file', type=Path, help='Path to saved HTML file')
    parser.add_argument('--output', '-o', type=Path, help='Output JSON file path')
    parser.add_argument('--staff-json', type=Path,
                        help='Optional: Staff JSON file to enrich with URLs')
    parser.add_argument('--enrich-output', type=Path,
                        help='Output path for enriched staff JSON')

    args = parser.parse_args()

    if not args.html_file.exists():
        print(f"Error: HTML file not found: {args.html_file}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing HTML file: {args.html_file}")
    url_data = parse_html_file(args.html_file)

    # Print stats
    stats = url_data['stats']
    print(f"\nExtraction Results:")
    print(f"  Total profiles found: {stats['total_profiles']}")
    print(f"  Clean slugs: {stats['clean_slugs']}")
    print(f"  ACo IDs: {stats['aco_ids']}")
    print(f"  Profiles with names: {stats['profiles_with_names']}")
    print(f"  Unique names found: {stats['unique_names_found']}")

    # Save URL extraction results
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(url_data, f, indent=2, ensure_ascii=False)
        print(f"\nSaved URL data to: {args.output}")

    # Enrich staff data if provided
    if args.staff_json:
        if not args.staff_json.exists():
            print(f"Error: Staff JSON not found: {args.staff_json}", file=sys.stderr)
            sys.exit(1)

        print(f"\nMatching staff to URLs...")
        match_results = match_staff_to_urls(args.staff_json, url_data)

        match_stats = match_results['match_stats']
        print(f"\nMatching Results:")
        print(f"  Total staff: {match_stats['total_staff']}")
        print(f"  Matched: {match_stats['matched_count']} ({match_stats['match_rate']:.1%})")
        print(f"  Unmatched staff: {match_stats['unmatched_staff_count']}")
        print(f"  Unmatched URLs: {match_stats['unmatched_url_count']}")

        # Show some unmatched staff names
        if match_results['unmatched_staff'][:5]:
            print(f"\n  Sample unmatched staff:")
            for staff in match_results['unmatched_staff'][:5]:
                print(f"    - {staff.get('name')}")

        # Save enriched data
        if args.enrich_output:
            with open(args.staff_json, 'r', encoding='utf-8') as f:
                original_data = json.load(f)

            # Create enriched version
            enriched_data = {
                **original_data,
                'staff': match_results['matched'] + match_results['unmatched_staff'],
                'url_enrichment_stats': match_stats,
            }

            with open(args.enrich_output, 'w', encoding='utf-8') as f:
                json.dump(enriched_data, f, indent=2, ensure_ascii=False)
            print(f"\nSaved enriched staff data to: {args.enrich_output}")

    return 0


# Run the CLI only when executed as a script (not on import); the value
# returned by main() is passed to sys.exit() as the process exit status.
if __name__ == '__main__':
    sys.exit(main())