# glam/scripts/parse_linkedin_html.py

#!/usr/bin/env python3
"""
Extract complete LinkedIn staff data from saved company People page HTML files.

This script parses saved HTML files to extract complete staff profiles including:
- Name
- LinkedIn profile URL
- Headline/job title
- Connection degree
- Mutual connections

This replaces the need for MD file parsing - HTML contains ALL the data.

Usage:
    python scripts/parse_linkedin_html.py <html_file> \
        --custodian-name "Name" --custodian-slug "slug" \
        --output staff.json

Example:
    python scripts/parse_linkedin_html.py \
        "data/custodian/person/manual_hc/Rijksmuseum_ People _ LinkedIn.html" \
        --custodian-name "Rijksmuseum" \
        --custodian-slug "rijksmuseum" \
        --output data/custodian/person/rijksmuseum_staff.json
"""

import argparse
import json
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
from html.parser import HTMLParser
from urllib.parse import unquote


# Heritage type detection keywords for the GLAMORCUBESFIXPHDNT taxonomy.
#
# Maps a single-letter heritage type code to the terms that
# detect_heritage_type() looks for (case-insensitively) in a LinkedIn headline.
# Judging by the terms listed, the codes stand for: G = gallery, L = library,
# A = archive, M = museum, O = official/government body, R = research,
# C = corporate archive, E = education, S = society, D = digital.
# Entries that end in a space (e.g. 'KB ', 'IT ') are short acronyms; the space
# is presumably there so they do not match the beginning of a longer word.
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

# Role/sector terms that mark a headline as NOT heritage-relevant.
# detect_heritage_type() returns (False, None) as soon as one of these is found
# in the headline, before any heritage keyword is considered.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions.
# These should never be classified as heritage-relevant: this blocklist is the
# first check in detect_heritage_type(), and a hit returns (False, None).
# All entries are lowercase; entries ending in a space are short acronyms.
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - terms indicating that the headline mentions
# an organization that IS a heritage institution.
# Used by detect_heritage_type() to validate that 'D' (Digital) roles are
# actually at heritage orgs: the 'D' type is only returned when one of these
# terms is also present in the headline.
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]

# LinkedIn status phrases that pollute name fields (extracted from img alt text).
# clean_linkedin_status_from_name() removes these from names (matching is
# case-insensitive) and reports the status separately as metadata.
# Every phrase starts with a space, so it only matches after some preceding text.
LINKEDIN_STATUS_PHRASES = [
    ' is open to work',
    ' is hiring',
    ' is looking for new opportunities',
    ' is looking for opportunities',
    ' is actively looking',
    ' is available for work',
    ' open to work',
    ' - open to work',
    ' • Open to work',
    ' - Hiring',
    ' • Hiring',
]

# Known compound slugs that cannot be parsed by simple hyphen splitting.
# These are manually verified slug -> display-name mappings; slug_to_name()
# consults this table (exact, case-sensitive key lookup) before trying to split
# a slug on hyphens.
KNOWN_COMPOUND_SLUGS = {
    'jponjee': 'J. Ponjee',
    'sharellyemanuelson': 'Sharelly Emanuelson',
    'addieroelofsen': 'Addie Roelofsen',
    'adheliap': 'Adhelia P.',
    'anejanboomsma': 'Anejan Boomsma',
    'fredericlogghe': 'Frederic Logghe',
    'dirkjanheinen': 'Dirkjan Heinen',
}

# Dutch name particles that should remain lowercase when not at start of name.
# slug_to_name() compares each lowercased slug part against this set.
DUTCH_NAME_PARTICLES = {'van', 'de', 'den', 'der', 'het', 't', "'t"}


def clean_linkedin_status_from_name(name: str) -> tuple[str, str | None]:
    """
    Remove LinkedIn status phrases from name and return clean name + status.

    The earliest occurrence of any phrase in LINKEDIN_STATUS_PHRASES is used
    as the cut point (case-insensitive). When several phrases start at the
    same position the longest one wins, so that "John Doe - open to work" is
    cut before the " - " separator instead of leaving "John Doe -" behind.

    Args:
        name: Raw name possibly containing LinkedIn status

    Returns:
        Tuple of (clean_name, linkedin_status or None). The status is
        'hiring', 'open_to_work', or 'active' (fallback for phrases that fit
        neither category).

    Examples:
        "John Doe is open to work" -> ("John Doe", "open_to_work")
        "John Doe - open to work" -> ("John Doe", "open_to_work")
        "Jane Smith is hiring" -> ("Jane Smith", "hiring")
        "Bob Jones" -> ("Bob Jones", None)
    """
    if not name:
        return (name, None)

    # Longest phrases first: regex alternation returns the leftmost match and,
    # at that position, the first alternative that fits.
    alternatives = '|'.join(
        re.escape(phrase)
        for phrase in sorted(LINKEDIN_STATUS_PHRASES, key=len, reverse=True)
        if phrase
    )
    if not alternatives:
        return (name, None)

    # Search the original string (not a lowercased copy) so the match offset is
    # always valid for slicing `name`, even when lowercasing changes its length.
    match = re.search(alternatives, name, re.IGNORECASE)
    if match is None:
        return (name, None)

    # Drop the status and any separator that was left dangling before it.
    clean_name = name[:match.start()].strip().rstrip(' -–—•|·').rstrip()

    matched_phrase = match.group(0).lower()
    if 'hiring' in matched_phrase:
        status = 'hiring'
    elif ('open to work' in matched_phrase
          or 'looking' in matched_phrase
          or 'available' in matched_phrase):
        status = 'open_to_work'
    else:
        status = 'active'

    return (clean_name, status)


def slug_to_name(slug: str) -> tuple[str, bool]:
    """
    Derive a display name from a LinkedIn profile slug.

    Used as a fallback when the name scraped from the HTML does not belong to
    the profile (e.g. the logged-in user's name showing up on a
    privacy-restricted card).

    Args:
        slug: LinkedIn profile slug (e.g., 'jan-van-der-berg-abc123');
            percent-encoding is decoded first.

    Returns:
        Tuple of (derived_name, is_reliable)
        - is_reliable is True for known compound slugs and for slugs with
          hyphen-separated parts
        - is_reliable is False (with name "Unknown") when the slug has no
          hyphens or nothing is left after removing the trailing ID

    Examples:
        'jan-van-der-berg-abc123' -> ('Jan van der Berg', True)
        'charlotte-van-beek-55370314' -> ('Charlotte van Beek', True)
        'jponjee' -> ('J. Ponjee', True)  # Known compound slug
        'unknownslug' -> ('Unknown', False)  # Cannot parse
    """
    unparseable = ("Unknown", False)
    raw = unquote(slug)

    # Manually verified mappings take precedence over any parsing.
    verified = KNOWN_COMPOUND_SLUGS.get(raw)
    if verified is not None:
        return (verified, True)

    # Without hyphens there are no word boundaries to work with.
    if '-' not in raw:
        return unparseable

    # Drop the trailing profile ID: first a run of 6+ hex characters, then a
    # run of 5+ digits, each preceded by '-' or '_'.
    trimmed = raw
    for id_pattern in (r'[-_][\da-f]{6,}$', r'[-_]\d{5,}$'):
        trimmed = re.sub(id_pattern, '', trimmed)

    tokens = [token for token in trimmed.split('-') if token]
    if not tokens:
        return unparseable

    # Particles (van, de, ...) stay lowercase except in first position;
    # everything else is capitalized.
    words = [
        token.lower()
        if position > 0 and token.lower() in DUTCH_NAME_PARTICLES
        else token.capitalize()
        for position, token in enumerate(tokens)
    ]
    return (' '.join(words), True)


def name_matches_slug(name: str, slug: str) -> bool:
    """
    Check if an extracted name plausibly matches a LinkedIn slug.

    This is used to detect "name contamination" - when the logged-in user's
    name is extracted from the HTML instead of the actual profile owner's name.

    The comparison is case-insensitive and ignores diacritics, so a name such
    as 'José Pérez' matches the ASCII slug 'jose-perez-1a2b3c4d' instead of
    being reported as contaminated.

    Args:
        name: Extracted name from HTML (e.g., 'Simon Kemper')
        slug: LinkedIn profile slug (e.g., 'jan-van-der-berg-abc123');
            percent-encoding is decoded first.

    Returns:
        True if the name appears to match the slug
        False if the name does NOT match (possible contamination), or if
        either argument is empty

    Examples:
        name_matches_slug('Jan van der Berg', 'jan-van-der-berg-abc123') -> True
        name_matches_slug('José Pérez', 'jose-perez-1a2b3c4d') -> True
        name_matches_slug('Simon Kemper', 'jan-van-der-berg-abc123') -> False
    """
    if not name or not slug:
        return False

    # Special case: "LinkedIn Member" is valid for anonymous profiles
    if name == 'LinkedIn Member':
        return True

    def fold(text: str) -> str:
        # Lowercase and strip combining marks (é -> e) so accented names can
        # be compared with the ASCII form LinkedIn uses in slugs.
        decomposed = unicodedata.normalize('NFD', text.lower())
        return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # URL decode, fold, and remove the trailing ID from the slug
    clean_slug = fold(unquote(slug))
    clean_slug = re.sub(r'[-_][\da-f]{6,}$', '', clean_slug)
    clean_slug = re.sub(r'[-_]\d{5,}$', '', clean_slug)

    # Normalize the name: fold, remove periods/apostrophes, spaces to hyphens
    name_normalized = re.sub(r'[.\'`\u2019]', '', fold(name))
    name_normalized = re.sub(r'\s+', '-', name_normalized)

    # Keep name parts of at least 2 chars (drops bare initials such as "J")
    name_parts = [part for part in name_normalized.split('-') if len(part) >= 2]
    if not name_parts:
        return False

    # Primary contamination check: the first usable name part (normally the
    # first name) must occur somewhere in the slug.
    return name_parts[0] in clean_slug


class LinkedInProfileCardParser(HTMLParser):
    """
    Parse LinkedIn profile cards from saved HTML.

    Each profile card has structure:
    - org-people-profile-card__profile-image-N (contains img with alt=name, href=profile_url)
    - artdeco-entity-lockup__title (contains name text and profile link)
    - artdeco-entity-lockup__badge (contains connection degree)
    - artdeco-entity-lockup__subtitle (contains headline)
    - Mutual connections text

    Anonymous "LinkedIn Member" profiles have a different structure:
    - org-people-profile-card__profile-image-N is on an <img> tag (NOT an <a> tag)
    - No href link (privacy-protected)
    - Name appears as "LinkedIn Member" in the title
    - Still have subtitle (headline) content

    NOTE: The "People you may know" h2 header in LinkedIn company pages is actually
    the section title for the associated members list, NOT a separate recommendations
    section. All profile cards under this header are real associated members.
    """

    def __init__(self):
        super().__init__()
        self.profiles: list[dict] = []
        self.current_profile: dict = {}

        # State tracking
        self.in_profile_card = False
        self.in_title = False
        self.in_subtitle = False
        self.in_badge = False
        self.in_caption = False
        self.in_mutual = False

        self.current_text = ""
        self.card_index = -1

        # For custodian metadata extraction
        self.custodian_metadata: dict = {}
        self.in_header = True
        self.header_texts: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)
        attr_id = attrs_dict.get('id') or ''
        attr_class = attrs_dict.get('class') or ''

        # Detect profile card start - can be on <a> tag (regular) OR <img> tag (anonymous)
        if 'org-people-profile-card__profile-image' in attr_id:
            self.in_profile_card = True
            self.in_header = False
            match = re.search(r'profile-image-(\d+)', attr_id)
            if match:
                new_index = int(match.group(1))
                if new_index != self.card_index:
                    # Save previous profile if exists
                    if self.current_profile.get('name'):
                        self.profiles.append(self.current_profile)
                    self.current_profile = {}
                    self.card_index = new_index

            # Extract URL from href (only on <a> tags - regular profiles)
            href = attrs_dict.get('href', '')
            if href and 'linkedin.com/in/' in href:
                slug = self._extract_slug(href)
                if slug:
                    self.current_profile['linkedin_slug'] = slug
                    self.current_profile['linkedin_profile_url'] = f"https://www.linkedin.com/in/{slug}"

            # If this is an <img> tag with the profile-image ID, it's likely an anonymous member
            # We'll capture this and the name will come from the title section as "LinkedIn Member"
            if tag == 'img':
                # Mark as potential anonymous (will be confirmed when we see "LinkedIn Member" in title)
                self.current_profile['_may_be_anonymous'] = True

        # Extract name from img alt (for regular profiles with named photos)
        if tag == 'img' and self.in_profile_card:
            alt = attrs_dict.get('alt', '')
            if alt and alt not in ('', 'photo', 'Profile photo'):
                # Clean LinkedIn status phrases from name
                clean_name, linkedin_status = clean_linkedin_status_from_name(alt)
                self.current_profile['name'] = clean_name
                if linkedin_status:
                    self.current_profile['linkedin_status'] = linkedin_status

        # Title section (contains name link or "LinkedIn Member" text)
        if 'artdeco-entity-lockup__title' in attr_class:
            self.in_title = True
            self.current_text = ""

        # Badge section (contains degree)
        if 'artdeco-entity-lockup__badge' in attr_class:
            self.in_badge = True
            self.current_text = ""

        # Subtitle section (contains headline)
        if 'artdeco-entity-lockup__subtitle' in attr_class:
            self.in_subtitle = True
            self.current_text = ""

        # Caption/mutual connections
        if 'artdeco-entity-lockup__caption' in attr_class or 'mutual' in attr_class.lower():
            self.in_mutual = True
            self.current_text = ""

        # Check for mutual connections in span
        if tag == 'span' and 'mutual' in attr_class.lower():
            self.in_mutual = True
            self.current_text = ""

    def handle_data(self, data: str) -> None:
        text = data.strip()
        if not text:
            return

        # Collect header texts for metadata
        if self.in_header:
            self.header_texts.append(text)

        if self.in_title:
            self.current_text += " " + text
        elif self.in_badge:
            self.current_text += " " + text
        elif self.in_subtitle:
            self.current_text += " " + text
        elif self.in_mutual:
            self.current_text += " " + text

    def handle_endtag(self, tag: str) -> None:
        if tag == 'div':
            if self.in_title:
                text = self.current_text.strip()
                if text and 'name' not in self.current_profile:
                    # Clean up name
                    text = re.sub(r'\s+', ' ', text)
                    if len(text) > 1 and not text.startswith('View '):
                        # Clean LinkedIn status phrases from name
                        clean_name, linkedin_status = clean_linkedin_status_from_name(text)
                        self.current_profile['name'] = clean_name
                        if linkedin_status and 'linkedin_status' not in self.current_profile:
                            self.current_profile['linkedin_status'] = linkedin_status
                        # Check if this is "LinkedIn Member" (anonymous profile)
                        if clean_name == 'LinkedIn Member':
                            self.current_profile['is_anonymous'] = True
                self.in_title = False
                self.current_text = ""

            if self.in_badge:
                text = self.current_text.strip()
                degree = self._parse_degree(text)
                if degree:
                    self.current_profile['degree'] = degree
                self.in_badge = False
                self.current_text = ""

            if self.in_subtitle:
                text = self.current_text.strip()
                if text and len(text) > 2:
                    # Clean up headline
                    text = re.sub(r'\s+', ' ', text)
                    self.current_profile['headline'] = text
                self.in_subtitle = False
                self.current_text = ""

        if tag == 'span' and self.in_mutual:
            text = self.current_text.strip()
            if text and 'mutual' in text.lower():
                self.current_profile['mutual_connections'] = text
            self.in_mutual = False
            self.current_text = ""

    def _extract_slug(self, url: str) -> Optional[str]:
        """Extract profile slug from URL."""
        match = re.search(r'linkedin\.com/in/([^?/]+)', url)
        if match:
            return match.group(1)
        return None

    def _parse_degree(self, text: str) -> Optional[str]:
        """Parse connection degree from text."""
        if '1st' in text:
            return '1st'
        if '2nd' in text:
            return '2nd'
        if '3rd' in text:
            return '3rd+'
        return None

    def finalize(self) -> list[dict]:
        """Finalize parsing and return all profiles."""
        # Save last profile
        if self.current_profile.get('name'):
            self.profiles.append(self.current_profile)

        # Parse custodian metadata from header
        self._parse_header_metadata()

        # Validate and fix names that may be contaminated
        self._validate_and_fix_names()

        return self.profiles

    def _validate_and_fix_names(self) -> None:
        """
        Validate extracted names against LinkedIn slugs and fix contamination.

        Name contamination occurs when saving LinkedIn HTML while logged in:
        privacy-restricted profiles may show the logged-in user's name/photo
        instead of the actual profile owner's info.

        Detection: If extracted name doesn't match the slug, it's likely contaminated.
        Fix: Derive the correct name from the slug.
        """
        for profile in self.profiles:
            name = profile.get('name', '')
            slug = profile.get('linkedin_slug', '')

            # Skip anonymous profiles (no slug)
            if not slug:
                continue

            # Skip "LinkedIn Member" - valid anonymous name
            if name == 'LinkedIn Member':
                continue

            # Check if the extracted name matches the slug
            if not name_matches_slug(name, slug):
                # Name contamination detected - derive correct name from slug
                derived_name, is_reliable = slug_to_name(slug)

                if is_reliable and derived_name != "Unknown":
                    # Record the correction
                    profile['_original_contaminated_name'] = name
                    profile['name'] = derived_name
                    profile['_name_derived_from_slug'] = True
                else:
                    # Could not reliably derive name - mark for review
                    profile['_name_may_be_contaminated'] = True
                    profile['_original_name'] = name

    def _parse_header_metadata(self) -> None:
        """Extract custodian metadata from header texts."""
        for text in self.header_texts:
            # Skip JSON blobs and very long texts (data artifacts)
            if text.startswith('{') or len(text) > 200:
                continue

            # Follower count
            match = re.match(r'^([\d,\.]+K?)\s*followers?$', text, re.IGNORECASE)
            if match:
                self.custodian_metadata['follower_count'] = match.group(1)
                continue

            # Employee count
            match = re.match(r'^([\d,\-]+)\s*employees?$', text, re.IGNORECASE)
            if match:
                self.custodian_metadata['employee_count'] = match.group(1)
                continue

            # Associated members
            match = re.match(r'^(\d+)\s*associated\s+members?$', text, re.IGNORECASE)
            if match:
                self.custodian_metadata['associated_members'] = int(match.group(1))
                continue

            # Industry - must be a clean standalone text, not embedded in JSON
            industry_keywords = ['Museums', 'Archives', 'Libraries', 'Historical Sites', 'Heritage', 'Zoos']
            if any(kw.lower() in text.lower() for kw in industry_keywords):
                # Ensure it's a clean industry text (not JSON or HTML)
                if not text.startswith('{') and not '<' in text and len(text) < 100:
                    if 'industry' not in self.custodian_metadata:
                        self.custodian_metadata['industry'] = text.strip()
                continue

            # Location (City, Region)
            match = re.match(r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', text)
            if match and 'location' not in self.custodian_metadata:
                self.custodian_metadata['location'] = {
                    'city': match.group(1),
                    'region': match.group(2)
                }


def _headline_has_term(headline_lower: str, term: str) -> bool:
    """
    Return True if *term* occurs in the already-lowercased headline.

    Terms written with surrounding whitespace (e.g. 'ing ', 'NS ', 'IT ') are
    acronyms and are matched as whole words, so they match at the end of the
    headline but never inside another word ('working', 'collections').
    All other terms are matched as plain substrings.
    """
    needle = term.lower()
    core = needle.strip()
    if not core:
        return False
    if core != needle:
        pattern = r'(?<!\w)' + re.escape(core) + r'(?!\w)'
        return re.search(pattern, headline_lower) is not None
    return needle in headline_lower


def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """
    Detect if a headline is heritage-relevant and what type.

    Classification steps, in order:
    1. Organization blocklist (NON_HERITAGE_ORGANIZATIONS) -> (False, None)
    2. Non-heritage role indicators (NON_HERITAGE_KEYWORDS) -> (False, None)
    3. Typed heritage keywords (HERITAGE_KEYWORDS, all types except 'D')
       -> (True, type)
    4. 'D' (Digital) keywords, only if the headline also mentions a heritage
       organization (HERITAGE_ORGANIZATION_KEYWORDS) -> (True, 'D')
    5. Generic heritage terms -> (True, None)

    For 'D' (Digital) type, require BOTH a tech role AND a heritage organization.
    This prevents generic IT workers at banks/police from being classified as heritage.

    Matching is case-insensitive. Space-padded acronyms are matched as whole
    words; every other term is a substring match.

    Args:
        headline: LinkedIn headline text; empty/None yields (False, None).

    Returns:
        Tuple of (is_heritage_relevant, heritage_type_code or None).
    """
    if not headline:
        return (False, None)

    headline_lower = headline.lower()

    # NOTE(review): unpadded short blocklist entries are still substring
    # matches, so e.g. 'meta' also hits "metadata" and 'ups' hits "groups".

    # Step 1: Check for non-heritage organizations (blocklist)
    if any(_headline_has_term(headline_lower, org) for org in NON_HERITAGE_ORGANIZATIONS):
        return (False, None)

    # Step 2: Check for non-heritage role indicators
    if any(_headline_has_term(headline_lower, keyword) for keyword in NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Step 3: Check heritage keywords by type (order matters - more specific first).
    # 'D' (Digital) is not in this list; it needs heritage org validation below.
    type_order = ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E')
    for heritage_type in type_order:
        for keyword in HERITAGE_KEYWORDS.get(heritage_type, []):
            if _headline_has_term(headline_lower, keyword):
                return (True, heritage_type)

    # Step 4: 'D' (Digital) - ONLY if at a heritage organization
    is_heritage_org = any(
        _headline_has_term(headline_lower, org_keyword)
        for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS
    )
    if is_heritage_org:
        for keyword in HERITAGE_KEYWORDS.get('D', []):
            if _headline_has_term(headline_lower, keyword):
                return (True, 'D')

    # Step 5: Generic heritage terms (without specific type)
    generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
               'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection']
    if any(keyword in headline_lower for keyword in generic):
        return (True, None)

    return (False, None)


def is_abbreviated_name(name: str) -> bool:
    """
    Check if name contains abbreviations.

    A whitespace-separated part counts as an abbreviation when it is:
    - a single letter, with or without trailing periods ("J", "P."),
    - at most two characters ending in a period, or
    - a run of dotted initials ("J.P.", "A.B.C.", "J.P").

    Args:
        name: Display name; empty/None yields False.

    Returns:
        True if any part of the name looks abbreviated.
    """
    if not name:
        return False

    for part in name.split():
        bare = part.rstrip('.')
        if len(bare) <= 1 and bare.isalpha():
            return True
        if part.endswith('.') and len(part) <= 2:
            return True
        # Dotted initials: single letters each followed by a period, optionally
        # ending in one more letter. Titles such as "Dr." do not match.
        if re.fullmatch(r'(?:[^\W\d_]\.)+[^\W\d_]?', part):
            return True
    return False


def generate_staff_id(name: str, index: int, custodian_slug: str) -> str:
    """
    Generate unique staff ID.

    The ID has the form "<custodian_slug>_staff_<index, 4 digits>_<name slug>",
    where the name slug is the lowercased name with diacritics removed, every
    run of other characters replaced by one underscore, and a maximum length
    of 30 characters.
    """
    max_slug_length = 30

    # Decompose accented characters and drop the combining marks (é -> e).
    decomposed = unicodedata.normalize('NFD', name.lower())
    ascii_name = ''.join(
        char for char in decomposed if unicodedata.category(char) != 'Mn'
    )

    name_slug = re.sub(r'_+', '_', re.sub(r'[^a-z0-9]+', '_', ascii_name))
    name_slug = name_slug.strip('_')

    # Truncate long names without leaving a dangling underscore at the cut.
    if len(name_slug) > max_slug_length:
        name_slug = name_slug[:max_slug_length].rstrip('_')

    return f"{custodian_slug}_staff_{index:04d}_{name_slug}"


def _merge_duplicate_profiles(profiles: list[dict]) -> dict:
    """Return a copy of the first occurrence, with empty fields filled from later duplicates."""
    merged = dict(profiles[0])
    for duplicate in profiles[1:]:
        for field in ('headline', 'degree', 'mutual_connections'):
            if not merged.get(field) and duplicate.get(field):
                merged[field] = duplicate[field]
    return merged


def _build_staff_entry(profile: dict, name: str, name_type: str, index: int,
                       custodian_name: str, custodian_slug: str) -> dict[str, Any]:
    """Build the common staff record (ID, name, headline, heritage classification) for one profile."""
    headline = profile.get('headline', '')
    is_heritage, heritage_type = detect_heritage_type(headline)
    # Without a headline there is nothing to classify: a member listed for a
    # named custodian is assumed heritage-relevant, with the default type 'M'.
    if not headline and custodian_name:
        is_heritage = True
        heritage_type = 'M'

    return {
        'staff_id': generate_staff_id(name, index, custodian_slug),
        'name': name,
        'name_type': name_type,
        'degree': profile.get('degree', 'unknown'),
        'headline': headline,
        'mutual_connections': profile.get('mutual_connections', ''),
        'heritage_relevant': is_heritage,
        'heritage_type': heritage_type,
    }


def parse_html_file(filepath: Path, custodian_name: str, custodian_slug: str) -> dict[str, Any]:
    """
    Parse LinkedIn company People page HTML and extract all staff data.

    Handles:
    - Duplicate profile merging (the same slug appearing more than once on the
      page becomes one entry; headline, degree and mutual connections missing
      from the first occurrence are taken from later ones)
    - Anonymous "LinkedIn Member" entries (each counted separately and named
      "LinkedIn Member #N")

    Args:
        filepath: Path to the saved HTML file (read as UTF-8, undecodable
            bytes replaced).
        custodian_name: Display name of the organization.
        custodian_slug: Slug used as prefix for generated staff IDs.

    Returns:
        Complete staff JSON structure with the keys 'custodian_metadata',
        'source_metadata', 'staff' and 'staff_analysis'.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        html_content = f.read()

    # Parse HTML. Parsing is best-effort: on an error, whatever was extracted
    # up to that point is still used.
    parser = LinkedInProfileCardParser()
    try:
        parser.feed(html_content)
        # Flush any input still buffered by the parser.
        parser.close()
    except Exception as e:
        print(f"Warning: HTML parsing error: {e}", file=sys.stderr)

    raw_profiles = parser.finalize()
    custodian_metadata = parser.custodian_metadata

    # First pass: Group profiles by LinkedIn SLUG to detect duplicates
    # The same profile may appear multiple times on a page (LinkedIn UI quirk)
    # We merge by slug, NOT by name, because different people can have the same name
    # BUT: Do NOT merge "LinkedIn Member" (anonymous) - each is unique
    slug_to_profiles: dict[str, list[dict]] = {}

    for profile in raw_profiles:
        name = profile.get('name', '').strip()
        if not name:
            continue

        slug = profile.get('linkedin_slug', '')
        is_anonymous = profile.get('is_anonymous', False) or name == 'LinkedIn Member'

        if is_anonymous:
            # Each anonymous profile gets a unique key (cannot deduplicate without slug)
            slug_to_profiles[f"_anonymous_{len(slug_to_profiles)}"] = [profile]
        elif slug:
            # Deduplicate by slug - same slug = same person appearing multiple times
            slug_to_profiles.setdefault(slug, []).append(profile)
        else:
            # No slug (shouldn't happen for non-anonymous) - use unique key
            slug_to_profiles[f"_no_slug_{len(slug_to_profiles)}"] = [profile]

    # Second pass: Build staff list with merged duplicates
    staff: list[dict] = []
    anonymous_count = 0
    duplicate_profiles_count = 0

    for slug_key, profiles in slug_to_profiles.items():
        if slug_key.startswith('_anonymous_'):
            # Anonymous profile: numbered in order of appearance
            anonymous_count += 1
            staff.append(_build_staff_entry(
                profiles[0], f"LinkedIn Member #{anonymous_count}", 'anonymous',
                len(staff), custodian_name, custodian_slug,
            ))
            continue

        if slug_key.startswith('_no_slug_'):
            # Profile without slug (rare edge case)
            profile = profiles[0]
            name = profile.get('name', 'Unknown')
            name_type = 'abbreviated' if is_abbreviated_name(name) else 'full'
            staff.append(_build_staff_entry(
                profile, name, name_type, len(staff), custodian_name, custodian_slug,
            ))
            continue

        # Regular profile with slug - may have duplicates to merge
        # (same profile appearing multiple times on page)
        primary = _merge_duplicate_profiles(profiles)
        name = primary.get('name', slug_key)

        # Determine name type
        if primary.get('_name_derived_from_slug'):
            name_type = 'derived_from_slug'
        elif is_abbreviated_name(name):
            name_type = 'abbreviated'
        else:
            name_type = 'full'

        staff_entry = _build_staff_entry(
            primary, name, name_type, len(staff), custodian_name, custodian_slug,
        )

        # Add primary LinkedIn URL
        if primary.get('linkedin_profile_url'):
            staff_entry['linkedin_profile_url'] = primary['linkedin_profile_url']
            staff_entry['linkedin_slug'] = primary['linkedin_slug']

        # Add name correction metadata if name was derived from slug
        if primary.get('_name_derived_from_slug'):
            staff_entry['name_correction'] = {
                'original_contaminated_name': primary.get('_original_contaminated_name', ''),
                'derived_from_slug': True,
                'correction_method': 'slug_to_name',
            }
        elif primary.get('_name_may_be_contaminated'):
            staff_entry['name_correction'] = {
                'may_be_contaminated': True,
                'original_name': primary.get('_original_name', name),
                'note': 'Name could not be reliably derived from slug - manual review needed',
            }

        # If same profile appeared multiple times, count as duplicates merged
        duplicate_profiles_count += len(profiles) - 1

        staff.append(staff_entry)

    # Build final output structure
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # PYMK filtered count: read from parser metadata if present, else 0
    pymk_filtered = custodian_metadata.get('_pymk_cards_filtered', 0)

    result = {
        'custodian_metadata': {
            'custodian_name': custodian_name,
            'custodian_slug': custodian_slug,
            'name': custodian_metadata.get('name', custodian_name),
            'industry': custodian_metadata.get('industry', ''),
            'location': custodian_metadata.get('location', {}),
            'follower_count': custodian_metadata.get('follower_count', ''),
            'associated_members': custodian_metadata.get('associated_members', 0),
        },
        'source_metadata': {
            'source_type': 'linkedin_company_people_page_html',
            'source_file': str(filepath.name),
            'registered_timestamp': timestamp,
            'registration_method': 'html_parsing',
            'staff_extracted': len(staff),
            'pymk_cards_filtered': pymk_filtered,
            'duplicate_profiles_merged': duplicate_profiles_count,
        },
        'staff': staff,
        'staff_analysis': {
            'total_staff_extracted': len(staff),
            'with_linkedin_url': sum(1 for s in staff if 'linkedin_profile_url' in s),
            'with_alternate_profiles': sum(1 for s in staff if 'alternate_profiles' in s),
            'anonymous_members': anonymous_count,
            'heritage_relevant_count': sum(1 for s in staff if s.get('heritage_relevant')),
            'staff_by_heritage_type': dict(Counter(
                s.get('heritage_type') for s in staff if s.get('heritage_type')
            )),
            'names_derived_from_slug': sum(
                1 for s in staff
                if s.get('name_correction', {}).get('derived_from_slug')
            ),
            'names_possibly_contaminated': sum(
                1 for s in staff
                if s.get('name_correction', {}).get('may_be_contaminated')
            ),
        }
    }

    return result


def main():
    """Command-line entry point: parse a saved People page and report results.

    Parses the command line, runs ``parse_html_file`` on the given HTML file,
    prints a human-readable summary, and emits the full result as JSON either
    to the ``--output`` file or to stdout.

    When no ``--output`` is given the JSON is written to stdout and the
    summary is written to stderr, so that stdout stays valid JSON and can be
    piped into other tools.

    Returns:
        0 on success.

    Raises:
        SystemExit: with status 1 if ``html_file`` is missing or is not a
            regular file (argparse itself exits with status 2 on bad usage).
    """
    parser = argparse.ArgumentParser(
        description='Parse LinkedIn company People page HTML to extract staff data'
    )
    parser.add_argument('html_file', type=Path, help='Path to saved HTML file')
    parser.add_argument('--custodian-name', required=True, help='Name of the custodian organization')
    parser.add_argument('--custodian-slug', required=True, help='Slug for staff ID generation')
    parser.add_argument('--output', '-o', type=Path, help='Output JSON file path')

    args = parser.parse_args()

    if not args.html_file.exists():
        print(f"Error: HTML file not found: {args.html_file}", file=sys.stderr)
        sys.exit(1)
    # A directory (or other non-regular path) passes exists() but cannot be
    # parsed; reject it here with a clear message instead of failing later.
    if not args.html_file.is_file():
        print(f"Error: Not a file: {args.html_file}", file=sys.stderr)
        sys.exit(1)

    # Keep stdout clean for the JSON dump when no output file was requested.
    report = sys.stdout if args.output else sys.stderr

    def say(message=''):
        print(message, file=report)

    say(f"Parsing: {args.html_file}")
    result = parse_html_file(args.html_file, args.custodian_name, args.custodian_slug)

    analysis = result['staff_analysis']
    source_meta = result['source_metadata']

    # Print summary
    say("\nExtraction Results:")
    say(f"  Total staff: {analysis['total_staff_extracted']}")
    say(f"  With LinkedIn URL: {analysis['with_linkedin_url']}")
    say(f"  With alternate profiles: {analysis['with_alternate_profiles']}")
    say(f"  Anonymous members: {analysis['anonymous_members']}")
    say(f"  Heritage-relevant: {analysis['heritage_relevant_count']}")

    # Show filtering/merging stats
    pymk_filtered = source_meta.get('pymk_cards_filtered', 0)
    duplicates_merged = source_meta.get('duplicate_profiles_merged', 0)
    if pymk_filtered > 0:
        say(f"\n  'People you may know' cards filtered: {pymk_filtered}")
    if duplicates_merged > 0:
        say(f"  Duplicate profiles merged: {duplicates_merged}")

    # Show name correction stats
    names_derived = analysis.get('names_derived_from_slug', 0)
    names_contaminated = analysis.get('names_possibly_contaminated', 0)
    if names_derived > 0 or names_contaminated > 0:
        say("\n  Name Corrections:")
        if names_derived > 0:
            say(f"    Names derived from slug (contamination fixed): {names_derived}")
        if names_contaminated > 0:
            say(f"    Names possibly contaminated (manual review needed): {names_contaminated}")

    # Compare against the member count reported in the custodian metadata,
    # when one is present (a falsy value skips the comparison).
    expected = result['custodian_metadata'].get('associated_members', 0)
    if expected:
        extracted = analysis['total_staff_extracted']
        say(f"\n  Expected (associated members): {expected}")
        say(f"  Extracted: {extracted}")
        diff = extracted - expected
        if diff == 0:
            say("  Match: EXACT")
        elif diff > 0:
            say(f"  Difference: +{diff} (more than expected)")
        else:
            say(f"  Difference: {diff} (fewer than expected)")

    say(f"\n  Heritage types: {analysis['staff_by_heritage_type']}")

    # Save output
    if args.output:
        # Create missing parent directories so a fresh output path works.
        args.output.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        say(f"\nSaved to: {args.output}")
    else:
        # Print to stdout
        print(json.dumps(result, indent=2, ensure_ascii=False))

    return 0


# Run the CLI only when this file is executed as a script (not on import);
# the value returned by main() is passed to sys.exit() as the exit status.
if __name__ == '__main__':
    sys.exit(main())