#!/usr/bin/env python3
"""
Parse LinkedIn company staff pages from raw manual register files.

This script processes raw text exports from LinkedIn company "People" pages
and extracts structured staff data for heritage custodian institutions.

The output follows Rule 15 (Connection Data Registration) patterns but adapted
for custodian staff rather than individual connections.

Usage:
    python scripts/parse_custodian_staff.py <input_file> <output_file> \
        --custodian-name "Name" --custodian-slug "slug"

Example:
    python scripts/parse_custodian_staff.py \
        data/custodian/person/manual_hc/collectie_overijssel-20251210T0055.md \
        data/custodian/person/collectie_overijssel_staff_20251210T0055.json \
        --custodian-name "Collectie Overijssel" \
        --custodian-slug "collectie-overijssel"
"""
import argparse
import json
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy.
# Maps a one-letter heritage-type code to substrings that detect_heritage_type()
# searches for (case-insensitively) inside a LinkedIn headline. Trailing spaces
# on short tokens ('KB ', 'UB ') presumably prevent mid-word matches — TODO confirm.
HERITAGE_KEYWORDS = {
    # G - Gallery
    'G': [
        'gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery',
        'exhibition space', 'tentoonstellingsruimte'
    ],
    # L - Library
    'L': [
        'library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris',
        'KB ', 'national library', 'universiteitsbiblio', 'UB '
    ],
    # A - Archive
    'A': [
        'archive', 'archief', 'archivist', 'archivaris', 'archival',
        'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum',
        'eye film', 'EYE ', 'audiovisual', 'audiovisueel',
        'sound and vision', 'nationaal archief', 'stadsarchief',
        'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG',
        'archiefspecialist', 'archiefmedewerker', 'archiefinspecteur'
    ],
    # M - Museum
    'M': [
        'museum', 'musea', 'curator', 'conservator', 'collection manager',
        'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
        'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
        'collectiespecialist', 'collectie'
    ],
    # O - Official Institution
    'O': [
        'ministry', 'ministerie', 'government', 'overheid', 'gemeente',
        'province', 'provincie', 'OCW', 'ministerie van'
    ],
    # R - Research Center
    'R': [
        'research', 'onderzoek', 'researcher', 'onderzoeker',
        'KNAW', 'humanities cluster', 'NWO', 'think tank',
        'documentatie', 'documentation', 'kenniscentrum', 'historicus'
    ],
    # C - Corporation (Corporate heritage)
    'C': [
        'corporate archive', 'bedrijfsarchief', 'company history',
        'shell', 'philips', 'heineken'
    ],
    # E - Education Provider
    'E': [
        'university', 'universiteit', 'professor', 'lecturer', 'docent',
        'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate',
        'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ',
        'leiden university', 'utrecht university', 'UU ', 'TU ',
        'reinwardt', 'film academy', 'filmacademie', 'graduate',
        'assistant professor', 'associate professor', 'hoogleraar',
        'educatie', 'educator'
    ],
    # S - Collecting Society
    'S': [
        'society', 'vereniging', 'genootschap', 'historical society',
        'historische vereniging', 'heemkunde'
    ],
    # D - Digital Platform
    'D': [
        'digital', 'digitaal', 'platform', 'software', 'IT ', 'tech',
        'developer', 'engineer', 'data ', 'AI ', 'machine learning',
        'digitalisering', 'datamanagement', 'data analist'
    ],
}
# Non-heritage keywords (to mark as heritage_relevant=False).
# Checked BEFORE the heritage keywords in detect_heritage_type(), so any hit
# here short-circuits classification. The trailing space in 'HR ' presumably
# avoids matching inside longer words — TODO confirm against real headlines.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter',
    'finance', 'accounting', 'legal', 'lawyer', 'advocaat',
    'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate',
    'insurance', 'banking', 'investment', 'e-commerce',
    'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]
# Lines that indicate LinkedIn UI noise (to skip entirely).
# A set for O(1) exact-match tests in is_noise_line(). Duplicate items from
# the original literal ('Home', 'About', 'Jobs', 'People' were each listed
# twice) have been removed; set semantics are unchanged.
NOISE_EXACT = {
    # Global navigation bar
    '0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
    'Notifications', 'Me', 'For Business', 'Learning', 'People',
    # Search filter widgets
    '1st', '2nd', '3rd+', 'Locations', 'Current companies', 'All filters',
    'Reset', 'Connect', 'Message', 'Follow', 'Previous', 'Next',
    # Footer / misc chrome
    'About', 'Accessibility', 'Help Center', 'Privacy & Terms',
    'Ad Choices', 'Advertising', 'Business Services', 'Get the LinkedIn app',
    'More', 'Compose message', 'Actively hiring', 'Posts',
    'Insights', 'Where they live', 'Where they studied',
    'What they do', 'People you may know',
}
# Regex patterns for LinkedIn UI noise lines; applied by is_noise_line()
# via re.match with re.IGNORECASE (i.e. anchored at the START of the line).
# NOTE(review): patterns carrying only a '$' anchor ('Status is online$',
# 'logo$') therefore only match when the WHOLE line equals/starts with that
# text — they will not catch e.g. "Acme logo" mid-line. Looks intentional
# enough that it is preserved; confirm against real register files.
NOISE_PATTERNS = [
    r'^\d+$',  # Just a number (result counters, pagination)
    r'^\d+ notifications?$',
    r'^LinkedIn Corporation',
    r'^You are on the messaging overlay',
    r'Status is online$',
    r'^MessagingYou are on the messaging',
    r'^Are these results helpful',
    r'^Your feedback helps',
    r'^\d+K? followers?$',
    r'^Page \d+ of \d+$',
    r'^Search employees by',
    r'^\d+ associated members$',
    r'logo$',
]
def is_noise_line(line: str) -> bool:
    """Return True when *line* is LinkedIn UI chrome that should be skipped.

    Blank lines, exact matches against NOISE_EXACT, and case-insensitive
    start-anchored matches against NOISE_PATTERNS all count as noise.
    """
    stripped = line.strip()
    if not stripped:
        return True
    if stripped in NOISE_EXACT:
        return True
    return any(
        re.match(pattern, stripped, re.IGNORECASE)
        for pattern in NOISE_PATTERNS
    )
def is_action_button(line: str) -> bool:
    """Return True for the per-person action buttons LinkedIn renders."""
    candidate = line.strip()
    return candidate == 'Connect' or candidate == 'Message' or candidate == 'Follow'
def is_mutual_connections_line(line: str) -> bool:
    """Return True when the line describes shared ("mutual") connections."""
    # End-anchored phrasings LinkedIn uses for mutual-connection blurbs.
    mutual_res = (
        r'mutual connections?$',
        r'is a mutual connection$',
        r'are mutual connections$',
        r'other connection[s]? work here$',
    )
    return any(re.search(p, line, re.IGNORECASE) for p in mutual_res)
def is_follower_count(line: str) -> bool:
    """Return True for follower-count lines such as '2K followers'."""
    candidate = line.strip()
    return re.match(r'^[\d,\.]+K?\s*followers?$', candidate, re.IGNORECASE) is not None
def is_employee_count(line: str) -> bool:
    """Return True for employee-count lines such as '51-200 employees'."""
    candidate = line.strip()
    return re.match(r'^[\d,\-]+ employees?$', candidate, re.IGNORECASE) is not None
def is_anonymous_name(name: str) -> bool:
    """Return True when *name* is a privacy-anonymised LinkedIn placeholder."""
    normalized = name.lower().strip()
    placeholder_res = (
        r'^linkedin\s*member$',
        r'^member$',
        r'^anonymous$',
    )
    return any(re.match(p, normalized) for p in placeholder_res)
def is_abbreviated_name(name: str) -> bool:
    """Return True when *name* contains an initial (privacy-protected form).

    Patterns detected:
    - "Amy B." (first name + single initial)
    - "Elisabeth V." (ends with initial)
    - "Tina M. Bastajian" (middle initial)
    - "S. Buse Yildirim" (first initial)
    """
    tokens = name.split()
    if not tokens:
        return False

    for token in tokens:
        bare = token.rstrip('.')
        # A lone letter (with or without its dot stripped) is an initial.
        if bare.isalpha() and len(bare) <= 1:
            return True
        # A two-character dotted token like "V." is also an initial.
        if len(token) <= 2 and token.endswith('.'):
            return True

    return False
def generate_staff_id(name: str, index: int, custodian_slug: str) -> str:
    """Build a unique staff identifier.

    Format: {custodian_slug}_staff_{index:04d}_{name_slug}

    Examples:
    - collectie-overijssel_staff_0001_vincent_robijn
    - nationaal-archief_staff_0042_afelonne_doek
    """
    # Strip diacritics: NFD-decompose, then drop combining marks ('Mn').
    decomposed = unicodedata.normalize('NFD', name.lower())
    ascii_name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Collapse everything that is not [a-z0-9] into single underscores.
    slug = re.sub(r'[^a-z0-9]+', '_', ascii_name)
    slug = re.sub(r'_+', '_', slug).strip('_')

    # Keep IDs short: cap the name part at 30 chars.
    if len(slug) > 30:
        slug = slug[:30].rstrip('_')

    return f"{custodian_slug}_staff_{index:04d}_{slug}"
def parse_degree(text: str) -> Optional[str]:
    """Extract the LinkedIn connection degree from *text*.

    Handles formats:
    - "Name 2nd degree connection · 2nd"
    - "3rd+ degree connection · 3rd" (note: 3rd+ in first part, 3rd in second)
    - "· 2nd" or "• 2nd", including standalone bullet lines

    Returns '1st', '2nd' or '3rd+' — a bare '3rd' bullet is normalised to
    '3rd+' (those entries are always outside the 2nd-degree circle) — or
    None when no degree marker is present.
    """
    # Pattern 1: "... 2nd degree connection". `rd\+?` accepts "3rd" and "3rd+",
    # and lowercasing the capture already yields the canonical form, so no
    # extra "3rd+" special-case is needed.
    match = re.search(r'(\d+(?:st|nd|rd\+?))\s*degree\s+connection', text, re.IGNORECASE)
    if match:
        return match.group(1).lower()

    # Pattern 2: bullet form "· 2nd" / "• 3rd" anywhere in the line. Because
    # this is an unanchored search, it also covers standalone "· 2nd" lines —
    # the previous third, fully-anchored pattern was unreachable dead code
    # and has been removed (behavior unchanged).
    match = re.search(r'[·•]\s*(1st|2nd|3rd\+?)', text)
    if match:
        degree = match.group(1)
        # A bare "3rd" bullet always denotes a 3rd+ connection.
        return '3rd+' if degree == '3rd' else degree

    return None
def extract_name_from_degree_line(line: str) -> str:
    """Extract just the name from a line like 'John Doe 2nd degree connection · 2nd'."""
    # Each (pattern, replacement, flags) removes one kind of decoration.
    removals = (
        # "... 2nd degree connection ..." suffix
        (r'\s*\d+(?:st|nd|rd|\+)?\s*degree\s+connection.*$', '', re.IGNORECASE),
        # trailing "· 2nd" bullet
        (r'\s*[·•]\s*(1st|2nd|3rd\+?)$', '', 0),
        # emoji indicators
        (r'\s*[🟥🟦🟧🟩🟨⬛⬜🏛️]+\s*', ' ', 0),
        # "is open to work" badge text
        (r'\s+is open to work$', '', re.IGNORECASE),
    )
    cleaned = line.strip()
    for pattern, replacement, flags in removals:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """Classify a headline as (heritage_relevant, heritage_type-or-None)."""
    text = headline.lower()

    # Explicit non-heritage signals win outright.
    if any(kw.lower() in text for kw in NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Typed heritage keywords; order matters — more specific categories first.
    for letter in ('A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C'):
        if any(kw.lower() in text for kw in HERITAGE_KEYWORDS.get(letter, [])):
            return (True, letter)

    # Generic heritage vocabulary: relevant, but no specific type assigned.
    generic_terms = (
        'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
        'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
        'preservation', 'conservation', 'behoud', 'restauratie',
    )
    if any(term in text for term in generic_terms):
        return (True, None)

    return (False, None)
def extract_custodian_metadata(lines: list[str]) -> dict[str, Any]:
    """
    Extract custodian organization metadata from the page header section.

    Only the first 30 lines are scanned. Recognised patterns:
    - "Collectie Overijssel logo"              -> name
    - "Museums, Historical Sites, and Zoos"    -> industry
    - "Zwolle, Overijssel"                     -> location {city, region}
    - "2K followers"                           -> follower_count
    - "51-200 employees"                       -> employee_count
    - "58 associated members"                  -> associated_members (int)

    Args:
        lines: Raw file lines in original order.

    Returns:
        Dict containing whichever of the keys above were found (may be empty).
    """
    metadata: dict[str, Any] = {}

    # Industry detection keywords — hoisted out of the loop (loop-invariant;
    # the original rebuilt this list on every iteration).
    industry_patterns = [
        'Museums', 'Archives', 'Libraries', 'Historical Sites',
        'Government', 'Cultural', 'Heritage', 'Education',
        'Research', 'Non-profit', 'Zoos'
    ]

    for line in lines[:30]:  # Header metadata only appears at the top.
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Logo line, e.g. "Collectie Overijssel logo" — name precedes " logo".
        if line.endswith(' logo'):
            metadata['name'] = line[:-5].strip()
            continue

        # Employee count, e.g. "51-200 employees"
        employee_match = re.match(r'^([\d,\-]+)\s*employees?$', line, re.IGNORECASE)
        if employee_match:
            metadata['employee_count'] = employee_match.group(1)
            continue

        # Follower count, e.g. "2K followers"
        follower_match = re.match(r'^([\d,\.]+K?)\s*followers?$', line, re.IGNORECASE)
        if follower_match:
            metadata['follower_count'] = follower_match.group(1)
            continue

        # Associated members count, e.g. "58 associated members"
        member_match = re.match(r'^(\d+)\s*associated\s+members?$', line, re.IGNORECASE)
        if member_match:
            metadata['associated_members'] = int(member_match.group(1))
            continue

        # Industry detection — the first line containing a keyword wins.
        if any(p.lower() in line.lower() for p in industry_patterns):
            if 'industry' not in metadata:
                metadata['industry'] = line
            continue

        # Location pattern: "City, Region" — only the first match is kept.
        loc_match = re.match(r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', line)
        if loc_match and 'location' not in metadata:
            metadata['location'] = {
                'city': loc_match.group(1),
                'region': loc_match.group(2)
            }
            continue

    return metadata
def is_likely_name_line(line: str) -> bool:
    """
    Heuristic: does this line look like a person's name?

    Accepts non-empty lines of at most 60 characters and 1-6 words that do
    not match any known non-name pattern. NOTE(review): the capitalisation
    check only rejects first characters that are neither uppercase nor
    alphabetic, so lowercase names still pass — preserved as-is.
    """
    candidate = line.strip()
    if not candidate or len(candidate) > 60:
        return False

    # Reject obvious UI / metadata lines.
    reject_res = (
        r'^Page \d+',
        r'^\d+\s*(st|nd|rd|th)',
        r'degree connection',
        r'mutual connection',
        r'followers?$',
        r'employees?$',
        r'^Search',
        r'^Where they',
        r'^What they',
        r'work here$',
        r'^Connect$',
        r'^Message$',
        r'^Follow$',
        r'logo$',
    )
    if any(re.search(p, candidate, re.IGNORECASE) for p in reject_res):
        return False

    # First character must be uppercase or at least a letter.
    if not candidate[0].isupper() and not candidate[0].isalpha():
        return False

    # Most names have between one and six words.
    return 1 <= len(candidate.split()) <= 6
def parse_staff_file(filepath: Path, custodian_name: str, custodian_slug: str) -> tuple[list[dict], dict]:
    """
    Parse a LinkedIn company staff page raw text file.

    The file structure has TWO formats:

    Format 1 (Company People page - Collectie Overijssel style):
        Name (line N)
        Name (line N+1, duplicate - optional)
        2nd degree connection · 2nd (line N+2 - STANDALONE degree line)
        Headline (line N+3)
        Mutual connections (line N+4)

    Format 2 (Nationaal Archief style):
        Name (line N)
        Name 2nd degree connection (line N+1 - name WITH degree)
        · 2nd (line N+2)
        Headline (line N+3)
        Mutual connections (line N+4)
        Connect (action button)

    Args:
        filepath: Path to the raw staff file
        custodian_name: Name of the custodian organization
        custodian_slug: Slug for generating staff IDs

    Returns:
        Tuple of (staff_list, custodian_metadata)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]

    # Extract custodian metadata from header
    custodian_metadata = extract_custodian_metadata(lines)
    if 'name' not in custodian_metadata:
        # Fall back to the CLI-supplied name when the header had no logo line.
        custodian_metadata['name'] = custodian_name

    staff: list[dict[str, Any]] = []
    seen_names: set[str] = set()  # de-duplicates repeated entries by exact name
    staff_index = 0               # sequence number fed into generate_staff_id()

    # Track anonymous members separately to assign unique IDs
    anonymous_count = 0

    # Single forward scan; `i` is advanced manually because each record spans
    # a variable number of lines.
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # PATTERN A: "LinkedIn Member" entries (anonymous, no degree line)
        # These appear outside the viewer's connection network
        if line == 'LinkedIn Member':
            # Check if next line is a headline (job title) or placeholder
            headline_line = ''
            lines_to_skip = 1  # At minimum, skip the "LinkedIn Member" line

            if i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                # Check if next line is a placeholder headline (empty/dash patterns)
                is_placeholder_headline = next_line in ('--', '-- ', '-', '.', 'notitle', '')

                # Check if it's a valid headline (contains custodian keywords or job indicators)
                custodian_keywords = custodian_name.lower().split()
                is_relevant_headline = (
                    any(kw in next_line.lower() for kw in custodian_keywords) or
                    any(kw in next_line.lower() for kw in ['bij', 'at', 'voor'])
                )

                # If placeholder, treat as empty headline but still include the member
                if is_placeholder_headline:
                    headline_line = ''  # No headline available
                    lines_to_skip = 2  # Skip both LinkedIn Member and placeholder line
                elif is_relevant_headline:
                    headline_line = next_line
                    lines_to_skip = 2  # Skip both LinkedIn Member and headline
                else:
                    # Next line is not a headline (maybe start of new entry) - member has no headline
                    headline_line = ''
                    lines_to_skip = 1  # Only skip LinkedIn Member line

            # Always create member record for LinkedIn Member entries
            anonymous_count += 1
            anonymous_id = f"anonymous_{anonymous_count:04d}"
            staff_id = generate_staff_id(anonymous_id, staff_index, custodian_slug)
            staff_index += 1

            member = {
                'staff_id': staff_id,
                'name': f"LinkedIn Member #{anonymous_count}",
                'name_type': 'anonymous',
                'degree': 'outside_network',  # No degree = outside connection circles
                'heritage_relevant': False,  # Will be updated below
            }

            # Add headline only if we have one
            if headline_line:
                member['headline'] = headline_line
                # Process heritage relevance
                is_relevant, heritage_type = detect_heritage_type(headline_line)
                member['heritage_relevant'] = is_relevant
                if heritage_type:
                    member['heritage_type'] = heritage_type

            staff.append(member)
            i += lines_to_skip
            continue

        # PATTERN B: Regular entries with degree lines
        degree = parse_degree(line)
        if degree:
            # Try to extract name from THIS line first (Format 2: "Name 2nd degree connection")
            name = extract_name_from_degree_line(line)

            # If no valid name on this line, look BACK for the name (Format 1)
            # Check: empty, same as original line, OR not a valid name pattern
            if not name or name == line or not is_likely_name_line(name):
                name = None  # Reset to ensure we look back
                # Look back for the name - it should be 1-2 lines above
                for lookback in range(1, 4):
                    if i - lookback >= 0:
                        prev_line = lines[i - lookback].strip()
                        if prev_line and is_likely_name_line(prev_line):
                            # Remove any trailing "is open to work" etc
                            name = re.sub(r'\s+is open to work$', '', prev_line, flags=re.IGNORECASE)
                            break

            # Skip if we couldn't find a valid name
            if not name or not is_likely_name_line(name):
                i += 1
                continue

            # Skip duplicates
            if name in seen_names:
                i += 1
                continue

            # Skip if name matches custodian name (org's own entry)
            if name.lower() == custodian_name.lower():
                i += 1
                continue

            # Determine name type
            if is_anonymous_name(name):
                name_type = 'anonymous'
            elif is_abbreviated_name(name):
                name_type = 'abbreviated'
            else:
                name_type = 'full'

            # Generate unique staff ID
            staff_id = generate_staff_id(name, staff_index, custodian_slug)
            staff_index += 1

            # Build staff member record
            member: dict[str, Any] = {
                'staff_id': staff_id,
                'name': name,
                'name_type': name_type,
                'degree': degree,
            }

            i += 1  # Move past degree line

            # Check if next line is just "· 2nd" (separate degree line) - skip it
            if i < len(lines) and re.match(r'^[·•]\s*(1st|2nd|3rd\+?)$', lines[i].strip()):
                i += 1

            # Skip empty lines
            while i < len(lines) and not lines[i].strip():
                i += 1

            # Next non-empty line should be headline (job title)
            if i < len(lines):
                headline_line = lines[i].strip()
                # Make sure it's not noise or the start of another person entry
                # NOTE: Don't filter by is_likely_name_line - headlines can look like names!
                if (not is_noise_line(headline_line) and
                    not parse_degree(headline_line) and
                    not is_action_button(headline_line) and
                    not is_mutual_connections_line(headline_line) and
                    not is_follower_count(headline_line) and
                    headline_line not in ('-', '.')):  # Skip placeholder headlines
                    member['headline'] = headline_line
                    i += 1

            # Skip to mutual connections or next entry
            while i < len(lines):
                check_line = lines[i].strip()

                # Capture mutual connections info
                if is_mutual_connections_line(check_line):
                    member['mutual_connections'] = check_line
                    i += 1
                    continue

                # Stop if we find a degree pattern (next staff member) OR LinkedIn Member
                if parse_degree(check_line) or check_line == 'LinkedIn Member':
                    break

                # Skip action buttons and noise
                i += 1

            # Process heritage relevance from headline
            headline = member.get('headline', '')
            if headline:
                is_relevant, heritage_type = detect_heritage_type(headline)
                member['heritage_relevant'] = is_relevant
                if heritage_type:
                    member['heritage_type'] = heritage_type
            else:
                member['heritage_relevant'] = False

            staff.append(member)
            seen_names.add(name)
        else:
            # Neither a LinkedIn Member marker nor a degree line: advance.
            i += 1

    return staff, custodian_metadata
def compute_staff_analysis(staff: list[dict]) -> dict:
    """Compute summary statistics for extracted staff members.

    Args:
        staff: Staff records as produced by parse_staff_file().

    Returns:
        Dict with totals, heritage-relevance counts/percentage, and
        breakdowns by heritage type, connection degree, name type and
        the 10 most common role keywords found in headlines.
    """
    total = len(staff)
    heritage_relevant = [s for s in staff if s.get('heritage_relevant', False)]
    heritage_count = len(heritage_relevant)

    # Count by heritage type (only records that carry a type letter).
    type_counts: Counter[str] = Counter()
    for s in heritage_relevant:
        ht = s.get('heritage_type')
        if ht:
            type_counts[ht] += 1

    # Count by connection degree and by name type.
    degree_counts: Counter[str] = Counter(s.get('degree', 'unknown') for s in staff)
    name_type_counts: Counter[str] = Counter(s.get('name_type', 'unknown') for s in staff)

    # Role keywords — hoisted out of the per-staff loop (loop-invariant; the
    # original rebuilt this list for every record).
    role_keywords = [
        'directeur', 'director', 'manager', 'coordinator', 'coördinator',
        'adviseur', 'advisor', 'medewerker', 'specialist', 'archivist',
        'archivaris', 'historicus', 'historian', 'curator', 'conservator',
        'beheerder', 'onderzoeker', 'researcher', 'projectleider'
    ]

    # Each keyword is counted at most once per headline containing it.
    role_counts: Counter[str] = Counter()
    for s in staff:
        headline = s.get('headline', '').lower()
        if headline:
            for keyword in role_keywords:
                if keyword.lower() in headline:
                    role_counts[keyword.title()] += 1

    return {
        'total_staff_extracted': total,
        'heritage_relevant_count': heritage_count,
        'heritage_relevant_percentage': round(heritage_count / total * 100, 1) if total > 0 else 0,
        'staff_by_heritage_type': dict(type_counts),
        'staff_by_degree': dict(degree_counts),
        'staff_by_name_type': dict(name_type_counts),
        'common_roles': dict(role_counts.most_common(10)),
    }
def _iso_timestamp_from_filename(filename: str) -> Optional[str]:
    """Convert a compact timestamp embedded in *filename* to ISO-8601 UTC.

    Recognises 'YYYYMMDDTHHMM' (13 chars) and 'YYYYMMDDTHHMMSS' (15 chars).
    Returns None when no such token is present or its length is unexpected.
    """
    match = re.search(r'(\d{8}T\d{4,6})', filename)
    if not match:
        return None
    ts = match.group(1)
    if len(ts) == 13:  # 20251210T0055 format (minute precision)
        return f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:00Z"
    if len(ts) == 15:  # 20251210T005500 format (second precision)
        return f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:{ts[13:15]}Z"
    return None


def create_output(
    staff: list[dict],
    custodian_metadata: dict,
    custodian_name: str,
    custodian_slug: str,
    input_file: Path,
) -> dict:
    """Create the full output JSON structure.

    Args:
        staff: Parsed staff records from parse_staff_file().
        custodian_metadata: Header metadata extracted from the raw file.
        custodian_name: CLI-supplied custodian name.
        custodian_slug: CLI-supplied custodian slug.
        input_file: Raw register file; its name supplies the scrape timestamp.

    Returns:
        Dict with custodian_metadata, source_metadata, staff, staff_analysis
        and provenance sections, ready for JSON serialization.
    """
    analysis = compute_staff_analysis(staff)

    # Prefer the timestamp embedded in the filename; otherwise use "now" (UTC).
    # The duplicated inline formatting branches were extracted into the
    # _iso_timestamp_from_filename helper above.
    scraped_ts = (
        _iso_timestamp_from_filename(input_file.name)
        or datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    )

    output = {
        'custodian_metadata': {
            'custodian_name': custodian_name,
            'custodian_slug': custodian_slug,
            **custodian_metadata,
        },
        'source_metadata': {
            'source_type': 'linkedin_company_people_page',
            'registered_timestamp': scraped_ts,
            'registration_method': 'manual_linkedin_browse',
            'staff_extracted': len(staff),
            'notes': f"Staff extracted from LinkedIn company People page. Raw register in {input_file.name}"
        },
        'staff': staff,
        'staff_analysis': analysis,
        'provenance': {
            'data_source': 'LINKEDIN_MANUAL_REGISTER',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': scraped_ts,
            'extraction_method': 'manual_browse_copy_paste',
            'raw_source_file': input_file.name,
            'processed_by': 'parse_custodian_staff.py',
            'processing_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        }
    }

    return output
def main():
    """CLI entry point: parse one raw staff file, print a summary, and
    write the JSON output (unless --dry-run is given)."""
    parser = argparse.ArgumentParser(
        description='Parse LinkedIn company staff pages from raw manual register files.'
    )
    parser.add_argument('input_file', type=Path, help='Input raw text file')
    parser.add_argument('output_file', type=Path, help='Output JSON file')
    parser.add_argument('--custodian-name', required=True, help='Name of the custodian organization')
    parser.add_argument('--custodian-slug', required=True, help='Slug for generating staff IDs')
    parser.add_argument('--dry-run', action='store_true', help='Parse but do not write output')

    args = parser.parse_args()

    if not args.input_file.exists():
        print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing staff from: {args.input_file}")
    staff, custodian_metadata = parse_staff_file(
        args.input_file,
        args.custodian_name,
        args.custodian_slug
    )
    print(f"Extracted {len(staff)} unique staff members")

    if custodian_metadata:
        print(f"\nCustodian Metadata:")
        for key, value in custodian_metadata.items():
            print(f"  {key}: {value}")

    output = create_output(
        staff,
        custodian_metadata,
        args.custodian_name,
        args.custodian_slug,
        args.input_file,
    )

    # Summary statistics for a quick eyeball check of parse quality.
    analysis = output['staff_analysis']
    print(f"\nStaff Analysis:")
    print(f"  Total staff: {analysis['total_staff_extracted']}")
    print(f"  Heritage-relevant: {analysis['heritage_relevant_count']} ({analysis['heritage_relevant_percentage']}%)")
    print(f"  By type: {analysis['staff_by_heritage_type']}")
    print(f"  By degree: {analysis['staff_by_degree']}")
    print(f"  By name type: {analysis['staff_by_name_type']}")

    if analysis['common_roles']:
        print(f"  Common roles:")
        for role, count in list(analysis['common_roles'].items())[:5]:
            print(f"    - {role}: {count}")

    if args.dry_run:
        # Dry run: show a small sample instead of writing the JSON file.
        print("\n[Dry run - not writing output]")
        print("\nSample staff (first 5):")
        for s in staff[:5]:
            print(f"  - {s['name']} ({s['degree']})")
            print(f"    Headline: {s.get('headline', 'N/A')[:60]}")
            print(f"    Heritage: {s.get('heritage_relevant', False)} ({s.get('heritage_type', '-')})")
    else:
        args.output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
        print(f"\nWrote output to: {args.output_file}")
# Script entry point — only run the CLI when executed directly, not on import.
if __name__ == '__main__':
    main()