- batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation
680 lines · 24 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Parse LinkedIn connections from raw manual scrape files.
|
|
|
|
This script processes raw text exports from LinkedIn connection search pages
|
|
and extracts structured connection data following Rule 15 (Connection Data Registration).
|
|
|
|
Usage:
|
|
python scripts/parse_linkedin_connections.py <input_file> <output_file> --target-name "Name" --target-slug "slug"
|
|
|
|
Example:
|
|
python scripts/parse_linkedin_connections.py \
|
|
data/custodian/person/manual_register/elif-rongen-kaynakci-35295a17_connections_20251209T220000Z.md \
|
|
data/custodian/person/elif-rongen-kaynakci-35295a17_connections_20251209T220000Z.json \
|
|
--target-name "Elif Rongen-Kaynakçi" \
|
|
--target-slug "elif-rongen-kaynakci-35295a17"
|
|
"""
|
|
|
|
import argparse
import json
import re
import sys
import unicodedata
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
|
|
|
|
|
|
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
|
|
HERITAGE_KEYWORDS = {
|
|
# G - Gallery
|
|
'G': [
|
|
'gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery',
|
|
'exhibition space', 'tentoonstellingsruimte'
|
|
],
|
|
# L - Library
|
|
'L': [
|
|
'library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris',
|
|
'KB ', 'national library', 'universiteitsbiblio', 'UB '
|
|
],
|
|
# A - Archive
|
|
'A': [
|
|
'archive', 'archief', 'archivist', 'archivaris', 'archival',
|
|
'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum',
|
|
'eye film', 'EYE ', 'audiovisual', 'audiovisueel',
|
|
'sound and vision', 'nationaal archief', 'stadsarchief',
|
|
'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG'
|
|
],
|
|
# M - Museum
|
|
'M': [
|
|
'museum', 'musea', 'curator', 'conservator', 'collection manager',
|
|
'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
|
|
'tropenmuseum', 'allard pierson', 'museale'
|
|
],
|
|
# O - Official Institution
|
|
'O': [
|
|
'ministry', 'ministerie', 'government', 'overheid', 'gemeente',
|
|
'province', 'provincie', 'OCW', 'ministerie van'
|
|
],
|
|
# R - Research Center
|
|
'R': [
|
|
'research', 'onderzoek', 'researcher', 'onderzoeker',
|
|
'KNAW', 'humanities cluster', 'NWO', 'think tank',
|
|
'documentatie', 'documentation', 'kenniscentrum'
|
|
],
|
|
# C - Corporation (Corporate heritage)
|
|
'C': [
|
|
'corporate archive', 'bedrijfsarchief', 'company history',
|
|
'shell', 'philips', 'heineken'
|
|
],
|
|
# E - Education Provider
|
|
'E': [
|
|
'university', 'universiteit', 'professor', 'lecturer', 'docent',
|
|
'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate',
|
|
'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ',
|
|
'leiden university', 'utrecht university', 'UU ', 'TU ',
|
|
'reinwardt', 'film academy', 'filmacademie', 'graduate',
|
|
'assistant professor', 'associate professor', 'hoogleraar'
|
|
],
|
|
# S - Collecting Society
|
|
'S': [
|
|
'society', 'vereniging', 'genootschap', 'historical society',
|
|
'historische vereniging', 'heemkunde'
|
|
],
|
|
# D - Digital Platform
|
|
'D': [
|
|
'digital', 'digitaal', 'platform', 'software', 'IT ', 'tech',
|
|
'developer', 'engineer', 'data ', 'AI ', 'machine learning'
|
|
],
|
|
}
|
|
|
|
# Non-heritage keywords (to mark as heritage_relevant=False)
|
|
NON_HERITAGE_KEYWORDS = [
|
|
'marketing', 'sales', 'HR ', 'human resources', 'recruiter',
|
|
'finance', 'accounting', 'legal', 'lawyer', 'advocaat',
|
|
'consultant', 'coach', 'therapy', 'health', 'medical',
|
|
'food', 'restaurant', 'retail', 'fashion', 'real estate',
|
|
'insurance', 'banking', 'investment', 'e-commerce',
|
|
'organiser', 'opruimhulp', 'verpleeg', 'nurse'
|
|
]
|
|
|
|
# Organizations that are explicitly NOT heritage institutions
|
|
# These should never be classified as heritage-relevant
|
|
NON_HERITAGE_ORGANIZATIONS = [
|
|
# Banks & Financial
|
|
'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
|
|
# Security companies
|
|
'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
|
|
# Police/Government (non-cultural)
|
|
'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
|
|
# Political parties
|
|
'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
|
|
'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
|
|
# Tech companies (non-heritage)
|
|
'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
|
|
'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
|
|
'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
|
|
# Telecom
|
|
'kpn', 'vodafone', 't-mobile', 'ziggo',
|
|
# Postal / Logistics
|
|
'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
|
|
# Healthcare
|
|
'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
|
|
# Retail
|
|
'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
|
|
# Consulting / Professional services
|
|
'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
|
|
'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
|
|
# Recruitment / HR
|
|
'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
|
|
# Energy / Utilities
|
|
'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
|
|
# Transport
|
|
'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
|
|
# Other
|
|
'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
|
|
]
|
|
|
|
# Heritage organization keywords - organizations that ARE heritage institutions
|
|
# Used to validate that 'D' (Digital) roles are actually at heritage orgs
|
|
HERITAGE_ORGANIZATION_KEYWORDS = [
|
|
# Archives
|
|
'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
|
|
'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
|
|
# Museums
|
|
'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
|
|
'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
|
|
# Libraries
|
|
'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
|
|
# Film/AV heritage
|
|
'eye film', 'filmmuseum', 'eye ', 'sound and vision',
|
|
# Heritage platforms
|
|
'erfgoed', 'heritage', 'cultural', 'cultureel',
|
|
# Research institutes (heritage-focused)
|
|
'knaw', 'humanities cluster', 'meertens', 'huygens',
|
|
]
|
|
|
|
# Lines that indicate LinkedIn UI noise (to skip entirely)
|
|
NOISE_EXACT = {
|
|
'0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
|
|
'Notifications', 'Me', 'For Business', 'Learning', 'People',
|
|
'1st', '2nd', '3rd+', 'Locations', 'Current companies', 'All filters',
|
|
'Reset', 'Connect', 'Message', 'Follow', 'Previous', 'Next',
|
|
'About', 'Accessibility', 'Help Center', 'Privacy & Terms',
|
|
'Ad Choices', 'Advertising', 'Business Services', 'Get the LinkedIn app',
|
|
'More', 'Compose message', 'Actively hiring',
|
|
}
|
|
|
|
NOISE_PATTERNS = [
|
|
r'^\d+$', # Just a number
|
|
r'^\d+ notifications?$',
|
|
r'^LinkedIn Corporation',
|
|
r'^You are on the messaging overlay',
|
|
r'Status is online$',
|
|
r'^MessagingYou are on the messaging',
|
|
r'^Are these results helpful',
|
|
r'^Your feedback helps',
|
|
r'^\d+K? followers?$',
|
|
]
|
|
|
|
|
|
def is_noise_line(line: str) -> bool:
|
|
"""Check if a line is LinkedIn UI noise that should be skipped."""
|
|
line = line.strip()
|
|
if not line:
|
|
return True
|
|
|
|
if line in NOISE_EXACT:
|
|
return True
|
|
|
|
for pattern in NOISE_PATTERNS:
|
|
if re.match(pattern, line, re.IGNORECASE):
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def is_action_button(line: str) -> bool:
|
|
"""Check if line is an action button."""
|
|
return line.strip() in ('Connect', 'Message', 'Follow')
|
|
|
|
|
|
def is_mutual_connections_line(line: str) -> bool:
|
|
"""Check if line describes mutual connections."""
|
|
patterns = [
|
|
r'mutual connections?$',
|
|
r'is a mutual connection$',
|
|
r'are mutual connections$',
|
|
]
|
|
for pattern in patterns:
|
|
if re.search(pattern, line, re.IGNORECASE):
|
|
return True
|
|
return False
|
|
|
|
|
|
def is_follower_count(line: str) -> bool:
|
|
"""Check if line is a follower count."""
|
|
return bool(re.match(r'^[\d,\.]+K?\s*followers?$', line.strip(), re.IGNORECASE))
|
|
|
|
|
|
def is_anonymous_name(name: str) -> bool:
|
|
"""Check if name is an anonymous LinkedIn Member."""
|
|
anonymous_patterns = [
|
|
r'^linkedin\s*member$',
|
|
r'^member$',
|
|
r'^anonymous$',
|
|
]
|
|
name_lower = name.lower().strip()
|
|
return any(re.match(p, name_lower) for p in anonymous_patterns)
|
|
|
|
|
|
def is_abbreviated_name(name: str) -> bool:
|
|
"""
|
|
Check if name contains abbreviations (privacy-protected).
|
|
|
|
Patterns detected:
|
|
- "Amy B." (first name + single initial)
|
|
- "Elisabeth V." (ends with initial)
|
|
- "Tina M. Bastajian" (middle initial)
|
|
- "S. Buse Yildirim" (first initial)
|
|
- "İ. Can Koç" (first initial with Turkish chars)
|
|
"""
|
|
parts = name.split()
|
|
if not parts:
|
|
return False
|
|
|
|
# Check for single-letter initial patterns
|
|
for part in parts:
|
|
# Remove any trailing periods for checking
|
|
clean_part = part.rstrip('.')
|
|
# Single letter or single letter with period = initial
|
|
if len(clean_part) <= 1 and clean_part.isalpha():
|
|
return True
|
|
# Ends with period and is 2 chars (like "M.")
|
|
if part.endswith('.') and len(part) <= 2:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def generate_connection_id(name: str, index: int, target_slug: str) -> str:
|
|
"""
|
|
Generate a unique identifier for a connection.
|
|
|
|
Format: {target_slug}_conn_{index:04d}_{name_slug}
|
|
|
|
Examples:
|
|
- elif-rongen-kaynakci-35295a17_conn_0042_amy_b
|
|
- elif-rongen-kaynakci-35295a17_conn_0156_linkedin_member
|
|
"""
|
|
import unicodedata
|
|
|
|
# Normalize unicode and convert to ASCII-safe slug
|
|
normalized = unicodedata.normalize('NFD', name.lower())
|
|
ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
|
|
|
|
# Replace spaces and special chars with underscores
|
|
name_slug = re.sub(r'[^a-z0-9]+', '_', ascii_name)
|
|
name_slug = re.sub(r'_+', '_', name_slug).strip('_')
|
|
|
|
# Truncate if too long
|
|
if len(name_slug) > 30:
|
|
name_slug = name_slug[:30].rstrip('_')
|
|
|
|
return f"{target_slug}_conn_{index:04d}_{name_slug}"
|
|
|
|
|
|
def parse_degree(text: str) -> Optional[str]:
|
|
"""Extract connection degree from name line."""
|
|
match = re.search(r'•\s*(1st|2nd|3rd\+)', text)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
|
|
def extract_name_from_degree_line(line: str) -> str:
|
|
"""Extract just the name from a line like 'John Doe • 2nd'."""
|
|
name = re.sub(r'\s*•\s*(1st|2nd|3rd\+)$', '', line.strip())
|
|
# Remove emoji indicators like 🟥
|
|
name = re.sub(r'\s*[🟥🟦🟧🟩🟨⬛⬜]+\s*', ' ', name)
|
|
return name.strip()
|
|
|
|
|
|
def is_location_line(line: str) -> bool:
|
|
"""Check if line looks like a location."""
|
|
location_patterns = [
|
|
r'Netherlands$',
|
|
r'Germany$',
|
|
r'Belgium$',
|
|
r'United Kingdom$',
|
|
r'France$',
|
|
r'Denmark$',
|
|
r'Türkiye$',
|
|
r'Turkey$',
|
|
r'Spain$',
|
|
r'Italy$',
|
|
r'Austria$',
|
|
r'Switzerland$',
|
|
r'Poland$',
|
|
r', [A-Z][a-z]+(,| [A-Z])', # City, Region pattern
|
|
r'Area$',
|
|
r'Region$',
|
|
r'Metropolitan',
|
|
r'The Randstad',
|
|
]
|
|
for pattern in location_patterns:
|
|
if re.search(pattern, line.strip(), re.IGNORECASE):
|
|
return True
|
|
return False
|
|
|
|
|
|
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
|
|
"""
|
|
Detect if a headline is heritage-relevant and what type.
|
|
|
|
Two-stage classification:
|
|
1. Check if organization is explicitly non-heritage (blocklist)
|
|
2. Check if role/organization matches heritage patterns
|
|
|
|
For 'D' (Digital) type, require BOTH a tech role AND a heritage organization.
|
|
"""
|
|
headline_lower = headline.lower()
|
|
|
|
# Stage 1: Check for non-heritage organizations (blocklist)
|
|
for org in NON_HERITAGE_ORGANIZATIONS:
|
|
if org.lower() in headline_lower:
|
|
return (False, None)
|
|
|
|
# Stage 2: Check for non-heritage role indicators
|
|
for keyword in NON_HERITAGE_KEYWORDS:
|
|
if keyword.lower() in headline_lower:
|
|
return (False, None)
|
|
|
|
# Stage 3: Check if this is a heritage organization
|
|
is_heritage_org = False
|
|
for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS:
|
|
if org_keyword.lower() in headline_lower:
|
|
is_heritage_org = True
|
|
break
|
|
|
|
# Check heritage keywords by type (order matters - more specific first)
|
|
# 'D' (Digital) is checked last and requires heritage org validation
|
|
type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'] # D removed from here
|
|
|
|
for heritage_type in type_order:
|
|
keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
|
|
for keyword in keywords:
|
|
if keyword.lower() in headline_lower:
|
|
return (True, heritage_type)
|
|
|
|
# Special handling for 'D' (Digital) - ONLY if at a heritage organization
|
|
# This prevents generic IT workers from being classified as heritage-relevant
|
|
if is_heritage_org:
|
|
digital_keywords = HERITAGE_KEYWORDS.get('D', [])
|
|
for keyword in digital_keywords:
|
|
if keyword.lower() in headline_lower:
|
|
return (True, 'D')
|
|
|
|
# Generic heritage terms (without specific type)
|
|
generic_heritage = [
|
|
'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
|
|
'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
|
|
'preservation', 'conservation', 'collection'
|
|
]
|
|
for keyword in generic_heritage:
|
|
if keyword in headline_lower:
|
|
return (True, None)
|
|
|
|
return (False, None)
|
|
|
|
|
|
def extract_organization(headline: str) -> Optional[str]:
|
|
"""Try to extract organization name from headline."""
|
|
patterns = [
|
|
r'\bat\s+(?:the\s+)?(.+?)(?:\s*[|/]|$)',
|
|
r'\bbij\s+(?:het\s+|de\s+)?(.+?)(?:\s*[|/]|$)',
|
|
r'\b@\s*(.+?)(?:\s*[|/]|$)',
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, headline, re.IGNORECASE)
|
|
if match:
|
|
org = match.group(1).strip()
|
|
org = re.sub(r'\s*[|/].*$', '', org)
|
|
if len(org) > 3: # Avoid very short matches
|
|
return org
|
|
|
|
return None
|
|
|
|
|
|
def parse_connections_file(filepath: Path, target_name: str, target_slug: str) -> list[dict]:
|
|
"""
|
|
Parse a LinkedIn connections raw text file using a line-by-line approach.
|
|
|
|
The expected pattern for each connection is:
|
|
1. Name (standalone, optional - sometimes missing)
|
|
2. Name • degree (e.g., "John Doe • 2nd")
|
|
3. Empty line
|
|
4. Headline
|
|
5. Empty line
|
|
6. Location
|
|
7. Empty line
|
|
8. Action button (Connect/Message/Follow)
|
|
9. Follower count (optional, for Follow)
|
|
10. Mutual connections (optional)
|
|
|
|
Args:
|
|
filepath: Path to the raw connections file
|
|
target_name: Name of the person whose connections we're parsing
|
|
target_slug: LinkedIn slug of the target person (for generating connection_id)
|
|
"""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
lines = [line.rstrip('\n') for line in f]
|
|
|
|
connections = []
|
|
seen_names = set()
|
|
connection_index = 0 # Counter for unique connection IDs
|
|
|
|
i = 0
|
|
while i < len(lines):
|
|
line = lines[i].strip()
|
|
|
|
# Look for degree pattern - this is the definitive start of a connection
|
|
degree = parse_degree(line)
|
|
if degree:
|
|
name = extract_name_from_degree_line(line)
|
|
|
|
# Skip target's own name and duplicates
|
|
if name == target_name or name in seen_names:
|
|
i += 1
|
|
continue
|
|
|
|
# Determine name type (full, abbreviated, or anonymous)
|
|
if is_anonymous_name(name):
|
|
name_type = 'anonymous'
|
|
elif is_abbreviated_name(name):
|
|
name_type = 'abbreviated'
|
|
else:
|
|
name_type = 'full'
|
|
|
|
# Generate unique connection ID
|
|
connection_id = generate_connection_id(name, connection_index, target_slug)
|
|
connection_index += 1
|
|
|
|
# Found a new connection - now extract following fields
|
|
connection: dict[str, Any] = {
|
|
'connection_id': connection_id,
|
|
'name': name,
|
|
'name_type': name_type,
|
|
'degree': degree,
|
|
}
|
|
|
|
i += 1 # Move past the name+degree line
|
|
|
|
# Skip empty lines
|
|
while i < len(lines) and not lines[i].strip():
|
|
i += 1
|
|
|
|
# Next non-empty line should be headline
|
|
if i < len(lines):
|
|
headline_line = lines[i].strip()
|
|
# Make sure it's not noise or another connection
|
|
if (not is_noise_line(headline_line) and
|
|
not parse_degree(headline_line) and
|
|
not is_action_button(headline_line) and
|
|
not is_mutual_connections_line(headline_line) and
|
|
not is_follower_count(headline_line)):
|
|
connection['headline'] = headline_line
|
|
i += 1
|
|
|
|
# Skip empty lines
|
|
while i < len(lines) and not lines[i].strip():
|
|
i += 1
|
|
|
|
# Next might be location
|
|
if i < len(lines):
|
|
loc_line = lines[i].strip()
|
|
if (is_location_line(loc_line) and
|
|
not is_action_button(loc_line) and
|
|
not parse_degree(loc_line)):
|
|
connection['location'] = loc_line
|
|
i += 1
|
|
|
|
# Skip remaining fields until next connection
|
|
while i < len(lines):
|
|
check_line = lines[i].strip()
|
|
# Stop if we find a degree pattern (next connection)
|
|
if parse_degree(check_line):
|
|
break
|
|
i += 1
|
|
|
|
# Process the connection
|
|
headline = connection.get('headline', '')
|
|
if headline:
|
|
org = extract_organization(headline)
|
|
if org:
|
|
connection['organization'] = org
|
|
|
|
is_relevant, heritage_type = detect_heritage_type(headline)
|
|
connection['heritage_relevant'] = is_relevant
|
|
if heritage_type:
|
|
connection['heritage_type'] = heritage_type
|
|
else:
|
|
connection['heritage_relevant'] = False
|
|
|
|
connections.append(connection)
|
|
seen_names.add(name)
|
|
else:
|
|
i += 1
|
|
|
|
return connections
|
|
|
|
|
|
def compute_network_analysis(connections: list[dict]) -> dict:
|
|
"""Compute network analysis statistics from connections."""
|
|
total = len(connections)
|
|
heritage_relevant = [c for c in connections if c.get('heritage_relevant', False)]
|
|
heritage_count = len(heritage_relevant)
|
|
|
|
# Count by heritage type
|
|
type_counts: Counter[str] = Counter()
|
|
for c in heritage_relevant:
|
|
ht = c.get('heritage_type')
|
|
if ht:
|
|
type_counts[ht] += 1
|
|
|
|
# Count by organization
|
|
org_counts: dict[str, dict[str, int | str | None]] = {}
|
|
for c in heritage_relevant:
|
|
org = c.get('organization')
|
|
if org:
|
|
if org not in org_counts:
|
|
org_counts[org] = {'count': 0, 'heritage_type': None}
|
|
org_counts[org]['count'] = int(org_counts[org].get('count') or 0) + 1
|
|
if c.get('heritage_type'):
|
|
org_counts[org]['heritage_type'] = c['heritage_type']
|
|
|
|
# Sort organizations by count
|
|
top_orgs = sorted(
|
|
[{'organization': k, 'count': v.get('count', 0), 'heritage_type': v.get('heritage_type')} for k, v in org_counts.items()],
|
|
key=lambda x: int(x.get('count') or 0),
|
|
reverse=True
|
|
)[:15]
|
|
|
|
return {
|
|
'total_connections_extracted': total,
|
|
'heritage_relevant_count': heritage_count,
|
|
'heritage_relevant_percentage': round(heritage_count / total * 100, 1) if total > 0 else 0,
|
|
'connections_by_heritage_type': dict(type_counts),
|
|
'top_organizations': top_orgs,
|
|
}
|
|
|
|
|
|
def create_output(
|
|
connections: list[dict],
|
|
target_name: str,
|
|
target_slug: str,
|
|
input_file: Path,
|
|
target_org: Optional[str] = None,
|
|
) -> dict:
|
|
"""Create the full output JSON structure."""
|
|
|
|
network_analysis = compute_network_analysis(connections)
|
|
|
|
source_url = "https://www.linkedin.com/search/results/people/?network=%5B%22F%22%2C%22S%22%2C%22O%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH"
|
|
|
|
# Extract timestamp from filename
|
|
timestamp_match = re.search(r'_(\d{8}T\d{6}Z)', input_file.name)
|
|
if timestamp_match:
|
|
ts = timestamp_match.group(1)
|
|
scraped_ts = f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:{ts[13:15]}Z"
|
|
else:
|
|
scraped_ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
|
|
|
output = {
|
|
'source_metadata': {
|
|
'source_url': source_url,
|
|
'scraped_timestamp': scraped_ts,
|
|
'scrape_method': 'manual_linkedin_browse',
|
|
'target_profile': target_slug,
|
|
'target_name': target_name,
|
|
'connections_extracted': len(connections),
|
|
'notes': f"Extracted from LinkedIn connections search. Raw scrape in {input_file.name}"
|
|
},
|
|
'connections': connections,
|
|
'network_analysis': network_analysis,
|
|
'provenance': {
|
|
'data_source': 'LINKEDIN_SCRAPE',
|
|
'data_tier': 'TIER_3_CROWD_SOURCED',
|
|
'extraction_date': scraped_ts,
|
|
'extraction_method': 'manual_browse_copy_paste',
|
|
'raw_source_file': input_file.name,
|
|
'processed_by': 'parse_linkedin_connections.py',
|
|
'processing_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
|
|
}
|
|
}
|
|
|
|
if target_org:
|
|
output['source_metadata']['target_organization'] = target_org
|
|
|
|
return output
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description='Parse LinkedIn connections from raw manual scrape files.'
|
|
)
|
|
parser.add_argument('input_file', type=Path, help='Input raw text file')
|
|
parser.add_argument('output_file', type=Path, help='Output JSON file')
|
|
parser.add_argument('--target-name', required=True, help='Name of the person')
|
|
parser.add_argument('--target-slug', required=True, help='LinkedIn slug')
|
|
parser.add_argument('--target-org', help='Current organization')
|
|
parser.add_argument('--dry-run', action='store_true', help='Parse but do not write')
|
|
|
|
args = parser.parse_args()
|
|
|
|
if not args.input_file.exists():
|
|
print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
print(f"Parsing connections from: {args.input_file}")
|
|
connections = parse_connections_file(args.input_file, args.target_name, args.target_slug)
|
|
print(f"Extracted {len(connections)} unique connections")
|
|
|
|
output = create_output(
|
|
connections,
|
|
args.target_name,
|
|
args.target_slug,
|
|
args.input_file,
|
|
args.target_org,
|
|
)
|
|
|
|
analysis = output['network_analysis']
|
|
print(f"\nNetwork Analysis:")
|
|
print(f" Total connections: {analysis['total_connections_extracted']}")
|
|
print(f" Heritage-relevant: {analysis['heritage_relevant_count']} ({analysis['heritage_relevant_percentage']}%)")
|
|
print(f" By type: {analysis['connections_by_heritage_type']}")
|
|
|
|
if analysis['top_organizations']:
|
|
print(f" Top organizations:")
|
|
for org in analysis['top_organizations'][:5]:
|
|
print(f" - {org['organization']}: {org['count']}")
|
|
|
|
if args.dry_run:
|
|
print("\n[Dry run - not writing output]")
|
|
print("\nSample connections (first 5):")
|
|
for c in connections[:5]:
|
|
print(f" - {c['name']} ({c['degree']})")
|
|
print(f" Headline: {c.get('headline', 'N/A')[:70]}")
|
|
print(f" Location: {c.get('location', 'N/A')}")
|
|
print(f" Heritage: {c.get('heritage_relevant', False)} ({c.get('heritage_type', '-')})")
|
|
else:
|
|
args.output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(args.output_file, 'w', encoding='utf-8') as f:
|
|
json.dump(output, f, indent=2, ensure_ascii=False)
|
|
print(f"\nWrote output to: {args.output_file}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|