- batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation
680 lines · 24 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Parse LinkedIn connections from raw manual scrape files.
|
|
|
|
This script processes raw text exports from LinkedIn connection search pages
|
|
and extracts structured connection data following Rule 15 (Connection Data Registration).
|
|
|
|
Usage:
|
|
python scripts/parse_linkedin_connections.py <input_file> <output_file> --target-name "Name" --target-slug "slug"
|
|
|
|
Example:
|
|
python scripts/parse_linkedin_connections.py \
|
|
data/custodian/person/manual_register/elif-rongen-kaynakci-35295a17_connections_20251209T220000Z.md \
|
|
data/custodian/person/elif-rongen-kaynakci-35295a17_connections_20251209T220000Z.json \
|
|
--target-name "Elif Rongen-Kaynakçi" \
|
|
--target-slug "elif-rongen-kaynakci-35295a17"
|
|
"""
|
|
|
|
import argparse
import json
import re
import sys
import unicodedata
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
|
|
|
|
|
|
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
|
|
HERITAGE_KEYWORDS = {
|
|
# G - Gallery
|
|
'G': [
|
|
'gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery',
|
|
'exhibition space', 'tentoonstellingsruimte'
|
|
],
|
|
# L - Library
|
|
'L': [
|
|
'library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris',
|
|
'KB ', 'national library', 'universiteitsbiblio', 'UB '
|
|
],
|
|
# A - Archive
|
|
'A': [
|
|
'archive', 'archief', 'archivist', 'archivaris', 'archival',
|
|
'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum',
|
|
'eye film', 'EYE ', 'audiovisual', 'audiovisueel',
|
|
'sound and vision', 'nationaal archief', 'stadsarchief',
|
|
'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG'
|
|
],
|
|
# M - Museum
|
|
'M': [
|
|
'museum', 'musea', 'curator', 'conservator', 'collection manager',
|
|
'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
|
|
'tropenmuseum', 'allard pierson', 'museale'
|
|
],
|
|
# O - Official Institution
|
|
'O': [
|
|
'ministry', 'ministerie', 'government', 'overheid', 'gemeente',
|
|
'province', 'provincie', 'OCW', 'ministerie van'
|
|
],
|
|
# R - Research Center
|
|
'R': [
|
|
'research', 'onderzoek', 'researcher', 'onderzoeker',
|
|
'KNAW', 'humanities cluster', 'NWO', 'think tank',
|
|
'documentatie', 'documentation', 'kenniscentrum'
|
|
],
|
|
# C - Corporation (Corporate heritage)
|
|
'C': [
|
|
'corporate archive', 'bedrijfsarchief', 'company history',
|
|
'shell', 'philips', 'heineken'
|
|
],
|
|
# E - Education Provider
|
|
'E': [
|
|
'university', 'universiteit', 'professor', 'lecturer', 'docent',
|
|
'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate',
|
|
'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ',
|
|
'leiden university', 'utrecht university', 'UU ', 'TU ',
|
|
'reinwardt', 'film academy', 'filmacademie', 'graduate',
|
|
'assistant professor', 'associate professor', 'hoogleraar'
|
|
],
|
|
# S - Collecting Society
|
|
'S': [
|
|
'society', 'vereniging', 'genootschap', 'historical society',
|
|
'historische vereniging', 'heemkunde'
|
|
],
|
|
# D - Digital Platform
|
|
'D': [
|
|
'digital', 'digitaal', 'platform', 'software', 'IT ', 'tech',
|
|
'developer', 'engineer', 'data ', 'AI ', 'machine learning'
|
|
],
|
|
}
|
|
|
|
# Non-heritage keywords (to mark as heritage_relevant=False)
|
|
NON_HERITAGE_KEYWORDS = [
|
|
'marketing', 'sales', 'HR ', 'human resources', 'recruiter',
|
|
'finance', 'accounting', 'legal', 'lawyer', 'advocaat',
|
|
'consultant', 'coach', 'therapy', 'health', 'medical',
|
|
'food', 'restaurant', 'retail', 'fashion', 'real estate',
|
|
'insurance', 'banking', 'investment', 'e-commerce',
|
|
'organiser', 'opruimhulp', 'verpleeg', 'nurse'
|
|
]
|
|
|
|
# Organizations that are explicitly NOT heritage institutions
|
|
# These should never be classified as heritage-relevant
|
|
NON_HERITAGE_ORGANIZATIONS = [
|
|
# Banks & Financial
|
|
'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
|
|
# Security companies
|
|
'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
|
|
# Police/Government (non-cultural)
|
|
'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
|
|
# Political parties
|
|
'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
|
|
'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
|
|
# Tech companies (non-heritage)
|
|
'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
|
|
'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
|
|
'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
|
|
# Telecom
|
|
'kpn', 'vodafone', 't-mobile', 'ziggo',
|
|
# Postal / Logistics
|
|
'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
|
|
# Healthcare
|
|
'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
|
|
# Retail
|
|
'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
|
|
# Consulting / Professional services
|
|
'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
|
|
'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
|
|
# Recruitment / HR
|
|
'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
|
|
# Energy / Utilities
|
|
'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
|
|
# Transport
|
|
'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
|
|
# Other
|
|
'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
|
|
]
|
|
|
|
# Heritage organization keywords - organizations that ARE heritage institutions
|
|
# Used to validate that 'D' (Digital) roles are actually at heritage orgs
|
|
HERITAGE_ORGANIZATION_KEYWORDS = [
|
|
# Archives
|
|
'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
|
|
'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
|
|
# Museums
|
|
'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
|
|
'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
|
|
# Libraries
|
|
'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
|
|
# Film/AV heritage
|
|
'eye film', 'filmmuseum', 'eye ', 'sound and vision',
|
|
# Heritage platforms
|
|
'erfgoed', 'heritage', 'cultural', 'cultureel',
|
|
# Research institutes (heritage-focused)
|
|
'knaw', 'humanities cluster', 'meertens', 'huygens',
|
|
]
|
|
|
|
# Lines that indicate LinkedIn UI noise (to skip entirely)
|
|
NOISE_EXACT = {
|
|
'0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
|
|
'Notifications', 'Me', 'For Business', 'Learning', 'People',
|
|
'1st', '2nd', '3rd+', 'Locations', 'Current companies', 'All filters',
|
|
'Reset', 'Connect', 'Message', 'Follow', 'Previous', 'Next',
|
|
'About', 'Accessibility', 'Help Center', 'Privacy & Terms',
|
|
'Ad Choices', 'Advertising', 'Business Services', 'Get the LinkedIn app',
|
|
'More', 'Compose message', 'Actively hiring',
|
|
}
|
|
|
|
NOISE_PATTERNS = [
|
|
r'^\d+$', # Just a number
|
|
r'^\d+ notifications?$',
|
|
r'^LinkedIn Corporation',
|
|
r'^You are on the messaging overlay',
|
|
r'Status is online$',
|
|
r'^MessagingYou are on the messaging',
|
|
r'^Are these results helpful',
|
|
r'^Your feedback helps',
|
|
r'^\d+K? followers?$',
|
|
]
|
|
|
|
|
|
def is_noise_line(line: str) -> bool:
|
|
"""Check if a line is LinkedIn UI noise that should be skipped."""
|
|
line = line.strip()
|
|
if not line:
|
|
return True
|
|
|
|
if line in NOISE_EXACT:
|
|
return True
|
|
|
|
for pattern in NOISE_PATTERNS:
|
|
if re.match(pattern, line, re.IGNORECASE):
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def is_action_button(line: str) -> bool:
|
|
"""Check if line is an action button."""
|
|
return line.strip() in ('Connect', 'Message', 'Follow')
|
|
|
|
|
|
def is_mutual_connections_line(line: str) -> bool:
|
|
"""Check if line describes mutual connections."""
|
|
patterns = [
|
|
r'mutual connections?$',
|
|
r'is a mutual connection$',
|
|
r'are mutual connections$',
|
|
]
|
|
for pattern in patterns:
|
|
if re.search(pattern, line, re.IGNORECASE):
|
|
return True
|
|
return False
|
|
|
|
|
|
def is_follower_count(line: str) -> bool:
|
|
"""Check if line is a follower count."""
|
|
return bool(re.match(r'^[\d,\.]+K?\s*followers?$', line.strip(), re.IGNORECASE))
|
|
|
|
|
|
def is_anonymous_name(name: str) -> bool:
|
|
"""Check if name is an anonymous LinkedIn Member."""
|
|
anonymous_patterns = [
|
|
r'^linkedin\s*member$',
|
|
r'^member$',
|
|
r'^anonymous$',
|
|
]
|
|
name_lower = name.lower().strip()
|
|
return any(re.match(p, name_lower) for p in anonymous_patterns)
|
|
|
|
|
|
def is_abbreviated_name(name: str) -> bool:
|
|
"""
|
|
Check if name contains abbreviations (privacy-protected).
|
|
|
|
Patterns detected:
|
|
- "Amy B." (first name + single initial)
|
|
- "Elisabeth V." (ends with initial)
|
|
- "Tina M. Bastajian" (middle initial)
|
|
- "S. Buse Yildirim" (first initial)
|
|
- "İ. Can Koç" (first initial with Turkish chars)
|
|
"""
|
|
parts = name.split()
|
|
if not parts:
|
|
return False
|
|
|
|
# Check for single-letter initial patterns
|
|
for part in parts:
|
|
# Remove any trailing periods for checking
|
|
clean_part = part.rstrip('.')
|
|
# Single letter or single letter with period = initial
|
|
if len(clean_part) <= 1 and clean_part.isalpha():
|
|
return True
|
|
# Ends with period and is 2 chars (like "M.")
|
|
if part.endswith('.') and len(part) <= 2:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def generate_connection_id(name: str, index: int, target_slug: str) -> str:
|
|
"""
|
|
Generate a unique identifier for a connection.
|
|
|
|
Format: {target_slug}_conn_{index:04d}_{name_slug}
|
|
|
|
Examples:
|
|
- elif-rongen-kaynakci-35295a17_conn_0042_amy_b
|
|
- elif-rongen-kaynakci-35295a17_conn_0156_linkedin_member
|
|
"""
|
|
import unicodedata
|
|
|
|
# Normalize unicode and convert to ASCII-safe slug
|
|
normalized = unicodedata.normalize('NFD', name.lower())
|
|
ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
|
|
|
|
# Replace spaces and special chars with underscores
|
|
name_slug = re.sub(r'[^a-z0-9]+', '_', ascii_name)
|
|
name_slug = re.sub(r'_+', '_', name_slug).strip('_')
|
|
|
|
# Truncate if too long
|
|
if len(name_slug) > 30:
|
|
name_slug = name_slug[:30].rstrip('_')
|
|
|
|
return f"{target_slug}_conn_{index:04d}_{name_slug}"
|
|
|
|
|
|
def parse_degree(text: str) -> Optional[str]:
|
|
"""Extract connection degree from name line."""
|
|
match = re.search(r'•\s*(1st|2nd|3rd\+)', text)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
|
|
def extract_name_from_degree_line(line: str) -> str:
|
|
"""Extract just the name from a line like 'John Doe • 2nd'."""
|
|
name = re.sub(r'\s*•\s*(1st|2nd|3rd\+)$', '', line.strip())
|
|
# Remove emoji indicators like 🟥
|
|
name = re.sub(r'\s*[🟥🟦🟧🟩🟨⬛⬜]+\s*', ' ', name)
|
|
return name.strip()
|
|
|
|
|
|
def is_location_line(line: str) -> bool:
|
|
"""Check if line looks like a location."""
|
|
location_patterns = [
|
|
r'Netherlands$',
|
|
r'Germany$',
|
|
r'Belgium$',
|
|
r'United Kingdom$',
|
|
r'France$',
|
|
r'Denmark$',
|
|
r'Türkiye$',
|
|
r'Turkey$',
|
|
r'Spain$',
|
|
r'Italy$',
|
|
r'Austria$',
|
|
r'Switzerland$',
|
|
r'Poland$',
|
|
r', [A-Z][a-z]+(,| [A-Z])', # City, Region pattern
|
|
r'Area$',
|
|
r'Region$',
|
|
r'Metropolitan',
|
|
r'The Randstad',
|
|
]
|
|
for pattern in location_patterns:
|
|
if re.search(pattern, line.strip(), re.IGNORECASE):
|
|
return True
|
|
return False
|
|
|
|
|
|
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
|
|
"""
|
|
Detect if a headline is heritage-relevant and what type.
|
|
|
|
Two-stage classification:
|
|
1. Check if organization is explicitly non-heritage (blocklist)
|
|
2. Check if role/organization matches heritage patterns
|
|
|
|
For 'D' (Digital) type, require BOTH a tech role AND a heritage organization.
|
|
"""
|
|
headline_lower = headline.lower()
|
|
|
|
# Stage 1: Check for non-heritage organizations (blocklist)
|
|
for org in NON_HERITAGE_ORGANIZATIONS:
|
|
if org.lower() in headline_lower:
|
|
return (False, None)
|
|
|
|
# Stage 2: Check for non-heritage role indicators
|
|
for keyword in NON_HERITAGE_KEYWORDS:
|
|
if keyword.lower() in headline_lower:
|
|
return (False, None)
|
|
|
|
# Stage 3: Check if this is a heritage organization
|
|
is_heritage_org = False
|
|
for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS:
|
|
if org_keyword.lower() in headline_lower:
|
|
is_heritage_org = True
|
|
break
|
|
|
|
# Check heritage keywords by type (order matters - more specific first)
|
|
# 'D' (Digital) is checked last and requires heritage org validation
|
|
type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'] # D removed from here
|
|
|
|
for heritage_type in type_order:
|
|
keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
|
|
for keyword in keywords:
|
|
if keyword.lower() in headline_lower:
|
|
return (True, heritage_type)
|
|
|
|
# Special handling for 'D' (Digital) - ONLY if at a heritage organization
|
|
# This prevents generic IT workers from being classified as heritage-relevant
|
|
if is_heritage_org:
|
|
digital_keywords = HERITAGE_KEYWORDS.get('D', [])
|
|
for keyword in digital_keywords:
|
|
if keyword.lower() in headline_lower:
|
|
return (True, 'D')
|
|
|
|
# Generic heritage terms (without specific type)
|
|
generic_heritage = [
|
|
'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
|
|
'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
|
|
'preservation', 'conservation', 'collection'
|
|
]
|
|
for keyword in generic_heritage:
|
|
if keyword in headline_lower:
|
|
return (True, None)
|
|
|
|
return (False, None)
|
|
|
|
|
|
def extract_organization(headline: str) -> Optional[str]:
|
|
"""Try to extract organization name from headline."""
|
|
patterns = [
|
|
r'\bat\s+(?:the\s+)?(.+?)(?:\s*[|/]|$)',
|
|
r'\bbij\s+(?:het\s+|de\s+)?(.+?)(?:\s*[|/]|$)',
|
|
r'\b@\s*(.+?)(?:\s*[|/]|$)',
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, headline, re.IGNORECASE)
|
|
if match:
|
|
org = match.group(1).strip()
|
|
org = re.sub(r'\s*[|/].*$', '', org)
|
|
if len(org) > 3: # Avoid very short matches
|
|
return org
|
|
|
|
return None
|
|
|
|
|
|
def parse_connections_file(filepath: Path, target_name: str, target_slug: str) -> list[dict]:
|
|
"""
|
|
Parse a LinkedIn connections raw text file using a line-by-line approach.
|
|
|
|
The expected pattern for each connection is:
|
|
1. Name (standalone, optional - sometimes missing)
|
|
2. Name • degree (e.g., "John Doe • 2nd")
|
|
3. Empty line
|
|
4. Headline
|
|
5. Empty line
|
|
6. Location
|
|
7. Empty line
|
|
8. Action button (Connect/Message/Follow)
|
|
9. Follower count (optional, for Follow)
|
|
10. Mutual connections (optional)
|
|
|
|
Args:
|
|
filepath: Path to the raw connections file
|
|
target_name: Name of the person whose connections we're parsing
|
|
target_slug: LinkedIn slug of the target person (for generating connection_id)
|
|
"""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
lines = [line.rstrip('\n') for line in f]
|
|
|
|
connections = []
|
|
seen_names = set()
|
|
connection_index = 0 # Counter for unique connection IDs
|
|
|
|
i = 0
|
|
while i < len(lines):
|
|
line = lines[i].strip()
|
|
|
|
# Look for degree pattern - this is the definitive start of a connection
|
|
degree = parse_degree(line)
|
|
if degree:
|
|
name = extract_name_from_degree_line(line)
|
|
|
|
# Skip target's own name and duplicates
|
|
if name == target_name or name in seen_names:
|
|
i += 1
|
|
continue
|
|
|
|
# Determine name type (full, abbreviated, or anonymous)
|
|
if is_anonymous_name(name):
|
|
name_type = 'anonymous'
|
|
elif is_abbreviated_name(name):
|
|
name_type = 'abbreviated'
|
|
else:
|
|
name_type = 'full'
|
|
|
|
# Generate unique connection ID
|
|
connection_id = generate_connection_id(name, connection_index, target_slug)
|
|
connection_index += 1
|
|
|
|
# Found a new connection - now extract following fields
|
|
connection: dict[str, Any] = {
|
|
'connection_id': connection_id,
|
|
'name': name,
|
|
'name_type': name_type,
|
|
'degree': degree,
|
|
}
|
|
|
|
i += 1 # Move past the name+degree line
|
|
|
|
# Skip empty lines
|
|
while i < len(lines) and not lines[i].strip():
|
|
i += 1
|
|
|
|
# Next non-empty line should be headline
|
|
if i < len(lines):
|
|
headline_line = lines[i].strip()
|
|
# Make sure it's not noise or another connection
|
|
if (not is_noise_line(headline_line) and
|
|
not parse_degree(headline_line) and
|
|
not is_action_button(headline_line) and
|
|
not is_mutual_connections_line(headline_line) and
|
|
not is_follower_count(headline_line)):
|
|
connection['headline'] = headline_line
|
|
i += 1
|
|
|
|
# Skip empty lines
|
|
while i < len(lines) and not lines[i].strip():
|
|
i += 1
|
|
|
|
# Next might be location
|
|
if i < len(lines):
|
|
loc_line = lines[i].strip()
|
|
if (is_location_line(loc_line) and
|
|
not is_action_button(loc_line) and
|
|
not parse_degree(loc_line)):
|
|
connection['location'] = loc_line
|
|
i += 1
|
|
|
|
# Skip remaining fields until next connection
|
|
while i < len(lines):
|
|
check_line = lines[i].strip()
|
|
# Stop if we find a degree pattern (next connection)
|
|
if parse_degree(check_line):
|
|
break
|
|
i += 1
|
|
|
|
# Process the connection
|
|
headline = connection.get('headline', '')
|
|
if headline:
|
|
org = extract_organization(headline)
|
|
if org:
|
|
connection['organization'] = org
|
|
|
|
is_relevant, heritage_type = detect_heritage_type(headline)
|
|
connection['heritage_relevant'] = is_relevant
|
|
if heritage_type:
|
|
connection['heritage_type'] = heritage_type
|
|
else:
|
|
connection['heritage_relevant'] = False
|
|
|
|
connections.append(connection)
|
|
seen_names.add(name)
|
|
else:
|
|
i += 1
|
|
|
|
return connections
|
|
|
|
|
|
def compute_network_analysis(connections: list[dict]) -> dict:
|
|
"""Compute network analysis statistics from connections."""
|
|
total = len(connections)
|
|
heritage_relevant = [c for c in connections if c.get('heritage_relevant', False)]
|
|
heritage_count = len(heritage_relevant)
|
|
|
|
# Count by heritage type
|
|
type_counts: Counter[str] = Counter()
|
|
for c in heritage_relevant:
|
|
ht = c.get('heritage_type')
|
|
if ht:
|
|
type_counts[ht] += 1
|
|
|
|
# Count by organization
|
|
org_counts: dict[str, dict[str, int | str | None]] = {}
|
|
for c in heritage_relevant:
|
|
org = c.get('organization')
|
|
if org:
|
|
if org not in org_counts:
|
|
org_counts[org] = {'count': 0, 'heritage_type': None}
|
|
org_counts[org]['count'] = int(org_counts[org].get('count') or 0) + 1
|
|
if c.get('heritage_type'):
|
|
org_counts[org]['heritage_type'] = c['heritage_type']
|
|
|
|
# Sort organizations by count
|
|
top_orgs = sorted(
|
|
[{'organization': k, 'count': v.get('count', 0), 'heritage_type': v.get('heritage_type')} for k, v in org_counts.items()],
|
|
key=lambda x: int(x.get('count') or 0),
|
|
reverse=True
|
|
)[:15]
|
|
|
|
return {
|
|
'total_connections_extracted': total,
|
|
'heritage_relevant_count': heritage_count,
|
|
'heritage_relevant_percentage': round(heritage_count / total * 100, 1) if total > 0 else 0,
|
|
'connections_by_heritage_type': dict(type_counts),
|
|
'top_organizations': top_orgs,
|
|
}
|
|
|
|
|
|
def create_output(
|
|
connections: list[dict],
|
|
target_name: str,
|
|
target_slug: str,
|
|
input_file: Path,
|
|
target_org: Optional[str] = None,
|
|
) -> dict:
|
|
"""Create the full output JSON structure."""
|
|
|
|
network_analysis = compute_network_analysis(connections)
|
|
|
|
source_url = "https://www.linkedin.com/search/results/people/?network=%5B%22F%22%2C%22S%22%2C%22O%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH"
|
|
|
|
# Extract timestamp from filename
|
|
timestamp_match = re.search(r'_(\d{8}T\d{6}Z)', input_file.name)
|
|
if timestamp_match:
|
|
ts = timestamp_match.group(1)
|
|
scraped_ts = f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:{ts[13:15]}Z"
|
|
else:
|
|
scraped_ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
|
|
|
output = {
|
|
'source_metadata': {
|
|
'source_url': source_url,
|
|
'scraped_timestamp': scraped_ts,
|
|
'scrape_method': 'manual_linkedin_browse',
|
|
'target_profile': target_slug,
|
|
'target_name': target_name,
|
|
'connections_extracted': len(connections),
|
|
'notes': f"Extracted from LinkedIn connections search. Raw scrape in {input_file.name}"
|
|
},
|
|
'connections': connections,
|
|
'network_analysis': network_analysis,
|
|
'provenance': {
|
|
'data_source': 'LINKEDIN_SCRAPE',
|
|
'data_tier': 'TIER_3_CROWD_SOURCED',
|
|
'extraction_date': scraped_ts,
|
|
'extraction_method': 'manual_browse_copy_paste',
|
|
'raw_source_file': input_file.name,
|
|
'processed_by': 'parse_linkedin_connections.py',
|
|
'processing_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
|
|
}
|
|
}
|
|
|
|
if target_org:
|
|
output['source_metadata']['target_organization'] = target_org
|
|
|
|
return output
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description='Parse LinkedIn connections from raw manual scrape files.'
|
|
)
|
|
parser.add_argument('input_file', type=Path, help='Input raw text file')
|
|
parser.add_argument('output_file', type=Path, help='Output JSON file')
|
|
parser.add_argument('--target-name', required=True, help='Name of the person')
|
|
parser.add_argument('--target-slug', required=True, help='LinkedIn slug')
|
|
parser.add_argument('--target-org', help='Current organization')
|
|
parser.add_argument('--dry-run', action='store_true', help='Parse but do not write')
|
|
|
|
args = parser.parse_args()
|
|
|
|
if not args.input_file.exists():
|
|
print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
print(f"Parsing connections from: {args.input_file}")
|
|
connections = parse_connections_file(args.input_file, args.target_name, args.target_slug)
|
|
print(f"Extracted {len(connections)} unique connections")
|
|
|
|
output = create_output(
|
|
connections,
|
|
args.target_name,
|
|
args.target_slug,
|
|
args.input_file,
|
|
args.target_org,
|
|
)
|
|
|
|
analysis = output['network_analysis']
|
|
print(f"\nNetwork Analysis:")
|
|
print(f" Total connections: {analysis['total_connections_extracted']}")
|
|
print(f" Heritage-relevant: {analysis['heritage_relevant_count']} ({analysis['heritage_relevant_percentage']}%)")
|
|
print(f" By type: {analysis['connections_by_heritage_type']}")
|
|
|
|
if analysis['top_organizations']:
|
|
print(f" Top organizations:")
|
|
for org in analysis['top_organizations'][:5]:
|
|
print(f" - {org['organization']}: {org['count']}")
|
|
|
|
if args.dry_run:
|
|
print("\n[Dry run - not writing output]")
|
|
print("\nSample connections (first 5):")
|
|
for c in connections[:5]:
|
|
print(f" - {c['name']} ({c['degree']})")
|
|
print(f" Headline: {c.get('headline', 'N/A')[:70]}")
|
|
print(f" Location: {c.get('location', 'N/A')}")
|
|
print(f" Heritage: {c.get('heritage_relevant', False)} ({c.get('heritage_type', '-')})")
|
|
else:
|
|
args.output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(args.output_file, 'w', encoding='utf-8') as f:
|
|
json.dump(output, f, indent=2, ensure_ascii=False)
|
|
print(f"\nWrote output to: {args.output_file}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|