# glam/scripts/parse_linkedin_connections.py
# Snapshot metadata: 2025-12-10 13:01:13 +01:00 · 597 lines · 21 KiB · Python
#!/usr/bin/env python3
"""
Parse LinkedIn connections from raw manual scrape files.
This script processes raw text exports from LinkedIn connection search pages
and extracts structured connection data following Rule 15 (Connection Data Registration).
Usage:
python scripts/parse_linkedin_connections.py <input_file> <output_file> --target-name "Name" --target-slug "slug"
Example:
python scripts/parse_linkedin_connections.py \
data/custodian/person/manual_register/elif-rongen-kaynakci-35295a17_connections_20251209T220000Z.md \
data/custodian/person/elif-rongen-kaynakci-35295a17_connections_20251209T220000Z.json \
--target-name "Elif Rongen-Kaynakçi" \
--target-slug "elif-rongen-kaynakci-35295a17"
"""
import argparse
import json
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
HERITAGE_KEYWORDS = {
# G - Gallery
'G': [
'gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery',
'exhibition space', 'tentoonstellingsruimte'
],
# L - Library
'L': [
'library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris',
'KB ', 'national library', 'universiteitsbiblio', 'UB '
],
# A - Archive
'A': [
'archive', 'archief', 'archivist', 'archivaris', 'archival',
'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum',
'eye film', 'EYE ', 'audiovisual', 'audiovisueel',
'sound and vision', 'nationaal archief', 'stadsarchief',
'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG'
],
# M - Museum
'M': [
'museum', 'musea', 'curator', 'conservator', 'collection manager',
'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
'tropenmuseum', 'allard pierson', 'museale'
],
# O - Official Institution
'O': [
'ministry', 'ministerie', 'government', 'overheid', 'gemeente',
'province', 'provincie', 'OCW', 'ministerie van'
],
# R - Research Center
'R': [
'research', 'onderzoek', 'researcher', 'onderzoeker',
'KNAW', 'humanities cluster', 'NWO', 'think tank',
'documentatie', 'documentation', 'kenniscentrum'
],
# C - Corporation (Corporate heritage)
'C': [
'corporate archive', 'bedrijfsarchief', 'company history',
'shell', 'philips', 'heineken'
],
# E - Education Provider
'E': [
'university', 'universiteit', 'professor', 'lecturer', 'docent',
'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate',
'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ',
'leiden university', 'utrecht university', 'UU ', 'TU ',
'reinwardt', 'film academy', 'filmacademie', 'graduate',
'assistant professor', 'associate professor', 'hoogleraar'
],
# S - Collecting Society
'S': [
'society', 'vereniging', 'genootschap', 'historical society',
'historische vereniging', 'heemkunde'
],
# D - Digital Platform
'D': [
'digital', 'digitaal', 'platform', 'software', 'IT ', 'tech',
'developer', 'engineer', 'data ', 'AI ', 'machine learning'
],
}
# Non-heritage keywords (to mark as heritage_relevant=False)
NON_HERITAGE_KEYWORDS = [
'marketing', 'sales', 'HR ', 'human resources', 'recruiter',
'finance', 'accounting', 'legal', 'lawyer', 'advocaat',
'consultant', 'coach', 'therapy', 'health', 'medical',
'food', 'restaurant', 'retail', 'fashion', 'real estate',
'insurance', 'banking', 'investment', 'e-commerce',
'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]
# Lines that indicate LinkedIn UI noise (to skip entirely)
NOISE_EXACT = {
'0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
'Notifications', 'Me', 'For Business', 'Learning', 'People',
'1st', '2nd', '3rd+', 'Locations', 'Current companies', 'All filters',
'Reset', 'Connect', 'Message', 'Follow', 'Previous', 'Next',
'About', 'Accessibility', 'Help Center', 'Privacy & Terms',
'Ad Choices', 'Advertising', 'Business Services', 'Get the LinkedIn app',
'More', 'Compose message', 'Actively hiring',
}
NOISE_PATTERNS = [
r'^\d+$', # Just a number
r'^\d+ notifications?$',
r'^LinkedIn Corporation',
r'^You are on the messaging overlay',
r'Status is online$',
r'^MessagingYou are on the messaging',
r'^Are these results helpful',
r'^Your feedback helps',
r'^\d+K? followers?$',
]
def is_noise_line(line: str) -> bool:
    """Return True when *line* is LinkedIn chrome/UI text that carries no data."""
    stripped = line.strip()
    # Blank lines are always noise.
    if not stripped:
        return True
    # Known UI labels match verbatim; regexes cover variable text.
    return stripped in NOISE_EXACT or any(
        re.match(pattern, stripped, re.IGNORECASE) for pattern in NOISE_PATTERNS
    )
def is_action_button(line: str) -> bool:
    """Return True for the per-result action buttons LinkedIn renders."""
    return line.strip() in {'Connect', 'Message', 'Follow'}
def is_mutual_connections_line(line: str) -> bool:
    """Return True for lines describing shared ('mutual') connections."""
    mutual_res = (
        r'mutual connections?$',
        r'is a mutual connection$',
        r'are mutual connections$',
    )
    return any(re.search(p, line, re.IGNORECASE) for p in mutual_res)
def is_follower_count(line: str) -> bool:
    """Return True for lines like '1,234 followers' or '12K followers'."""
    return re.match(r'^[\d,\.]+K?\s*followers?$', line.strip(), re.IGNORECASE) is not None
def is_anonymous_name(name: str) -> bool:
    """Return True when the name is LinkedIn's placeholder for hidden members."""
    candidate = name.strip().lower()
    for pattern in (r'^linkedin\s*member$', r'^member$', r'^anonymous$'):
        if re.match(pattern, candidate):
            return True
    return False
def is_abbreviated_name(name: str) -> bool:
    """
    Detect privacy-protected names containing single-letter initials.

    Examples flagged: "Amy B.", "Elisabeth V.", "Tina M. Bastajian",
    "S. Buse Yildirim", "İ. Can Koç".
    """
    tokens = name.split()
    if not tokens:
        return False
    for token in tokens:
        bare = token.rstrip('.')
        # A lone letter (period stripped) is an initial.
        if bare.isalpha() and len(bare) <= 1:
            return True
        # At most two chars ending in '.' (e.g. "M.") also counts.
        if len(token) <= 2 and token.endswith('.'):
            return True
    return False
def generate_connection_id(name: str, index: int, target_slug: str) -> str:
    """
    Build a unique, ASCII-safe identifier for one connection.

    Format: {target_slug}_conn_{index:04d}_{name_slug}
    Examples:
    - elif-rongen-kaynakci-35295a17_conn_0042_amy_b
    - elif-rongen-kaynakci-35295a17_conn_0156_linkedin_member
    """
    import unicodedata
    # Strip diacritics: decompose, then drop combining marks (category Mn).
    decomposed = unicodedata.normalize('NFD', name.lower())
    ascii_only = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Collapse anything non-alphanumeric into single underscores.
    slug = re.sub(r'_+', '_', re.sub(r'[^a-z0-9]+', '_', ascii_only)).strip('_')
    # Keep the slug short; drop any underscore left dangling by the cut.
    if len(slug) > 30:
        slug = slug[:30].rstrip('_')
    return f"{target_slug}_conn_{index:04d}_{slug}"
def parse_degree(text: str) -> Optional[str]:
    """Extract the connection degree ('1st', '2nd', '3rd+') from a name line.

    The degree marker must terminate the line and be preceded by whitespace
    or the bullet separator, e.g. "John Doe • 2nd".  Anchoring fixes two
    false positives of the previous unanchored search:
    - substrings inside ordinary words ("21st century ..." yielded "1st"),
      which made headlines look like new-connection start lines;
    - standalone filter labels (a bare "1st" line from the LinkedIn UI),
      which produced phantom connections with empty names.

    Returns:
        The degree string, or None when no degree marker ends the line.
    """
    match = re.search(r'[\s•](1st|2nd|3rd\+)\s*$', text)
    return match.group(1) if match else None
def extract_name_from_degree_line(line: str) -> str:
    """Strip the trailing '• degree' suffix and emoji markers from a name line."""
    without_degree = re.sub(r'\s*•\s*(1st|2nd|3rd\+)$', '', line.strip())
    # Colored-square emoji some members decorate their names with are dropped.
    return re.sub(r'\s*[🟥🟦🟧🟩🟨⬛⬜]+\s*', ' ', without_degree).strip()
def is_location_line(line: str) -> bool:
    """Heuristically decide whether *line* names a geographic location."""
    candidate = line.strip()
    # Country names, region suffixes, and "City, Region" shapes seen in the data.
    location_res = (
        r'Netherlands$', r'Germany$', r'Belgium$', r'United Kingdom$',
        r'France$', r'Denmark$', r'Türkiye$', r'Turkey$', r'Spain$',
        r'Italy$', r'Austria$', r'Switzerland$', r'Poland$',
        r', [A-Z][a-z]+(,| [A-Z])',  # City, Region pattern
        r'Area$', r'Region$', r'Metropolitan', r'The Randstad',
    )
    return any(re.search(p, candidate, re.IGNORECASE) for p in location_res)
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """
    Classify a headline as heritage-relevant and, where possible, map it to a
    GLAMORCUBESFIXPHDNT type letter.

    Returns:
        (relevant, type_letter) — type_letter is None when the headline is
        only generically heritage-related or not relevant at all.
    """
    text = headline.lower()
    # Any explicit non-heritage signal vetoes the headline outright.
    if any(kw.lower() in text for kw in NON_HERITAGE_KEYWORDS):
        return (False, None)
    # Typed keywords, checked in a fixed order (more specific types first).
    for letter in ('A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C'):
        if any(kw.lower() in text for kw in HERITAGE_KEYWORDS.get(letter, [])):
            return (True, letter)
    # Fall back to generic heritage vocabulary without assigning a type.
    generic_terms = (
        'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
        'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
        'preservation', 'conservation', 'collection',
    )
    if any(term in text for term in generic_terms):
        return (True, None)
    return (False, None)
def extract_organization(headline: str) -> Optional[str]:
    """Pull an organization name out of an 'at X' / 'bij X' / '@X' headline."""
    org_res = (
        r'\bat\s+(?:the\s+)?(.+?)(?:\s*[|/]|$)',
        r'\bbij\s+(?:het\s+|de\s+)?(.+?)(?:\s*[|/]|$)',
        r'\b@\s*(.+?)(?:\s*[|/]|$)',
    )
    for pattern in org_res:
        hit = re.search(pattern, headline, re.IGNORECASE)
        if not hit:
            continue
        candidate = re.sub(r'\s*[|/].*$', '', hit.group(1).strip())
        # Very short captures are usually fragments; try the next pattern.
        if len(candidate) > 3:
            return candidate
    return None
def parse_connections_file(filepath: Path, target_name: str, target_slug: str) -> list[dict]:
    """
    Parse a LinkedIn connections raw text file using a line-by-line approach.
    The expected pattern for each connection is:
    1. Name (standalone, optional - sometimes missing)
    2. Name • degree (e.g., "John Doe • 2nd")
    3. Empty line
    4. Headline
    5. Empty line
    6. Location
    7. Empty line
    8. Action button (Connect/Message/Follow)
    9. Follower count (optional, for Follow)
    10. Mutual connections (optional)
    Args:
        filepath: Path to the raw connections file
        target_name: Name of the person whose connections we're parsing
        target_slug: LinkedIn slug of the target person (for generating connection_id)
    Returns:
        Connection dicts with keys connection_id, name, name_type, degree,
        heritage_relevant, plus headline/location/organization/heritage_type
        when detected.  The target's own entry and exact-name duplicates
        are skipped.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    connections = []
    seen_names = set()  # exact names already emitted (dedupe + self-skip)
    connection_index = 0  # Counter for unique connection IDs
    i = 0  # cursor into `lines`; advanced manually as fields are consumed
    while i < len(lines):
        line = lines[i].strip()
        # Look for degree pattern - this is the definitive start of a connection
        degree = parse_degree(line)
        if degree:
            name = extract_name_from_degree_line(line)
            # Skip target's own name and duplicates
            if name == target_name or name in seen_names:
                i += 1
                continue
            # Determine name type (full, abbreviated, or anonymous)
            if is_anonymous_name(name):
                name_type = 'anonymous'
            elif is_abbreviated_name(name):
                name_type = 'abbreviated'
            else:
                name_type = 'full'
            # Generate unique connection ID
            connection_id = generate_connection_id(name, connection_index, target_slug)
            connection_index += 1
            # Found a new connection - now extract following fields
            connection: dict[str, Any] = {
                'connection_id': connection_id,
                'name': name,
                'name_type': name_type,
                'degree': degree,
            }
            i += 1  # Move past the name+degree line
            # Skip empty lines
            while i < len(lines) and not lines[i].strip():
                i += 1
            # Next non-empty line should be headline
            if i < len(lines):
                headline_line = lines[i].strip()
                # Make sure it's not noise or another connection start, an
                # action button, a mutual-connections line, or a follower count
                if (not is_noise_line(headline_line) and
                        not parse_degree(headline_line) and
                        not is_action_button(headline_line) and
                        not is_mutual_connections_line(headline_line) and
                        not is_follower_count(headline_line)):
                    connection['headline'] = headline_line
                    i += 1
                    # Skip empty lines
                    while i < len(lines) and not lines[i].strip():
                        i += 1
                    # Next might be location
                    if i < len(lines):
                        loc_line = lines[i].strip()
                        if (is_location_line(loc_line) and
                                not is_action_button(loc_line) and
                                not parse_degree(loc_line)):
                            connection['location'] = loc_line
                            i += 1
            # Skip remaining fields (buttons, follower counts, mutuals, noise)
            # until the next connection start
            while i < len(lines):
                check_line = lines[i].strip()
                # Stop if we find a degree pattern (next connection)
                if parse_degree(check_line):
                    break
                i += 1
            # Process the connection: derive organization and heritage flags
            # from the headline when one was captured
            headline = connection.get('headline', '')
            if headline:
                org = extract_organization(headline)
                if org:
                    connection['organization'] = org
                is_relevant, heritage_type = detect_heritage_type(headline)
                connection['heritage_relevant'] = is_relevant
                if heritage_type:
                    connection['heritage_type'] = heritage_type
            else:
                # No headline captured: relevance cannot be judged, default False
                connection['heritage_relevant'] = False
            connections.append(connection)
            seen_names.add(name)
        else:
            i += 1
    return connections
def compute_network_analysis(connections: list[dict]) -> dict:
    """Summarize heritage relevance, type distribution, and top organizations."""
    total = len(connections)
    relevant = [c for c in connections if c.get('heritage_relevant', False)]

    # Distribution over GLAMORCUBESFIXPHDNT type letters (untyped ones skipped).
    type_counts: Counter[str] = Counter(
        c['heritage_type'] for c in relevant if c.get('heritage_type')
    )

    # Per-organization tallies; heritage_type keeps the last one observed.
    org_stats: dict[str, dict[str, int | str | None]] = {}
    for c in relevant:
        org = c.get('organization')
        if not org:
            continue
        entry = org_stats.setdefault(org, {'count': 0, 'heritage_type': None})
        entry['count'] = int(entry.get('count') or 0) + 1
        if c.get('heritage_type'):
            entry['heritage_type'] = c['heritage_type']

    # Fifteen busiest organizations, most connections first.
    top_orgs = sorted(
        (
            {'organization': name, 'count': stats.get('count', 0),
             'heritage_type': stats.get('heritage_type')}
            for name, stats in org_stats.items()
        ),
        key=lambda row: int(row.get('count') or 0),
        reverse=True,
    )[:15]

    percentage = round(len(relevant) / total * 100, 1) if total > 0 else 0
    return {
        'total_connections_extracted': total,
        'heritage_relevant_count': len(relevant),
        'heritage_relevant_percentage': percentage,
        'connections_by_heritage_type': dict(type_counts),
        'top_organizations': top_orgs,
    }
def create_output(
    connections: list[dict],
    target_name: str,
    target_slug: str,
    input_file: Path,
    target_org: Optional[str] = None,
) -> dict:
    """Assemble the final JSON payload: metadata, connections, analysis, provenance."""
    analysis = compute_network_analysis(connections)
    source_url = "https://www.linkedin.com/search/results/people/?network=%5B%22F%22%2C%22S%22%2C%22O%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH"

    # Prefer the scrape timestamp embedded in the filename (YYYYMMDDTHHMMSSZ);
    # fall back to "now" when the filename carries none.
    ts_match = re.search(r'_(\d{8}T\d{6}Z)', input_file.name)
    if ts_match:
        raw = ts_match.group(1)
        scraped_ts = f"{raw[:4]}-{raw[4:6]}-{raw[6:8]}T{raw[9:11]}:{raw[11:13]}:{raw[13:15]}Z"
    else:
        scraped_ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    result = {
        'source_metadata': {
            'source_url': source_url,
            'scraped_timestamp': scraped_ts,
            'scrape_method': 'manual_linkedin_browse',
            'target_profile': target_slug,
            'target_name': target_name,
            'connections_extracted': len(connections),
            'notes': f"Extracted from LinkedIn connections search. Raw scrape in {input_file.name}"
        },
        'connections': connections,
        'network_analysis': analysis,
        'provenance': {
            'data_source': 'LINKEDIN_SCRAPE',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': scraped_ts,
            'extraction_method': 'manual_browse_copy_paste',
            'raw_source_file': input_file.name,
            'processed_by': 'parse_linkedin_connections.py',
            'processing_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        }
    }
    # Organization is optional CLI input; only attach it when supplied.
    if target_org:
        result['source_metadata']['target_organization'] = target_org
    return result
def main():
    """CLI entry point: parse a raw scrape file and emit structured JSON."""
    arg_parser = argparse.ArgumentParser(
        description='Parse LinkedIn connections from raw manual scrape files.'
    )
    arg_parser.add_argument('input_file', type=Path, help='Input raw text file')
    arg_parser.add_argument('output_file', type=Path, help='Output JSON file')
    arg_parser.add_argument('--target-name', required=True, help='Name of the person')
    arg_parser.add_argument('--target-slug', required=True, help='LinkedIn slug')
    arg_parser.add_argument('--target-org', help='Current organization')
    arg_parser.add_argument('--dry-run', action='store_true', help='Parse but do not write')
    args = arg_parser.parse_args()

    # Fail fast on a missing input path.
    if not args.input_file.exists():
        print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing connections from: {args.input_file}")
    connections = parse_connections_file(args.input_file, args.target_name, args.target_slug)
    print(f"Extracted {len(connections)} unique connections")

    output = create_output(
        connections,
        args.target_name,
        args.target_slug,
        args.input_file,
        args.target_org,
    )

    # Console summary of the computed network analysis.
    analysis = output['network_analysis']
    print(f"\nNetwork Analysis:")
    print(f" Total connections: {analysis['total_connections_extracted']}")
    print(f" Heritage-relevant: {analysis['heritage_relevant_count']} ({analysis['heritage_relevant_percentage']}%)")
    print(f" By type: {analysis['connections_by_heritage_type']}")
    if analysis['top_organizations']:
        print(f" Top organizations:")
        for row in analysis['top_organizations'][:5]:
            print(f" - {row['organization']}: {row['count']}")

    if args.dry_run:
        # Preview mode: show a sample instead of writing the JSON file.
        print("\n[Dry run - not writing output]")
        print("\nSample connections (first 5):")
        for conn in connections[:5]:
            print(f" - {conn['name']} ({conn['degree']})")
            print(f" Headline: {conn.get('headline', 'N/A')[:70]}")
            print(f" Location: {conn.get('location', 'N/A')}")
            print(f" Heritage: {conn.get('heritage_relevant', False)} ({conn.get('heritage_type', '-')})")
    else:
        args.output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output_file, 'w', encoding='utf-8') as out_fh:
            json.dump(output, out_fh, indent=2, ensure_ascii=False)
        print(f"\nWrote output to: {args.output_file}")
# Run the CLI only on direct execution, never on import.
if __name__ == '__main__':
    main()