#!/usr/bin/env python3
"""
Parse LinkedIn company staff pages from raw manual register files.

This script processes raw text exports from LinkedIn company "People" pages
and extracts structured staff data for heritage custodian institutions.

The output follows Rule 15 (Connection Data Registration) patterns but adapted
for custodian staff rather than individual connections.

Usage:
    python scripts/parse_custodian_staff.py <input_file> <output_file> \
        --custodian-name "Name" --custodian-slug "slug"

Example:
    python scripts/parse_custodian_staff.py \
        data/custodian/person/manual_hc/collectie_overijssel-20251210T0055.md \
        data/custodian/person/collectie_overijssel_staff_20251210T0055.json \
        --custodian-name "Collectie Overijssel" \
        --custodian-slug "collectie-overijssel"
"""
import argparse
import json
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy.
# Maps a one-letter heritage-type code to substrings that detect_heritage_type()
# searches for (case-insensitively) inside a LinkedIn headline. Trailing spaces
# on short tokens ('KB ', 'UB ') presumably prevent mid-word matches — TODO confirm.
HERITAGE_KEYWORDS = {
    # G - Gallery
    'G': [
        'gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery',
        'exhibition space', 'tentoonstellingsruimte'
    ],
    # L - Library
    'L': [
        'library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris',
        'KB ', 'national library', 'universiteitsbiblio', 'UB '
    ],
    # A - Archive
    'A': [
        'archive', 'archief', 'archivist', 'archivaris', 'archival',
        'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum',
        'eye film', 'EYE ', 'audiovisual', 'audiovisueel',
        'sound and vision', 'nationaal archief', 'stadsarchief',
        'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG',
        'archiefspecialist', 'archiefmedewerker', 'archiefinspecteur'
    ],
    # M - Museum
    'M': [
        'museum', 'musea', 'curator', 'conservator', 'collection manager',
        'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
        'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
        'collectiespecialist', 'collectie'
    ],
    # O - Official Institution
    'O': [
        'ministry', 'ministerie', 'government', 'overheid', 'gemeente',
        'province', 'provincie', 'OCW', 'ministerie van'
    ],
    # R - Research Center
    'R': [
        'research', 'onderzoek', 'researcher', 'onderzoeker',
        'KNAW', 'humanities cluster', 'NWO', 'think tank',
        'documentatie', 'documentation', 'kenniscentrum', 'historicus'
    ],
    # C - Corporation (Corporate heritage)
    'C': [
        'corporate archive', 'bedrijfsarchief', 'company history',
        'shell', 'philips', 'heineken'
    ],
    # E - Education Provider
    'E': [
        'university', 'universiteit', 'professor', 'lecturer', 'docent',
        'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate',
        'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ',
        'leiden university', 'utrecht university', 'UU ', 'TU ',
        'reinwardt', 'film academy', 'filmacademie', 'graduate',
        'assistant professor', 'associate professor', 'hoogleraar',
        'educatie', 'educator'
    ],
    # S - Collecting Society
    'S': [
        'society', 'vereniging', 'genootschap', 'historical society',
        'historische vereniging', 'heemkunde'
    ],
    # D - Digital Platform
    'D': [
        'digital', 'digitaal', 'platform', 'software', 'IT ', 'tech',
        'developer', 'engineer', 'data ', 'AI ', 'machine learning',
        'digitalisering', 'datamanagement', 'data analist'
    ],
}
# Non-heritage keywords (to mark as heritage_relevant=False).
# Checked BEFORE the heritage keywords in detect_heritage_type(), so any hit
# here short-circuits classification. The trailing space in 'HR ' presumably
# avoids matching inside longer words — TODO confirm against real headlines.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter',
    'finance', 'accounting', 'legal', 'lawyer', 'advocaat',
    'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate',
    'insurance', 'banking', 'investment', 'e-commerce',
    'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]
# Lines that indicate LinkedIn UI noise (to skip entirely).
# A set for O(1) exact-match tests in is_noise_line(). Duplicate items from
# the original literal ('Home', 'About', 'Jobs', 'People' were each listed
# twice) have been removed; set semantics are unchanged.
NOISE_EXACT = {
    # Global navigation bar
    '0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
    'Notifications', 'Me', 'For Business', 'Learning', 'People',
    # Search filter widgets
    '1st', '2nd', '3rd+', 'Locations', 'Current companies', 'All filters',
    'Reset', 'Connect', 'Message', 'Follow', 'Previous', 'Next',
    # Footer / misc chrome
    'About', 'Accessibility', 'Help Center', 'Privacy & Terms',
    'Ad Choices', 'Advertising', 'Business Services', 'Get the LinkedIn app',
    'More', 'Compose message', 'Actively hiring', 'Posts',
    'Insights', 'Where they live', 'Where they studied',
    'What they do', 'People you may know',
}
# Regex patterns for LinkedIn UI noise lines; applied by is_noise_line()
# via re.match with re.IGNORECASE (i.e. anchored at the START of the line).
# NOTE(review): patterns carrying only a '$' anchor ('Status is online$',
# 'logo$') therefore only match when the WHOLE line equals/starts with that
# text — they will not catch e.g. "Acme logo" mid-line. Looks intentional
# enough that it is preserved; confirm against real register files.
NOISE_PATTERNS = [
    r'^\d+$',  # Just a number (result counters, pagination)
    r'^\d+ notifications?$',
    r'^LinkedIn Corporation',
    r'^You are on the messaging overlay',
    r'Status is online$',
    r'^MessagingYou are on the messaging',
    r'^Are these results helpful',
    r'^Your feedback helps',
    r'^\d+K? followers?$',
    r'^Page \d+ of \d+$',
    r'^Search employees by',
    r'^\d+ associated members$',
    r'logo$',
]
def is_noise_line(line: str) -> bool:
    """Return True when *line* is LinkedIn UI chrome that should be skipped.

    Blank lines, exact matches against NOISE_EXACT, and case-insensitive
    start-anchored matches against NOISE_PATTERNS all count as noise.
    """
    stripped = line.strip()
    if not stripped:
        return True
    if stripped in NOISE_EXACT:
        return True
    return any(
        re.match(pattern, stripped, re.IGNORECASE)
        for pattern in NOISE_PATTERNS
    )
def is_action_button(line: str) -> bool:
    """Return True for the per-person action buttons LinkedIn renders."""
    candidate = line.strip()
    return candidate == 'Connect' or candidate == 'Message' or candidate == 'Follow'
def is_mutual_connections_line(line: str) -> bool:
    """Return True when the line describes shared ("mutual") connections."""
    # End-anchored phrasings LinkedIn uses for mutual-connection blurbs.
    mutual_res = (
        r'mutual connections?$',
        r'is a mutual connection$',
        r'are mutual connections$',
        r'other connection[s]? work here$',
    )
    return any(re.search(p, line, re.IGNORECASE) for p in mutual_res)
def is_follower_count(line: str) -> bool:
    """Return True for follower-count lines such as '2K followers'."""
    candidate = line.strip()
    return re.match(r'^[\d,\.]+K?\s*followers?$', candidate, re.IGNORECASE) is not None
def is_employee_count(line: str) -> bool:
    """Return True for employee-count lines such as '51-200 employees'."""
    candidate = line.strip()
    return re.match(r'^[\d,\-]+ employees?$', candidate, re.IGNORECASE) is not None
def is_anonymous_name(name: str) -> bool:
    """Return True when *name* is a privacy-anonymised LinkedIn placeholder."""
    normalized = name.lower().strip()
    placeholder_res = (
        r'^linkedin\s*member$',
        r'^member$',
        r'^anonymous$',
    )
    return any(re.match(p, normalized) for p in placeholder_res)
def is_abbreviated_name(name: str) -> bool:
    """Return True when *name* contains an initial (privacy-protected form).

    Patterns detected:
    - "Amy B." (first name + single initial)
    - "Elisabeth V." (ends with initial)
    - "Tina M. Bastajian" (middle initial)
    - "S. Buse Yildirim" (first initial)
    """
    tokens = name.split()
    if not tokens:
        return False

    for token in tokens:
        bare = token.rstrip('.')
        # A lone letter (with or without its dot stripped) is an initial.
        if bare.isalpha() and len(bare) <= 1:
            return True
        # A two-character dotted token like "V." is also an initial.
        if len(token) <= 2 and token.endswith('.'):
            return True

    return False
def generate_staff_id(name: str, index: int, custodian_slug: str) -> str:
    """Build a unique staff identifier.

    Format: {custodian_slug}_staff_{index:04d}_{name_slug}

    Examples:
    - collectie-overijssel_staff_0001_vincent_robijn
    - nationaal-archief_staff_0042_afelonne_doek
    """
    # Strip diacritics: NFD-decompose, then drop combining marks ('Mn').
    decomposed = unicodedata.normalize('NFD', name.lower())
    ascii_name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Collapse everything that is not [a-z0-9] into single underscores.
    slug = re.sub(r'[^a-z0-9]+', '_', ascii_name)
    slug = re.sub(r'_+', '_', slug).strip('_')

    # Keep IDs short: cap the name part at 30 chars.
    if len(slug) > 30:
        slug = slug[:30].rstrip('_')

    return f"{custodian_slug}_staff_{index:04d}_{slug}"
def parse_degree(text: str) -> Optional[str]:
    """Extract the LinkedIn connection degree from *text*.

    Handles formats:
    - "Name 2nd degree connection · 2nd"
    - "3rd+ degree connection · 3rd" (note: 3rd+ in first part, 3rd in second)
    - "· 2nd" or "• 2nd", including standalone bullet lines

    Returns '1st', '2nd' or '3rd+' — a bare '3rd' bullet is normalised to
    '3rd+' (those entries are always outside the 2nd-degree circle) — or
    None when no degree marker is present.
    """
    # Pattern 1: "... 2nd degree connection". `rd\+?` accepts "3rd" and "3rd+",
    # and lowercasing the capture already yields the canonical form, so no
    # extra "3rd+" special-case is needed.
    match = re.search(r'(\d+(?:st|nd|rd\+?))\s*degree\s+connection', text, re.IGNORECASE)
    if match:
        return match.group(1).lower()

    # Pattern 2: bullet form "· 2nd" / "• 3rd" anywhere in the line. Because
    # this is an unanchored search, it also covers standalone "· 2nd" lines —
    # the previous third, fully-anchored pattern was unreachable dead code
    # and has been removed (behavior unchanged).
    match = re.search(r'[·•]\s*(1st|2nd|3rd\+?)', text)
    if match:
        degree = match.group(1)
        # A bare "3rd" bullet always denotes a 3rd+ connection.
        return '3rd+' if degree == '3rd' else degree

    return None
def extract_name_from_degree_line(line: str) -> str:
    """Extract just the name from a line like 'John Doe 2nd degree connection · 2nd'."""
    # Each (pattern, replacement, flags) removes one kind of decoration.
    removals = (
        # "... 2nd degree connection ..." suffix
        (r'\s*\d+(?:st|nd|rd|\+)?\s*degree\s+connection.*$', '', re.IGNORECASE),
        # trailing "· 2nd" bullet
        (r'\s*[·•]\s*(1st|2nd|3rd\+?)$', '', 0),
        # emoji indicators
        (r'\s*[🟥🟦🟧🟩🟨⬛⬜🏛️]+\s*', ' ', 0),
        # "is open to work" badge text
        (r'\s+is open to work$', '', re.IGNORECASE),
    )
    cleaned = line.strip()
    for pattern, replacement, flags in removals:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """Classify a headline as (heritage_relevant, heritage_type-or-None)."""
    text = headline.lower()

    # Explicit non-heritage signals win outright.
    if any(kw.lower() in text for kw in NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Typed heritage keywords; order matters — more specific categories first.
    for letter in ('A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C'):
        if any(kw.lower() in text for kw in HERITAGE_KEYWORDS.get(letter, [])):
            return (True, letter)

    # Generic heritage vocabulary: relevant, but no specific type assigned.
    generic_terms = (
        'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
        'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
        'preservation', 'conservation', 'behoud', 'restauratie',
    )
    if any(term in text for term in generic_terms):
        return (True, None)

    return (False, None)
def extract_custodian_metadata(lines: list[str]) -> dict[str, Any]:
    """
    Extract custodian organization metadata from the page header section.

    Only the first 30 lines are scanned. Recognised patterns:
    - "Collectie Overijssel logo"              -> name
    - "Museums, Historical Sites, and Zoos"    -> industry
    - "Zwolle, Overijssel"                     -> location {city, region}
    - "2K followers"                           -> follower_count
    - "51-200 employees"                       -> employee_count
    - "58 associated members"                  -> associated_members (int)

    Args:
        lines: Raw file lines in original order.

    Returns:
        Dict containing whichever of the keys above were found (may be empty).
    """
    metadata: dict[str, Any] = {}

    # Industry detection keywords — hoisted out of the loop (loop-invariant;
    # the original rebuilt this list on every iteration).
    industry_patterns = [
        'Museums', 'Archives', 'Libraries', 'Historical Sites',
        'Government', 'Cultural', 'Heritage', 'Education',
        'Research', 'Non-profit', 'Zoos'
    ]

    for line in lines[:30]:  # Header metadata only appears at the top.
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Logo line, e.g. "Collectie Overijssel logo" — name precedes " logo".
        if line.endswith(' logo'):
            metadata['name'] = line[:-5].strip()
            continue

        # Employee count, e.g. "51-200 employees"
        employee_match = re.match(r'^([\d,\-]+)\s*employees?$', line, re.IGNORECASE)
        if employee_match:
            metadata['employee_count'] = employee_match.group(1)
            continue

        # Follower count, e.g. "2K followers"
        follower_match = re.match(r'^([\d,\.]+K?)\s*followers?$', line, re.IGNORECASE)
        if follower_match:
            metadata['follower_count'] = follower_match.group(1)
            continue

        # Associated members count, e.g. "58 associated members"
        member_match = re.match(r'^(\d+)\s*associated\s+members?$', line, re.IGNORECASE)
        if member_match:
            metadata['associated_members'] = int(member_match.group(1))
            continue

        # Industry detection — the first line containing a keyword wins.
        if any(p.lower() in line.lower() for p in industry_patterns):
            if 'industry' not in metadata:
                metadata['industry'] = line
            continue

        # Location pattern: "City, Region" — only the first match is kept.
        loc_match = re.match(r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', line)
        if loc_match and 'location' not in metadata:
            metadata['location'] = {
                'city': loc_match.group(1),
                'region': loc_match.group(2)
            }
            continue

    return metadata
def is_likely_name_line(line: str) -> bool:
    """
    Heuristic: does this line look like a person's name?

    Accepts non-empty lines of at most 60 characters and 1-6 words that do
    not match any known non-name pattern. NOTE(review): the capitalisation
    check only rejects first characters that are neither uppercase nor
    alphabetic, so lowercase names still pass — preserved as-is.
    """
    candidate = line.strip()
    if not candidate or len(candidate) > 60:
        return False

    # Reject obvious UI / metadata lines.
    reject_res = (
        r'^Page \d+',
        r'^\d+\s*(st|nd|rd|th)',
        r'degree connection',
        r'mutual connection',
        r'followers?$',
        r'employees?$',
        r'^Search',
        r'^Where they',
        r'^What they',
        r'work here$',
        r'^Connect$',
        r'^Message$',
        r'^Follow$',
        r'logo$',
    )
    if any(re.search(p, candidate, re.IGNORECASE) for p in reject_res):
        return False

    # First character must be uppercase or at least a letter.
    if not candidate[0].isupper() and not candidate[0].isalpha():
        return False

    # Most names have between one and six words.
    return 1 <= len(candidate.split()) <= 6
def parse_staff_file(filepath: Path, custodian_name: str, custodian_slug: str) -> tuple[list[dict], dict]:
    """
    Parse a LinkedIn company staff page raw text file.

    The file structure has TWO formats:

    Format 1 (Company People page - Collectie Overijssel style):
        Name (line N)
        Name (line N+1, duplicate - optional)
        2nd degree connection · 2nd (line N+2 - STANDALONE degree line)
        Headline (line N+3)
        Mutual connections (line N+4)

    Format 2 (Nationaal Archief style):
        Name (line N)
        Name 2nd degree connection (line N+1 - name WITH degree)
        · 2nd (line N+2)
        Headline (line N+3)
        Mutual connections (line N+4)
        Connect (action button)

    Args:
        filepath: Path to the raw staff file
        custodian_name: Name of the custodian organization
        custodian_slug: Slug for generating staff IDs

    Returns:
        Tuple of (staff_list, custodian_metadata)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]

    # Extract custodian metadata from header
    custodian_metadata = extract_custodian_metadata(lines)
    if 'name' not in custodian_metadata:
        # Fall back to the CLI-supplied name when the header had no logo line.
        custodian_metadata['name'] = custodian_name

    staff: list[dict[str, Any]] = []
    seen_names: set[str] = set()  # de-duplicates repeated entries by exact name
    staff_index = 0               # sequence number fed into generate_staff_id()

    # Track anonymous members separately to assign unique IDs
    anonymous_count = 0

    # Single forward scan; `i` is advanced manually because each record spans
    # a variable number of lines.
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # PATTERN A: "LinkedIn Member" entries (anonymous, no degree line)
        # These appear outside the viewer's connection network
        if line == 'LinkedIn Member':
            # Check if next line is a headline (job title) or placeholder
            headline_line = ''
            lines_to_skip = 1  # At minimum, skip the "LinkedIn Member" line

            if i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                # Check if next line is a placeholder headline (empty/dash patterns)
                is_placeholder_headline = next_line in ('--', '-- ', '-', '.', 'notitle', '')

                # Check if it's a valid headline (contains custodian keywords or job indicators)
                custodian_keywords = custodian_name.lower().split()
                is_relevant_headline = (
                    any(kw in next_line.lower() for kw in custodian_keywords) or
                    any(kw in next_line.lower() for kw in ['bij', 'at', 'voor'])
                )

                # If placeholder, treat as empty headline but still include the member
                if is_placeholder_headline:
                    headline_line = ''  # No headline available
                    lines_to_skip = 2  # Skip both LinkedIn Member and placeholder line
                elif is_relevant_headline:
                    headline_line = next_line
                    lines_to_skip = 2  # Skip both LinkedIn Member and headline
                else:
                    # Next line is not a headline (maybe start of new entry) - member has no headline
                    headline_line = ''
                    lines_to_skip = 1  # Only skip LinkedIn Member line

            # Always create member record for LinkedIn Member entries
            anonymous_count += 1
            anonymous_id = f"anonymous_{anonymous_count:04d}"
            staff_id = generate_staff_id(anonymous_id, staff_index, custodian_slug)
            staff_index += 1

            member = {
                'staff_id': staff_id,
                'name': f"LinkedIn Member #{anonymous_count}",
                'name_type': 'anonymous',
                'degree': 'outside_network',  # No degree = outside connection circles
                'heritage_relevant': False,  # Will be updated below
            }

            # Add headline only if we have one
            if headline_line:
                member['headline'] = headline_line
                # Process heritage relevance
                is_relevant, heritage_type = detect_heritage_type(headline_line)
                member['heritage_relevant'] = is_relevant
                if heritage_type:
                    member['heritage_type'] = heritage_type

            staff.append(member)
            i += lines_to_skip
            continue

        # PATTERN B: Regular entries with degree lines
        degree = parse_degree(line)
        if degree:
            # Try to extract name from THIS line first (Format 2: "Name 2nd degree connection")
            name = extract_name_from_degree_line(line)

            # If no valid name on this line, look BACK for the name (Format 1)
            # Check: empty, same as original line, OR not a valid name pattern
            if not name or name == line or not is_likely_name_line(name):
                name = None  # Reset to ensure we look back
                # Look back for the name - it should be 1-2 lines above
                for lookback in range(1, 4):
                    if i - lookback >= 0:
                        prev_line = lines[i - lookback].strip()
                        if prev_line and is_likely_name_line(prev_line):
                            # Remove any trailing "is open to work" etc
                            name = re.sub(r'\s+is open to work$', '', prev_line, flags=re.IGNORECASE)
                            break

            # Skip if we couldn't find a valid name
            if not name or not is_likely_name_line(name):
                i += 1
                continue

            # Skip duplicates
            if name in seen_names:
                i += 1
                continue

            # Skip if name matches custodian name (org's own entry)
            if name.lower() == custodian_name.lower():
                i += 1
                continue

            # Determine name type
            if is_anonymous_name(name):
                name_type = 'anonymous'
            elif is_abbreviated_name(name):
                name_type = 'abbreviated'
            else:
                name_type = 'full'

            # Generate unique staff ID
            staff_id = generate_staff_id(name, staff_index, custodian_slug)
            staff_index += 1

            # Build staff member record
            member: dict[str, Any] = {
                'staff_id': staff_id,
                'name': name,
                'name_type': name_type,
                'degree': degree,
            }

            i += 1  # Move past degree line

            # Check if next line is just "· 2nd" (separate degree line) - skip it
            if i < len(lines) and re.match(r'^[·•]\s*(1st|2nd|3rd\+?)$', lines[i].strip()):
                i += 1

            # Skip empty lines
            while i < len(lines) and not lines[i].strip():
                i += 1

            # Next non-empty line should be headline (job title)
            if i < len(lines):
                headline_line = lines[i].strip()
                # Make sure it's not noise or the start of another person entry
                # NOTE: Don't filter by is_likely_name_line - headlines can look like names!
                if (not is_noise_line(headline_line) and
                    not parse_degree(headline_line) and
                    not is_action_button(headline_line) and
                    not is_mutual_connections_line(headline_line) and
                    not is_follower_count(headline_line) and
                    headline_line not in ('-', '.')):  # Skip placeholder headlines
                    member['headline'] = headline_line
                    i += 1

            # Skip to mutual connections or next entry
            while i < len(lines):
                check_line = lines[i].strip()

                # Capture mutual connections info
                if is_mutual_connections_line(check_line):
                    member['mutual_connections'] = check_line
                    i += 1
                    continue

                # Stop if we find a degree pattern (next staff member) OR LinkedIn Member
                if parse_degree(check_line) or check_line == 'LinkedIn Member':
                    break

                # Skip action buttons and noise
                i += 1

            # Process heritage relevance from headline
            headline = member.get('headline', '')
            if headline:
                is_relevant, heritage_type = detect_heritage_type(headline)
                member['heritage_relevant'] = is_relevant
                if heritage_type:
                    member['heritage_type'] = heritage_type
            else:
                member['heritage_relevant'] = False

            staff.append(member)
            seen_names.add(name)
        else:
            # Neither a LinkedIn Member marker nor a degree line: advance.
            i += 1

    return staff, custodian_metadata
def compute_staff_analysis(staff: list[dict]) -> dict:
    """Compute summary statistics for extracted staff members.

    Args:
        staff: Staff records as produced by parse_staff_file().

    Returns:
        Dict with totals, heritage-relevance counts/percentage, and
        breakdowns by heritage type, connection degree, name type and
        the 10 most common role keywords found in headlines.
    """
    total = len(staff)
    heritage_relevant = [s for s in staff if s.get('heritage_relevant', False)]
    heritage_count = len(heritage_relevant)

    # Count by heritage type (only records that carry a type letter).
    type_counts: Counter[str] = Counter()
    for s in heritage_relevant:
        ht = s.get('heritage_type')
        if ht:
            type_counts[ht] += 1

    # Count by connection degree and by name type.
    degree_counts: Counter[str] = Counter(s.get('degree', 'unknown') for s in staff)
    name_type_counts: Counter[str] = Counter(s.get('name_type', 'unknown') for s in staff)

    # Role keywords — hoisted out of the per-staff loop (loop-invariant; the
    # original rebuilt this list for every record).
    role_keywords = [
        'directeur', 'director', 'manager', 'coordinator', 'coördinator',
        'adviseur', 'advisor', 'medewerker', 'specialist', 'archivist',
        'archivaris', 'historicus', 'historian', 'curator', 'conservator',
        'beheerder', 'onderzoeker', 'researcher', 'projectleider'
    ]

    # Each keyword is counted at most once per headline containing it.
    role_counts: Counter[str] = Counter()
    for s in staff:
        headline = s.get('headline', '').lower()
        if headline:
            for keyword in role_keywords:
                if keyword.lower() in headline:
                    role_counts[keyword.title()] += 1

    return {
        'total_staff_extracted': total,
        'heritage_relevant_count': heritage_count,
        'heritage_relevant_percentage': round(heritage_count / total * 100, 1) if total > 0 else 0,
        'staff_by_heritage_type': dict(type_counts),
        'staff_by_degree': dict(degree_counts),
        'staff_by_name_type': dict(name_type_counts),
        'common_roles': dict(role_counts.most_common(10)),
    }
def _iso_timestamp_from_filename(filename: str) -> Optional[str]:
    """Convert a compact timestamp embedded in *filename* to ISO-8601 UTC.

    Recognises 'YYYYMMDDTHHMM' (13 chars) and 'YYYYMMDDTHHMMSS' (15 chars).
    Returns None when no such token is present or its length is unexpected.
    """
    match = re.search(r'(\d{8}T\d{4,6})', filename)
    if not match:
        return None
    ts = match.group(1)
    if len(ts) == 13:  # 20251210T0055 format (minute precision)
        return f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:00Z"
    if len(ts) == 15:  # 20251210T005500 format (second precision)
        return f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:{ts[13:15]}Z"
    return None


def create_output(
    staff: list[dict],
    custodian_metadata: dict,
    custodian_name: str,
    custodian_slug: str,
    input_file: Path,
) -> dict:
    """Create the full output JSON structure.

    Args:
        staff: Parsed staff records from parse_staff_file().
        custodian_metadata: Header metadata extracted from the raw file.
        custodian_name: CLI-supplied custodian name.
        custodian_slug: CLI-supplied custodian slug.
        input_file: Raw register file; its name supplies the scrape timestamp.

    Returns:
        Dict with custodian_metadata, source_metadata, staff, staff_analysis
        and provenance sections, ready for JSON serialization.
    """
    analysis = compute_staff_analysis(staff)

    # Prefer the timestamp embedded in the filename; otherwise use "now" (UTC).
    # The duplicated inline formatting branches were extracted into the
    # _iso_timestamp_from_filename helper above.
    scraped_ts = (
        _iso_timestamp_from_filename(input_file.name)
        or datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    )

    output = {
        'custodian_metadata': {
            'custodian_name': custodian_name,
            'custodian_slug': custodian_slug,
            **custodian_metadata,
        },
        'source_metadata': {
            'source_type': 'linkedin_company_people_page',
            'registered_timestamp': scraped_ts,
            'registration_method': 'manual_linkedin_browse',
            'staff_extracted': len(staff),
            'notes': f"Staff extracted from LinkedIn company People page. Raw register in {input_file.name}"
        },
        'staff': staff,
        'staff_analysis': analysis,
        'provenance': {
            'data_source': 'LINKEDIN_MANUAL_REGISTER',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': scraped_ts,
            'extraction_method': 'manual_browse_copy_paste',
            'raw_source_file': input_file.name,
            'processed_by': 'parse_custodian_staff.py',
            'processing_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        }
    }

    return output
def main():
    """CLI entry point: parse one raw staff file, print a summary, and
    write the JSON output (unless --dry-run is given)."""
    parser = argparse.ArgumentParser(
        description='Parse LinkedIn company staff pages from raw manual register files.'
    )
    parser.add_argument('input_file', type=Path, help='Input raw text file')
    parser.add_argument('output_file', type=Path, help='Output JSON file')
    parser.add_argument('--custodian-name', required=True, help='Name of the custodian organization')
    parser.add_argument('--custodian-slug', required=True, help='Slug for generating staff IDs')
    parser.add_argument('--dry-run', action='store_true', help='Parse but do not write output')

    args = parser.parse_args()

    if not args.input_file.exists():
        print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing staff from: {args.input_file}")
    staff, custodian_metadata = parse_staff_file(
        args.input_file,
        args.custodian_name,
        args.custodian_slug
    )
    print(f"Extracted {len(staff)} unique staff members")

    if custodian_metadata:
        print(f"\nCustodian Metadata:")
        for key, value in custodian_metadata.items():
            print(f"  {key}: {value}")

    output = create_output(
        staff,
        custodian_metadata,
        args.custodian_name,
        args.custodian_slug,
        args.input_file,
    )

    # Summary statistics for a quick eyeball check of parse quality.
    analysis = output['staff_analysis']
    print(f"\nStaff Analysis:")
    print(f"  Total staff: {analysis['total_staff_extracted']}")
    print(f"  Heritage-relevant: {analysis['heritage_relevant_count']} ({analysis['heritage_relevant_percentage']}%)")
    print(f"  By type: {analysis['staff_by_heritage_type']}")
    print(f"  By degree: {analysis['staff_by_degree']}")
    print(f"  By name type: {analysis['staff_by_name_type']}")

    if analysis['common_roles']:
        print(f"  Common roles:")
        for role, count in list(analysis['common_roles'].items())[:5]:
            print(f"    - {role}: {count}")

    if args.dry_run:
        # Dry run: show a small sample instead of writing the JSON file.
        print("\n[Dry run - not writing output]")
        print("\nSample staff (first 5):")
        for s in staff[:5]:
            print(f"  - {s['name']} ({s['degree']})")
            print(f"    Headline: {s.get('headline', 'N/A')[:60]}")
            print(f"    Heritage: {s.get('heritage_relevant', False)} ({s.get('heritage_type', '-')})")
    else:
        args.output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
        print(f"\nWrote output to: {args.output_file}")
# Script entry point — only run the CLI when executed directly, not on import.
if __name__ == '__main__':
    main()