glam/scripts/parse_custodian_staff.py
2025-12-10 18:04:25 +01:00

832 lines
30 KiB
Python

#!/usr/bin/env python3
"""
Parse LinkedIn company staff pages from raw manual register files.
This script processes raw text exports from LinkedIn company "People" pages
and extracts structured staff data for heritage custodian institutions.
The output follows Rule 15 (Connection Data Registration) patterns but adapted
for custodian staff rather than individual connections.
Usage:
python scripts/parse_custodian_staff.py <input_file> <output_file> \
--custodian-name "Name" --custodian-slug "slug"
Example:
python scripts/parse_custodian_staff.py \
data/custodian/person/manual_hc/collectie_overijssel-20251210T0055.md \
data/custodian/person/collectie_overijssel_staff_20251210T0055.json \
--custodian-name "Collectie Overijssel" \
--custodian-slug "collectie-overijssel"
"""
import argparse
import json
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
import unicodedata
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy.
# Maps a one-letter heritage-type code to keyword substrings that are
# matched case-insensitively against LinkedIn headlines in
# detect_heritage_type(). Keywords with a trailing space (e.g. 'KB ',
# 'UB ') only match when followed by a space, which avoids matching
# inside longer words.
HERITAGE_KEYWORDS = {
    # G - Gallery
    'G': [
        'gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery',
        'exhibition space', 'tentoonstellingsruimte'
    ],
    # L - Library
    'L': [
        'library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris',
        'KB ', 'national library', 'universiteitsbiblio', 'UB '
    ],
    # A - Archive
    'A': [
        'archive', 'archief', 'archivist', 'archivaris', 'archival',
        'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum',
        'eye film', 'EYE ', 'audiovisual', 'audiovisueel',
        'sound and vision', 'nationaal archief', 'stadsarchief',
        'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG',
        'archiefspecialist', 'archiefmedewerker', 'archiefinspecteur'
    ],
    # M - Museum
    'M': [
        'museum', 'musea', 'curator', 'conservator', 'collection manager',
        'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
        'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
        'collectiespecialist', 'collectie'
    ],
    # O - Official Institution
    'O': [
        'ministry', 'ministerie', 'government', 'overheid', 'gemeente',
        'province', 'provincie', 'OCW', 'ministerie van'
    ],
    # R - Research Center
    'R': [
        'research', 'onderzoek', 'researcher', 'onderzoeker',
        'KNAW', 'humanities cluster', 'NWO', 'think tank',
        'documentatie', 'documentation', 'kenniscentrum', 'historicus'
    ],
    # C - Corporation (Corporate heritage)
    'C': [
        'corporate archive', 'bedrijfsarchief', 'company history',
        'shell', 'philips', 'heineken'
    ],
    # E - Education Provider
    'E': [
        'university', 'universiteit', 'professor', 'lecturer', 'docent',
        'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate',
        'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ',
        'leiden university', 'utrecht university', 'UU ', 'TU ',
        'reinwardt', 'film academy', 'filmacademie', 'graduate',
        'assistant professor', 'associate professor', 'hoogleraar',
        'educatie', 'educator'
    ],
    # S - Collecting Society
    'S': [
        'society', 'vereniging', 'genootschap', 'historical society',
        'historische vereniging', 'heemkunde'
    ],
    # D - Digital Platform
    'D': [
        'digital', 'digitaal', 'platform', 'software', 'IT ', 'tech',
        'developer', 'engineer', 'data ', 'AI ', 'machine learning',
        'digitalisering', 'datamanagement', 'data analist'
    ],
}
# Non-heritage keywords: any of these in a headline vetoes it outright
# (staff member is marked heritage_relevant=False; see detect_heritage_type).
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter',
    'finance', 'accounting', 'legal', 'lawyer', 'advocaat',
    'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate',
    'insurance', 'banking', 'investment', 'e-commerce',
    'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]
# Lines that indicate LinkedIn UI noise (skipped entirely by is_noise_line).
# Exact matches against the stripped line; duplicates in the literal are
# harmless because this is a set.
NOISE_EXACT = {
    '0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
    'Notifications', 'Me', 'For Business', 'Learning', 'People',
    '1st', '2nd', '3rd+', 'Locations', 'Current companies', 'All filters',
    'Reset', 'Connect', 'Message', 'Follow', 'Previous', 'Next',
    'About', 'Accessibility', 'Help Center', 'Privacy & Terms',
    'Ad Choices', 'Advertising', 'Business Services', 'Get the LinkedIn app',
    'More', 'Compose message', 'Actively hiring', 'Home', 'About', 'Posts',
    'Jobs', 'People', 'Insights', 'Where they live', 'Where they studied',
    'What they do', 'People you may know',
}
# Regex patterns for UI noise; matched with re.match + IGNORECASE against
# the stripped line in is_noise_line.
NOISE_PATTERNS = [
    r'^\d+$',  # Just a number
    r'^\d+ notifications?$',
    r'^LinkedIn Corporation',
    r'^You are on the messaging overlay',
    r'Status is online$',
    r'^MessagingYou are on the messaging',
    r'^Are these results helpful',
    r'^Your feedback helps',
    r'^\d+K? followers?$',
    r'^Page \d+ of \d+$',
    r'^Search employees by',
    r'^\d+ associated members$',
    r'logo$',
]
def is_noise_line(line: str) -> bool:
    """Return True when a line is LinkedIn UI chrome that carries no data."""
    stripped = line.strip()
    if not stripped:
        return True
    if stripped in NOISE_EXACT:
        return True
    # Fall back to the regex noise patterns (case-insensitive prefix match).
    return any(re.match(pat, stripped, re.IGNORECASE) for pat in NOISE_PATTERNS)
def is_action_button(line: str) -> bool:
    """Return True when the line is a LinkedIn action button label."""
    return line.strip() in {'Connect', 'Message', 'Follow'}
def is_mutual_connections_line(line: str) -> bool:
    """Return True when the line describes mutual connections."""
    suffix_patterns = (
        r'mutual connections?$',
        r'is a mutual connection$',
        r'are mutual connections$',
        r'other connection[s]? work here$',
    )
    # All patterns are end-anchored; any single hit qualifies the line.
    return any(re.search(pat, line, re.IGNORECASE) for pat in suffix_patterns)
def is_follower_count(line: str) -> bool:
    """Return True when the line is a follower count such as '2K followers'."""
    match = re.match(r'^[\d,\.]+K?\s*followers?$', line.strip(), re.IGNORECASE)
    return match is not None
def is_employee_count(line: str) -> bool:
    """Return True when the line is an employee count such as '51-200 employees'."""
    match = re.match(r'^[\d,\-]+ employees?$', line.strip(), re.IGNORECASE)
    return match is not None
def is_anonymous_name(name: str) -> bool:
    """Return True when the name is an anonymized LinkedIn placeholder."""
    candidate = name.lower().strip()
    placeholder_patterns = (
        r'^linkedin\s*member$',
        r'^member$',
        r'^anonymous$',
    )
    return any(re.match(pat, candidate) for pat in placeholder_patterns)
def is_abbreviated_name(name: str) -> bool:
    """
    Check whether a name contains abbreviations (privacy-protected).

    Patterns detected:
    - "Amy B." (first name + single initial)
    - "Elisabeth V." (ends with initial)
    - "Tina M. Bastajian" (middle initial)
    - "S. Buse Yildirim" (first initial)
    """
    for token in name.split():
        bare = token.rstrip('.')
        # A single letter (with or without a trailing dot) is an initial.
        looks_like_initial = len(bare) <= 1 and bare.isalpha()
        # A dotted token of at most two characters also counts (e.g. "B.").
        dotted_and_short = token.endswith('.') and len(token) <= 2
        if looks_like_initial or dotted_and_short:
            return True
    return False
def generate_staff_id(name: str, index: int, custodian_slug: str) -> str:
    """
    Generate a unique identifier for a staff member.

    Format: {custodian_slug}_staff_{index:04d}_{name_slug}
    Examples:
    - collectie-overijssel_staff_0001_vincent_robijn
    - nationaal-archief_staff_0042_afelonne_doek
    """
    # Decompose accented characters (NFD) and drop combining marks ('Mn')
    # so the slug is ASCII-safe: "José" -> "jose".
    decomposed = unicodedata.normalize('NFD', name.lower())
    folded = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Collapse every non-alphanumeric run into a single underscore.
    slug = re.sub(r'[^a-z0-9]+', '_', folded)
    slug = re.sub(r'_+', '_', slug).strip('_')
    # Cap slug length at 30 characters, trimming any dangling underscore.
    if len(slug) > 30:
        slug = slug[:30].rstrip('_')
    return f"{custodian_slug}_staff_{index:04d}_{slug}"
def parse_degree(text: str) -> Optional[str]:
    """Extract a LinkedIn connection degree ('1st', '2nd', '3rd+') from a line.

    Handles both observed formats:
    - "Name 2nd degree connection · 2nd" / "3rd+ degree connection"
    - bullet-prefixed fragments anywhere in the line: "· 2nd", "• 3rd"

    Returns:
        The normalized degree string, or None when no degree is present.
        Bullet-form "3rd" is normalized to "3rd+" (LinkedIn renders 3rd+
        connections as plain "3rd" in the bullet fragment).
    """
    # Form 1: "... 2nd degree connection". The "rd\+?" alternative accepts
    # both "3rd" and "3rd+" before "degree".
    match = re.search(r'(\d+(?:st|nd|rd\+?))\s*degree\s+connection', text, re.IGNORECASE)
    if match:
        # Lowercasing normalizes "2ND" -> "2nd"; "3rd+" is already canonical.
        return match.group(1).lower()
    # Form 2: "· 2nd" / "• 3rd" anywhere in the line. Because this uses
    # re.search, it also covers a standalone bullet-degree line, so the
    # previously separate anchored pattern was unreachable dead code and
    # has been removed.
    match = re.search(r'[·•]\s*(1st|2nd|3rd\+?)', text)
    if match:
        degree = match.group(1)
        # Bullet fragments show "3rd" for what are actually 3rd+ connections.
        return '3rd+' if degree == '3rd' else degree
    return None
def extract_name_from_degree_line(line: str) -> str:
    """Strip degree/status decorations from a line, leaving just the name.

    Examples:
        "John Doe 2nd degree connection · 2nd"  -> "John Doe"
        "Jane Roe 3rd+ degree connection"       -> "Jane Roe"
        "Anna Smith is open to work"            -> "Anna Smith"
    """
    # Remove "<n>(st|nd|rd)(+) degree connection ..." suffixes. The ordinal
    # suffix and the trailing '+' are matched separately so that "3rd+"
    # (ordinal AND plus) is stripped correctly; the previous pattern
    # "(?:st|nd|rd|\+)?" treated them as mutually exclusive alternatives
    # and therefore failed to strip "3rd+ degree connection".
    name = re.sub(r'\s*\d+(?:st|nd|rd)?\+?\s*degree\s+connection.*$', '',
                  line.strip(), flags=re.IGNORECASE)
    # Remove a trailing bullet-degree fragment like "· 2nd".
    name = re.sub(r'\s*[·•]\s*(1st|2nd|3rd\+?)$', '', name)
    # Remove emoji status indicators sometimes pasted next to names.
    name = re.sub(r'\s*[🟥🟦🟧🟩🟨⬛⬜🏛️]+\s*', ' ', name)
    # Remove LinkedIn's "is open to work" badge text.
    name = re.sub(r'\s+is open to work$', '', name, flags=re.IGNORECASE)
    return name.strip()
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """Classify a headline as heritage-relevant and, when possible, typed.

    Returns (relevant, type_code) where type_code is a GLAMORCUBESFIXPHDNT
    letter or None. Non-heritage keywords veto the headline outright;
    otherwise typed keyword lists are scanned in a fixed specificity order,
    with a generic cultural-sector fallback that marks the headline
    relevant but untyped.
    """
    text = headline.lower()
    # Veto: commercial/non-heritage signals win over everything else.
    if any(kw.lower() in text for kw in NON_HERITAGE_KEYWORDS):
        return (False, None)
    # Typed match: scan more specific types before generic ones.
    for type_code in ('A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C'):
        if any(kw.lower() in text for kw in HERITAGE_KEYWORDS.get(type_code, [])):
            return (True, type_code)
    # Untyped fallback: broad cultural-sector vocabulary.
    generic_terms = (
        'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
        'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
        'preservation', 'conservation', 'behoud', 'restauratie',
    )
    if any(term in text for term in generic_terms):
        return (True, None)
    return (False, None)
def extract_custodian_metadata(lines: list[str]) -> dict[str, Any]:
    """
    Extract custodian organization metadata from the header section.

    Expected patterns:
    - "Collectie Overijssel logo"
    - "Collectie Overijssel"
    - "Met het heden je verleden in" (description/tagline)
    - "Museums, Historical Sites, and Zoos" (industry)
    - "Zwolle, Overijssel" (location)
    - "2K followers"
    - "51-200 employees"
    - "58 associated members"
    """
    # Industry terms are matched as case-insensitive substrings; hoisted
    # out of the loop since they never change.
    industry_terms = (
        'Museums', 'Archives', 'Libraries', 'Historical Sites',
        'Government', 'Cultural', 'Heritage', 'Education',
        'Research', 'Non-profit', 'Zoos',
    )
    meta: dict[str, Any] = {}
    # Only the first 30 lines belong to the page header.
    for raw in lines[:30]:
        entry = raw.strip()
        if not entry:
            continue
        # Logo line carries the organization name.
        if entry.endswith(' logo'):
            meta['name'] = entry[:-5].strip()
            continue
        if (m := re.match(r'^([\d,\-]+)\s*employees?$', entry, re.IGNORECASE)):
            meta['employee_count'] = m.group(1)
            continue
        if (m := re.match(r'^([\d,\.]+K?)\s*followers?$', entry, re.IGNORECASE)):
            meta['follower_count'] = m.group(1)
            continue
        if (m := re.match(r'^(\d+)\s*associated\s+members?$', entry, re.IGNORECASE)):
            meta['associated_members'] = int(m.group(1))
            continue
        # First line containing an industry term wins.
        if any(term.lower() in entry.lower() for term in industry_terms):
            meta.setdefault('industry', entry)
            continue
        # Location: "City, Region" with capitalized components.
        if 'location' not in meta and (m := re.match(
                r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', entry)):
            meta['location'] = {
                'city': m.group(1),
                'region': m.group(2),
            }
            continue
    return meta
def is_likely_name_line(line: str) -> bool:
    """
    Check whether a line looks like a person's name.

    Heuristics:
    - not empty, at most 60 characters
    - no obvious UI/degree/count patterns
    - first character is alphabetic (digits/symbols are rejected;
      lowercase letters are still accepted)
    - 1 to 6 whitespace-separated words
    """
    candidate = line.strip()
    if not candidate or len(candidate) > 60:
        return False
    reject_patterns = (
        r'^Page \d+',
        r'^\d+\s*(st|nd|rd|th)',
        r'degree connection',
        r'mutual connection',
        r'followers?$',
        r'employees?$',
        r'^Search',
        r'^Where they',
        r'^What they',
        r'work here$',
        r'^Connect$',
        r'^Message$',
        r'^Follow$',
        r'logo$',
    )
    if any(re.search(pat, candidate, re.IGNORECASE) for pat in reject_patterns):
        return False
    # Reject lines starting with digits or punctuation; note this deliberately
    # lets lowercase first letters through (e.g. Dutch name prefixes).
    head = candidate[0]
    if not head.isupper() and not head.isalpha():
        return False
    # Names typically span 1-6 words.
    return 1 <= len(candidate.split()) <= 6
def parse_staff_file(filepath: Path, custodian_name: str, custodian_slug: str) -> tuple[list[dict], dict]:
    """
    Parse a LinkedIn company staff page raw text file.

    The file structure has TWO formats:

    Format 1 (Company People page - Collectie Overijssel style):
        Name (line N)
        Name (line N+1, duplicate - optional)
        2nd degree connection · 2nd (line N+2 - STANDALONE degree line)
        Headline (line N+3)
        Mutual connections (line N+4)

    Format 2 (Nationaal Archief style):
        Name (line N)
        Name 2nd degree connection (line N+1 - name WITH degree)
        · 2nd (line N+2)
        Headline (line N+3)
        Mutual connections (line N+4)
        Connect (action button)

    Args:
        filepath: Path to the raw staff file
        custodian_name: Name of the custodian organization
        custodian_slug: Slug for generating staff IDs

    Returns:
        Tuple of (staff_list, custodian_metadata)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    # Extract custodian metadata from the header block at the top of the file.
    custodian_metadata = extract_custodian_metadata(lines)
    if 'name' not in custodian_metadata:
        custodian_metadata['name'] = custodian_name
    staff: list[dict[str, Any]] = []
    seen_names: set[str] = set()          # dedupe: each visible name is kept once
    staff_index = 0                       # running index baked into staff_id
    # Track anonymous members separately to assign unique IDs
    anonymous_count = 0
    i = 0
    # Single forward scan; each branch advances i itself before `continue`.
    while i < len(lines):
        line = lines[i].strip()
        # PATTERN A: "LinkedIn Member" entries (anonymous, no degree line).
        # These appear outside the viewer's connection network.
        if line == 'LinkedIn Member':
            # Check if next line is a headline (job title) or placeholder.
            headline_line = ''
            lines_to_skip = 1  # At minimum, skip the "LinkedIn Member" line
            if i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                # Placeholder headlines LinkedIn emits when no title is shown.
                is_placeholder_headline = next_line in ('--', '-- ', '-', '.', 'notitle', '')
                # A "real" headline either mentions the custodian or contains
                # an employment preposition (Dutch "bij"/"voor", English "at").
                custodian_keywords = custodian_name.lower().split()
                is_relevant_headline = (
                    any(kw in next_line.lower() for kw in custodian_keywords) or
                    any(kw in next_line.lower() for kw in ['bij', 'at', 'voor'])
                )
                if is_placeholder_headline:
                    headline_line = ''   # No headline available
                    lines_to_skip = 2    # Skip both LinkedIn Member and placeholder line
                elif is_relevant_headline:
                    headline_line = next_line
                    lines_to_skip = 2    # Skip both LinkedIn Member and headline
                else:
                    # Next line is not a headline (maybe the start of a new
                    # entry) - member has no headline; consume only one line.
                    headline_line = ''
                    lines_to_skip = 1
            # Always create a member record for LinkedIn Member entries, even
            # without a headline; they still count toward staff totals.
            anonymous_count += 1
            anonymous_id = f"anonymous_{anonymous_count:04d}"
            staff_id = generate_staff_id(anonymous_id, staff_index, custodian_slug)
            staff_index += 1
            member = {
                'staff_id': staff_id,
                'name': f"LinkedIn Member #{anonymous_count}",
                'name_type': 'anonymous',
                'degree': 'outside_network',  # no degree line = outside connection circles
                'heritage_relevant': False,   # may be updated below from the headline
            }
            # Add headline (and heritage classification) only if we have one.
            if headline_line:
                member['headline'] = headline_line
                is_relevant, heritage_type = detect_heritage_type(headline_line)
                member['heritage_relevant'] = is_relevant
                if heritage_type:
                    member['heritage_type'] = heritage_type
            staff.append(member)
            i += lines_to_skip
            continue
        # PATTERN B: Regular entries with degree lines.
        degree = parse_degree(line)
        if degree:
            # Try to extract the name from THIS line first
            # (Format 2: "Name 2nd degree connection").
            name = extract_name_from_degree_line(line)
            # If no valid name on this line, look BACK for the name (Format 1).
            # Check: empty, same as original line, OR not a valid name pattern.
            if not name or name == line or not is_likely_name_line(name):
                name = None  # Reset to ensure we look back
                # The name should sit 1-3 lines above the degree line.
                for lookback in range(1, 4):
                    if i - lookback >= 0:
                        prev_line = lines[i - lookback].strip()
                        if prev_line and is_likely_name_line(prev_line):
                            # Remove any trailing "is open to work" badge text.
                            name = re.sub(r'\s+is open to work$', '', prev_line, flags=re.IGNORECASE)
                            break
            # Skip if we couldn't find a valid name.
            if not name or not is_likely_name_line(name):
                i += 1
                continue
            # Skip duplicates (same person listed twice on the page).
            if name in seen_names:
                i += 1
                continue
            # Skip if name matches the custodian name (the org's own entry).
            if name.lower() == custodian_name.lower():
                i += 1
                continue
            # Classify how much of the name is disclosed.
            if is_anonymous_name(name):
                name_type = 'anonymous'
            elif is_abbreviated_name(name):
                name_type = 'abbreviated'
            else:
                name_type = 'full'
            # Generate unique staff ID.
            staff_id = generate_staff_id(name, staff_index, custodian_slug)
            staff_index += 1
            # Build the staff member record.
            member: dict[str, Any] = {
                'staff_id': staff_id,
                'name': name,
                'name_type': name_type,
                'degree': degree,
            }
            i += 1  # Move past the degree line
            # Format 2 places a separate "· 2nd" bullet line next - skip it.
            if i < len(lines) and re.match(r'^[·•]\s*(1st|2nd|3rd\+?)$', lines[i].strip()):
                i += 1
            # Skip empty lines before the headline.
            while i < len(lines) and not lines[i].strip():
                i += 1
            # Next non-empty line should be the headline (job title).
            if i < len(lines):
                headline_line = lines[i].strip()
                # Make sure it's not noise or the start of another person's
                # entry. NOTE: do NOT filter by is_likely_name_line here -
                # headlines can legitimately look like names!
                if (not is_noise_line(headline_line) and
                        not parse_degree(headline_line) and
                        not is_action_button(headline_line) and
                        not is_mutual_connections_line(headline_line) and
                        not is_follower_count(headline_line) and
                        headline_line not in ('-', '.')):  # skip placeholder headlines
                    member['headline'] = headline_line
                    i += 1  # consume the headline only when accepted
            # Skip ahead to mutual-connections info or the next entry.
            while i < len(lines):
                check_line = lines[i].strip()
                # Capture mutual connections info verbatim.
                if is_mutual_connections_line(check_line):
                    member['mutual_connections'] = check_line
                    i += 1
                    continue
                # Stop (without consuming) at the next staff member: either a
                # degree pattern or a "LinkedIn Member" placeholder.
                if parse_degree(check_line) or check_line == 'LinkedIn Member':
                    break
                # Anything else (action buttons, noise) is skipped.
                i += 1
            # Classify heritage relevance from the captured headline.
            headline = member.get('headline', '')
            if headline:
                is_relevant, heritage_type = detect_heritage_type(headline)
                member['heritage_relevant'] = is_relevant
                if heritage_type:
                    member['heritage_type'] = heritage_type
            else:
                member['heritage_relevant'] = False
            staff.append(member)
            seen_names.add(name)
        else:
            # Neither a LinkedIn Member line nor a degree line - move on.
            i += 1
    return staff, custodian_metadata
def compute_staff_analysis(staff: list[dict]) -> dict:
    """Compute summary statistics over the extracted staff records.

    Returns counts by heritage type, connection degree, and name type,
    plus the ten most common role keywords found in headlines.
    """
    total = len(staff)
    relevant = [m for m in staff if m.get('heritage_relevant', False)]
    # Distribution of heritage-type codes among the relevant subset.
    type_counts: Counter[str] = Counter(
        m['heritage_type'] for m in relevant if m.get('heritage_type')
    )
    # Distributions over the full staff list.
    degree_counts: Counter[str] = Counter(m.get('degree', 'unknown') for m in staff)
    name_type_counts: Counter[str] = Counter(m.get('name_type', 'unknown') for m in staff)
    # Count role keywords appearing in headlines (case-insensitive substring).
    role_keywords = (
        'directeur', 'director', 'manager', 'coordinator', 'coördinator',
        'adviseur', 'advisor', 'medewerker', 'specialist', 'archivist',
        'archivaris', 'historicus', 'historian', 'curator', 'conservator',
        'beheerder', 'onderzoeker', 'researcher', 'projectleider',
    )
    role_counts: Counter[str] = Counter()
    for m in staff:
        headline = m.get('headline', '').lower()
        if not headline:
            continue
        for keyword in role_keywords:
            if keyword.lower() in headline:
                role_counts[keyword.title()] += 1
    relevant_count = len(relevant)
    percentage = round(relevant_count / total * 100, 1) if total > 0 else 0
    return {
        'total_staff_extracted': total,
        'heritage_relevant_count': relevant_count,
        'heritage_relevant_percentage': percentage,
        'staff_by_heritage_type': dict(type_counts),
        'staff_by_degree': dict(degree_counts),
        'staff_by_name_type': dict(name_type_counts),
        'common_roles': dict(role_counts.most_common(10)),
    }
def create_output(
    staff: list[dict],
    custodian_metadata: dict,
    custodian_name: str,
    custodian_slug: str,
    input_file: Path,
) -> dict:
    """Assemble the full output JSON structure.

    Combines custodian metadata, the staff list, computed analysis, and
    provenance. The scrape timestamp is recovered from a compact stamp in
    the input filename ("20251210T0055" or "20251210T005500"); when absent
    or of unexpected length, the current UTC time is used instead.
    """
    analysis = compute_staff_analysis(staff)
    # Default to "now" in UTC; overridden below when the filename carries
    # a usable timestamp.
    scraped_ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    stamp_match = re.search(r'(\d{8}T\d{4,6})', input_file.name)
    if stamp_match:
        stamp = stamp_match.group(1)
        date_part = f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:8]}"
        if len(stamp) == 13:    # YYYYMMDDTHHMM (e.g. 20251210T0055)
            scraped_ts = f"{date_part}T{stamp[9:11]}:{stamp[11:13]}:00Z"
        elif len(stamp) == 15:  # YYYYMMDDTHHMMSS (e.g. 20251210T005500)
            scraped_ts = f"{date_part}T{stamp[9:11]}:{stamp[11:13]}:{stamp[13:15]}Z"
    return {
        'custodian_metadata': {
            'custodian_name': custodian_name,
            'custodian_slug': custodian_slug,
            **custodian_metadata,
        },
        'source_metadata': {
            'source_type': 'linkedin_company_people_page',
            'registered_timestamp': scraped_ts,
            'registration_method': 'manual_linkedin_browse',
            'staff_extracted': len(staff),
            'notes': f"Staff extracted from LinkedIn company People page. Raw register in {input_file.name}"
        },
        'staff': staff,
        'staff_analysis': analysis,
        'provenance': {
            'data_source': 'LINKEDIN_MANUAL_REGISTER',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': scraped_ts,
            'extraction_method': 'manual_browse_copy_paste',
            'raw_source_file': input_file.name,
            'processed_by': 'parse_custodian_staff.py',
            'processing_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        }
    }
def main():
    """CLI entry point: parse one raw staff file and write/preview the JSON."""
    parser = argparse.ArgumentParser(
        description='Parse LinkedIn company staff pages from raw manual register files.'
    )
    parser.add_argument('input_file', type=Path, help='Input raw text file')
    parser.add_argument('output_file', type=Path, help='Output JSON file')
    parser.add_argument('--custodian-name', required=True, help='Name of the custodian organization')
    parser.add_argument('--custodian-slug', required=True, help='Slug for generating staff IDs')
    parser.add_argument('--dry-run', action='store_true', help='Parse but do not write output')
    args = parser.parse_args()
    # Fail fast with a clear message rather than an open() traceback.
    if not args.input_file.exists():
        print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
        sys.exit(1)
    print(f"Parsing staff from: {args.input_file}")
    staff, custodian_metadata = parse_staff_file(
        args.input_file,
        args.custodian_name,
        args.custodian_slug
    )
    print(f"Extracted {len(staff)} unique staff members")
    if custodian_metadata:
        print(f"\nCustodian Metadata:")
        for key, value in custodian_metadata.items():
            print(f"  {key}: {value}")
    output = create_output(
        staff,
        custodian_metadata,
        args.custodian_name,
        args.custodian_slug,
        args.input_file,
    )
    # Summarize the computed analysis on stdout regardless of --dry-run.
    analysis = output['staff_analysis']
    print(f"\nStaff Analysis:")
    print(f"  Total staff: {analysis['total_staff_extracted']}")
    print(f"  Heritage-relevant: {analysis['heritage_relevant_count']} ({analysis['heritage_relevant_percentage']}%)")
    print(f"  By type: {analysis['staff_by_heritage_type']}")
    print(f"  By degree: {analysis['staff_by_degree']}")
    print(f"  By name type: {analysis['staff_by_name_type']}")
    if analysis['common_roles']:
        print(f"  Common roles:")
        for role, count in list(analysis['common_roles'].items())[:5]:
            print(f"    - {role}: {count}")
    if args.dry_run:
        # Preview mode: show a small sample of parsed records, write nothing.
        print("\n[Dry run - not writing output]")
        print("\nSample staff (first 5):")
        for s in staff[:5]:
            print(f"  - {s['name']} ({s['degree']})")
            print(f"    Headline: {s.get('headline', 'N/A')[:60]}")
            print(f"    Heritage: {s.get('heritage_relevant', False)} ({s.get('heritage_type', '-')})")
    else:
        # Create parent directories as needed; keep non-ASCII names readable
        # in the JSON (ensure_ascii=False).
        args.output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
        print(f"\nWrote output to: {args.output_file}")
if __name__ == '__main__':
    main()