#!/usr/bin/env python3
"""
Parse LinkedIn company staff pages from raw manual register files.

This script processes raw text exports from LinkedIn company "People" pages
and extracts structured staff data for heritage custodian institutions.
The output follows Rule 15 (Connection Data Registration) patterns but
adapted for custodian staff rather than individual connections.

Usage:
    python scripts/parse_custodian_staff.py \
        --custodian-name "Name" --custodian-slug "slug"

Example:
    python scripts/parse_custodian_staff.py \
        data/custodian/person/manual_hc/collectie_overijssel-20251210T0055.md \
        data/custodian/person/collectie_overijssel_staff_20251210T0055.json \
        --custodian-name "Collectie Overijssel" \
        --custodian-slug "collectie-overijssel"
"""
import argparse
import json
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy.
# Keys are single-letter heritage-type codes; values are lowercase/exact
# substrings searched for in a person's headline (case-insensitive match
# is done in detect_heritage_type). Trailing spaces on acronyms ('KB ',
# 'VU ') are deliberate to avoid matching inside longer words.
HERITAGE_KEYWORDS = {
    # G - Gallery
    'G': [
        'gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery',
        'exhibition space', 'tentoonstellingsruimte'
    ],
    # L - Library
    'L': [
        'library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris',
        'KB ', 'national library', 'universiteitsbiblio', 'UB '
    ],
    # A - Archive
    'A': [
        'archive', 'archief', 'archivist', 'archivaris', 'archival',
        'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum',
        'eye film', 'EYE ', 'audiovisual', 'audiovisueel',
        'sound and vision', 'nationaal archief', 'stadsarchief',
        'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG',
        'archiefspecialist', 'archiefmedewerker', 'archiefinspecteur'
    ],
    # M - Museum
    'M': [
        'museum', 'musea', 'curator', 'conservator', 'collection manager',
        'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
        'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
        'collectiespecialist', 'collectie'
    ],
    # O - Official Institution
    'O': [
        'ministry', 'ministerie', 'government', 'overheid', 'gemeente',
        'province', 'provincie', 'OCW', 'ministerie van'
    ],
    # R - Research Center
    'R': [
        'research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW',
        'humanities cluster', 'NWO', 'think tank', 'documentatie',
        'documentation', 'kenniscentrum', 'historicus'
    ],
    # C - Corporation (Corporate heritage)
    'C': [
        'corporate archive', 'bedrijfsarchief', 'company history',
        'shell', 'philips', 'heineken'
    ],
    # E - Education Provider
    'E': [
        'university', 'universiteit', 'professor', 'lecturer', 'docent',
        'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate',
        'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ',
        'leiden university', 'utrecht university', 'UU ', 'TU ',
        'reinwardt', 'film academy', 'filmacademie', 'graduate',
        'assistant professor', 'associate professor', 'hoogleraar',
        'educatie', 'educator'
    ],
    # S - Collecting Society
    'S': [
        'society', 'vereniging', 'genootschap', 'historical society',
        'historische vereniging', 'heemkunde'
    ],
    # D - Digital Platform
    'D': [
        'digital', 'digitaal', 'platform', 'software', 'IT ', 'tech',
        'developer', 'engineer', 'data ', 'AI ', 'machine learning',
        'digitalisering', 'datamanagement', 'data analist'
    ],
}

# Non-heritage keywords (to mark as heritage_relevant=False).
# Checked BEFORE the heritage keywords, so e.g. a "marketing" headline is
# excluded even if it also mentions a museum.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter',
    'finance', 'accounting', 'legal', 'lawyer', 'advocaat',
    'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate',
    'insurance', 'banking', 'investment', 'e-commerce',
    'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Lines that indicate LinkedIn UI noise (to skip entirely).
# Exact, case-sensitive matches against the stripped line.
NOISE_EXACT = {
    '0 notifications', 'Search', 'Home', 'My Network', 'Jobs',
    'Messaging', 'Notifications', 'Me', 'For Business', 'Learning',
    'People', '1st', '2nd', '3rd+', 'Locations', 'Current companies',
    'All filters', 'Reset', 'Connect', 'Message', 'Follow',
    'Previous', 'Next', 'About', 'Accessibility', 'Help Center',
    'Privacy & Terms', 'Ad Choices', 'Advertising', 'Business Services',
    'Get the LinkedIn app', 'More', 'Compose message', 'Actively hiring',
    'Posts', 'Insights', 'Where they live', 'Where they studied',
    'What they do', 'People you may know',
}

# Regex patterns (matched case-insensitively from line start) for noise
# lines that vary, e.g. counts and pagination.
NOISE_PATTERNS = [
    r'^\d+$',  # Just a number
    r'^\d+ notifications?$',
    r'^LinkedIn Corporation',
    r'^You are on the messaging overlay',
    r'Status is online$',
    r'^MessagingYou are on the messaging',
    r'^Are these results helpful',
    r'^Your feedback helps',
    r'^\d+K? followers?$',
    r'^Page \d+ of \d+$',
    r'^Search employees by',
    r'^\d+ associated members$',
    r'logo$',
]


def is_noise_line(line: str) -> bool:
    """Check if a line is LinkedIn UI noise that should be skipped.

    Empty lines count as noise; otherwise the stripped line is tested
    against NOISE_EXACT and then each NOISE_PATTERNS regex.
    """
    line = line.strip()
    if not line:
        return True
    if line in NOISE_EXACT:
        return True
    for pattern in NOISE_PATTERNS:
        if re.match(pattern, line, re.IGNORECASE):
            return True
    return False


def is_action_button(line: str) -> bool:
    """Check if line is an action button."""
    return line.strip() in ('Connect', 'Message', 'Follow')


def is_mutual_connections_line(line: str) -> bool:
    """Check if line describes mutual connections."""
    patterns = [
        r'mutual connections?$',
        r'is a mutual connection$',
        r'are mutual connections$',
        r'other connection[s]? work here$',
    ]
    for pattern in patterns:
        if re.search(pattern, line, re.IGNORECASE):
            return True
    return False


def is_follower_count(line: str) -> bool:
    """Check if line is a follower count."""
    return bool(re.match(r'^[\d,\.]+K?\s*followers?$', line.strip(), re.IGNORECASE))


def is_employee_count(line: str) -> bool:
    """Check if line is an employee count."""
    return bool(re.match(r'^[\d,\-]+ employees?$', line.strip(), re.IGNORECASE))


def is_anonymous_name(name: str) -> bool:
    """Check if name is an anonymous LinkedIn Member."""
    anonymous_patterns = [
        r'^linkedin\s*member$',
        r'^member$',
        r'^anonymous$',
    ]
    name_lower = name.lower().strip()
    return any(re.match(p, name_lower) for p in anonymous_patterns)


def is_abbreviated_name(name: str) -> bool:
    """
    Check if name contains abbreviations (privacy-protected).

    Patterns detected:
    - "Amy B." (first name + single initial)
    - "Elisabeth V." (ends with initial)
    - "Tina M. Bastajian" (middle initial)
    - "S. Buse Yildirim" (first initial)
    """
    parts = name.split()
    if not parts:
        return False
    for part in parts:
        clean_part = part.rstrip('.')
        # Single alphabetic character (with or without dot) = initial
        if len(clean_part) <= 1 and clean_part.isalpha():
            return True
        if part.endswith('.') and len(part) <= 2:
            return True
    return False


def generate_staff_id(name: str, index: int, custodian_slug: str) -> str:
    """
    Generate a unique identifier for a staff member.

    Format: {custodian_slug}_staff_{index:04d}_{name_slug}

    Examples:
    - collectie-overijssel_staff_0001_vincent_robijn
    - nationaal-archief_staff_0042_afelonne_doek
    """
    # Normalize unicode and convert to ASCII-safe slug: NFD splits base
    # letters from combining accents, which are then dropped (category 'Mn')
    normalized = unicodedata.normalize('NFD', name.lower())
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Replace spaces and special chars with underscores
    name_slug = re.sub(r'[^a-z0-9]+', '_', ascii_name)
    name_slug = re.sub(r'_+', '_', name_slug).strip('_')
    # Truncate if too long
    if len(name_slug) > 30:
        name_slug = name_slug[:30].rstrip('_')
    return f"{custodian_slug}_staff_{index:04d}_{name_slug}"


def parse_degree(text: str) -> Optional[str]:
    """Extract connection degree from line.

    Handles formats:
    - "2nd degree connection · 2nd"
    - "3rd+ degree connection · 3rd" (note: 3rd+ in first part, 3rd in second)
    - "· 2nd" or "• 2nd"
    - Standalone "· 3rd"

    Returns '1st', '2nd', '3rd+' or None if no degree is present.
    """
    # Pattern 1: "Name 2nd degree connection" or "3rd+ degree connection"
    # The key fix: rd\+? allows "3rd" OR "3rd+" - the + is optional after rd
    match = re.search(r'(\d+(?:st|nd|rd\+?))\s*degree\s+connection', text, re.IGNORECASE)
    if match:
        degree = match.group(1).lower()
        # Normalize: "3rd+" stays as "3rd+"
        if degree == '3rd+':
            return '3rd+'
        return degree
    # Pattern 2: "· 2nd" or "• 2nd" or "· 3rd"
    match = re.search(r'[·•]\s*(1st|2nd|3rd\+?)', text)
    if match:
        degree = match.group(1)
        # Normalize: "3rd" from "· 3rd" should be "3rd+" (these are always
        # 3rd+ connections)
        if degree == '3rd':
            return '3rd+'
        return degree
    # Pattern 3: Standalone "· 2nd"
    match = re.match(r'^[·•]\s*(1st|2nd|3rd\+?)$', text.strip())
    if match:
        degree = match.group(1)
        if degree == '3rd':
            return '3rd+'
        return degree
    return None


def extract_name_from_degree_line(line: str) -> str:
    """Extract just the name from a line like 'John Doe 2nd degree connection · 2nd'."""
    # Remove degree suffix patterns.
    # FIX: the ordinal suffix and the trailing '+' must be matched separately
    # ("3rd+" = "3" + "rd" + "+"); the previous alternation (?:st|nd|rd|\+)?
    # consumed only one of them, so "Name 3rd+ degree connection" was never
    # stripped.
    name = re.sub(r'\s*\d+(?:st|nd|rd)?\+?\s*degree\s+connection.*$', '',
                  line.strip(), flags=re.IGNORECASE)
    name = re.sub(r'\s*[·•]\s*(1st|2nd|3rd\+?)$', '', name)
    # Remove emoji indicators
    name = re.sub(r'\s*[🟥🟦🟧🟩🟨⬛⬜🏛️]+\s*', ' ', name)
    # Remove "is open to work" suffix
    name = re.sub(r'\s+is open to work$', '', name, flags=re.IGNORECASE)
    return name.strip()


def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """Detect if a headline is heritage-relevant and what type.

    Returns (heritage_relevant, heritage_type). heritage_type is a
    GLAMORCUBESFIXPHDNT single-letter code, or None when the headline is
    only generically heritage-related (or not relevant at all).
    """
    headline_lower = headline.lower()
    # Check for non-heritage indicators first
    for keyword in NON_HERITAGE_KEYWORDS:
        if keyword.lower() in headline_lower:
            return (False, None)
    # Check heritage keywords by type (order matters - more specific first)
    type_order = ['A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C']
    for heritage_type in type_order:
        keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
        for keyword in keywords:
            if keyword.lower() in headline_lower:
                return (True, heritage_type)
    # Generic heritage terms
    generic_heritage = [
        'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
        'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
        'preservation', 'conservation', 'behoud', 'restauratie'
    ]
    for keyword in generic_heritage:
        if keyword in headline_lower:
            return (True, None)
    return (False, None)


def extract_custodian_metadata(lines: list[str]) -> dict[str, Any]:
    """
    Extract custodian organization metadata from the header section.

    Expected patterns:
    - "Collectie Overijssel logo"
    - "Collectie Overijssel"
    - "Met het heden je verleden in" (description/tagline)
    - "Museums, Historical Sites, and Zoos" (industry)
    - "Zwolle, Overijssel" (location)
    - "2K followers"
    - "51-200 employees"
    - "58 associated members"
    """
    metadata: dict[str, Any] = {}
    for line in lines[:30]:  # Only check first 30 lines for header
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # Logo line - extract name
        if line.endswith(' logo'):
            metadata['name'] = line[:-5].strip()
            continue
        # Employee count
        employee_match = re.match(r'^([\d,\-]+)\s*employees?$', line, re.IGNORECASE)
        if employee_match:
            metadata['employee_count'] = employee_match.group(1)
            continue
        # Follower count
        follower_match = re.match(r'^([\d,\.]+K?)\s*followers?$', line, re.IGNORECASE)
        if follower_match:
            metadata['follower_count'] = follower_match.group(1)
            continue
        # Associated members count
        member_match = re.match(r'^(\d+)\s*associated\s+members?$', line, re.IGNORECASE)
        if member_match:
            metadata['associated_members'] = int(member_match.group(1))
            continue
        # Industry detection (common patterns)
        industry_patterns = [
            'Museums', 'Archives', 'Libraries', 'Historical Sites',
            'Government', 'Cultural', 'Heritage', 'Education',
            'Research', 'Non-profit', 'Zoos'
        ]
        if any(p.lower() in line.lower() for p in industry_patterns):
            if 'industry' not in metadata:
                metadata['industry'] = line
            continue
        # Location pattern: "City, Region" or "City, Country"
        loc_match = re.match(
            r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', line)
        if loc_match and 'location' not in metadata:
            metadata['location'] = {
                'city': loc_match.group(1),
                'region': loc_match.group(2)
            }
            continue
    return metadata


def is_likely_name_line(line: str) -> bool:
    """
    Check if a line looks like a person's name.

    Patterns:
    - Capitalized words (proper nouns)
    - Contains spaces (first + last name)
    - Not too long (names rarely exceed 50 chars)
    - Doesn't contain obvious non-name patterns
    """
    line = line.strip()
    if not line or len(line) > 60:
        return False
    # Skip obvious non-names
    non_name_patterns = [
        r'^Page \d+',
        r'^\d+\s*(st|nd|rd|th)',
        r'degree connection',
        r'mutual connection',
        r'followers?$',
        r'employees?$',
        r'^Search',
        r'^Where they',
        r'^What they',
        r'work here$',
        r'^Connect$',
        r'^Message$',
        r'^Follow$',
        r'logo$',
    ]
    for pattern in non_name_patterns:
        if re.search(pattern, line, re.IGNORECASE):
            return False
    # Names typically start with capital letter; lowercase letters are still
    # allowed (e.g. tussenvoegsel-first Dutch names) - only non-alphabetic
    # starts are rejected.
    if not line[0].isupper() and not line[0].isalpha():
        return False
    # Check for reasonable name structure
    # Most names have 2-5 words
    words = line.split()
    if len(words) < 1 or len(words) > 6:
        return False
    return True


def parse_staff_file(filepath: Path, custodian_name: str,
                     custodian_slug: str) -> tuple[list[dict], dict]:
    """
    Parse a LinkedIn company staff page raw text file.

    The file structure has TWO formats:

    Format 1 (Company People page - Collectie Overijssel style):
        Name (line N)
        Name (line N+1, duplicate - optional)
        2nd degree connection · 2nd (line N+2 - STANDALONE degree line)
        Headline (line N+3)
        Mutual connections (line N+4)

    Format 2 (Nationaal Archief style):
        Name (line N)
        Name 2nd degree connection (line N+1 - name WITH degree)
        · 2nd (line N+2)
        Headline (line N+3)
        Mutual connections (line N+4)
        Connect (action button)

    Args:
        filepath: Path to the raw staff file
        custodian_name: Name of the custodian organization
        custodian_slug: Slug for generating staff IDs

    Returns:
        Tuple of (staff_list, custodian_metadata)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]

    # Extract custodian metadata from header
    custodian_metadata = extract_custodian_metadata(lines)
    if 'name' not in custodian_metadata:
        custodian_metadata['name'] = custodian_name

    staff: list[dict[str, Any]] = []
    seen_names: set[str] = set()
    staff_index = 0
    # Track anonymous members separately to assign unique IDs
    anonymous_count = 0

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # PATTERN A: "LinkedIn Member" entries (anonymous, no degree line)
        # These appear outside the viewer's connection network
        if line == 'LinkedIn Member':
            # Check if next line is a headline (job title) or placeholder
            headline_line = ''
            lines_to_skip = 1  # At minimum, skip the "LinkedIn Member" line
            if i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                # Check if next line is a placeholder headline (empty/dash patterns)
                is_placeholder_headline = next_line in ('--', '-- ', '-', '.', 'notitle', '')
                # Check if it's a valid headline (contains custodian keywords
                # or job indicators).
                # NOTE(review): these are substring checks - 'at' also matches
                # inside words like "data"; confirm intent before tightening.
                custodian_keywords = custodian_name.lower().split()
                is_relevant_headline = (
                    any(kw in next_line.lower() for kw in custodian_keywords)
                    or any(kw in next_line.lower() for kw in ['bij', 'at', 'voor'])
                )
                # If placeholder, treat as empty headline but still include the member
                if is_placeholder_headline:
                    headline_line = ''  # No headline available
                    lines_to_skip = 2  # Skip both LinkedIn Member and placeholder line
                elif is_relevant_headline:
                    headline_line = next_line
                    lines_to_skip = 2  # Skip both LinkedIn Member and headline
                else:
                    # Next line is not a headline (maybe start of new entry)
                    # - member has no headline
                    headline_line = ''
                    lines_to_skip = 1  # Only skip LinkedIn Member line
            # Always create member record for LinkedIn Member entries
            anonymous_count += 1
            anonymous_id = f"anonymous_{anonymous_count:04d}"
            staff_id = generate_staff_id(anonymous_id, staff_index, custodian_slug)
            staff_index += 1
            member = {
                'staff_id': staff_id,
                'name': f"LinkedIn Member #{anonymous_count}",
                'name_type': 'anonymous',
                'degree': 'outside_network',  # No degree = outside connection circles
                'heritage_relevant': False,  # Will be updated below
            }
            # Add headline only if we have one
            if headline_line:
                member['headline'] = headline_line
            # Process heritage relevance
            is_relevant, heritage_type = detect_heritage_type(headline_line)
            member['heritage_relevant'] = is_relevant
            if heritage_type:
                member['heritage_type'] = heritage_type
            staff.append(member)
            i += lines_to_skip
            continue

        # PATTERN B: Regular entries with degree lines
        degree = parse_degree(line)
        if degree:
            # Try to extract name from THIS line first (Format 2: "Name 2nd degree connection")
            name = extract_name_from_degree_line(line)
            # If no valid name on this line, look BACK for the name (Format 1)
            # Check: empty, same as original line, OR not a valid name pattern
            if not name or name == line or not is_likely_name_line(name):
                name = None  # Reset to ensure we look back
                # Look back for the name - it should be 1-2 lines above
                for lookback in range(1, 4):
                    if i - lookback >= 0:
                        prev_line = lines[i - lookback].strip()
                        if prev_line and is_likely_name_line(prev_line):
                            # Remove any trailing "is open to work" etc
                            name = re.sub(r'\s+is open to work$', '', prev_line,
                                          flags=re.IGNORECASE)
                            break
            # Skip if we couldn't find a valid name
            if not name or not is_likely_name_line(name):
                i += 1
                continue
            # Skip duplicates
            if name in seen_names:
                i += 1
                continue
            # Skip if name matches custodian name (org's own entry)
            if name.lower() == custodian_name.lower():
                i += 1
                continue
            # Determine name type
            if is_anonymous_name(name):
                name_type = 'anonymous'
            elif is_abbreviated_name(name):
                name_type = 'abbreviated'
            else:
                name_type = 'full'
            # Generate unique staff ID
            staff_id = generate_staff_id(name, staff_index, custodian_slug)
            staff_index += 1
            # Build staff member record
            member: dict[str, Any] = {
                'staff_id': staff_id,
                'name': name,
                'name_type': name_type,
                'degree': degree,
            }
            i += 1  # Move past degree line
            # Check if next line is just "· 2nd" (separate degree line) - skip it
            if i < len(lines) and re.match(r'^[·•]\s*(1st|2nd|3rd\+?)$', lines[i].strip()):
                i += 1
            # Skip empty lines
            while i < len(lines) and not lines[i].strip():
                i += 1
            # Next non-empty line should be headline (job title)
            if i < len(lines):
                headline_line = lines[i].strip()
                # Make sure it's not noise or the start of another person entry
                # NOTE: Don't filter by is_likely_name_line - headlines can look like names!
                if (not is_noise_line(headline_line)
                        and not parse_degree(headline_line)
                        and not is_action_button(headline_line)
                        and not is_mutual_connections_line(headline_line)
                        and not is_follower_count(headline_line)
                        and headline_line not in ('-', '.')):  # Skip placeholder headlines
                    member['headline'] = headline_line
                    i += 1
            # Skip to mutual connections or next entry
            while i < len(lines):
                check_line = lines[i].strip()
                # Capture mutual connections info
                if is_mutual_connections_line(check_line):
                    member['mutual_connections'] = check_line
                    i += 1
                    continue
                # Stop if we find a degree pattern (next staff member) OR LinkedIn Member
                if parse_degree(check_line) or check_line == 'LinkedIn Member':
                    break
                # Skip action buttons and noise
                i += 1
            # Process heritage relevance from headline
            headline = member.get('headline', '')
            if headline:
                is_relevant, heritage_type = detect_heritage_type(headline)
                member['heritage_relevant'] = is_relevant
                if heritage_type:
                    member['heritage_type'] = heritage_type
            else:
                member['heritage_relevant'] = False
            staff.append(member)
            seen_names.add(name)
        else:
            i += 1

    return staff, custodian_metadata


def compute_staff_analysis(staff: list[dict]) -> dict:
    """Compute analysis statistics for staff members."""
    total = len(staff)
    heritage_relevant = [s for s in staff if s.get('heritage_relevant', False)]
    heritage_count = len(heritage_relevant)
    # Count by heritage type
    type_counts: Counter[str] = Counter()
    for s in heritage_relevant:
        ht = s.get('heritage_type')
        if ht:
            type_counts[ht] += 1
    # Count by degree
    degree_counts: Counter[str] = Counter()
    for s in staff:
        degree_counts[s.get('degree', 'unknown')] += 1
    # Count by name type
    name_type_counts: Counter[str] = Counter()
    for s in staff:
        name_type_counts[s.get('name_type', 'unknown')] += 1
    # Common job titles/roles
    role_counts: Counter[str] = Counter()
    for s in staff:
        headline = s.get('headline', '')
        if headline:
            # Extract key role words
            role_keywords = [
                'directeur', 'director', 'manager', 'coordinator', 'coördinator',
                'adviseur', 'advisor', 'medewerker', 'specialist',
                'archivist', 'archivaris', 'historicus', 'historian',
                'curator', 'conservator', 'beheerder', 'onderzoeker',
                'researcher', 'projectleider'
            ]
            for keyword in role_keywords:
                if keyword.lower() in headline.lower():
                    role_counts[keyword.title()] += 1
    return {
        'total_staff_extracted': total,
        'heritage_relevant_count': heritage_count,
        'heritage_relevant_percentage': round(heritage_count / total * 100, 1) if total > 0 else 0,
        'staff_by_heritage_type': dict(type_counts),
        'staff_by_degree': dict(degree_counts),
        'staff_by_name_type': dict(name_type_counts),
        'common_roles': dict(role_counts.most_common(10)),
    }


def create_output(
    staff: list[dict],
    custodian_metadata: dict,
    custodian_name: str,
    custodian_slug: str,
    input_file: Path,
) -> dict:
    """Create the full output JSON structure.

    The scrape timestamp is recovered from the input filename when it carries
    a compact timestamp (YYYYMMDDTHHMM or YYYYMMDDTHHMMSS); otherwise the
    current UTC time is used.
    """
    analysis = compute_staff_analysis(staff)
    # Extract timestamp from filename
    timestamp_match = re.search(r'(\d{8}T\d{4,6})', input_file.name)
    if timestamp_match:
        ts = timestamp_match.group(1)
        if len(ts) == 13:  # 20251210T0055 format
            scraped_ts = f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:00Z"
        elif len(ts) == 15:  # 20251210T005500 format
            scraped_ts = f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}T{ts[9:11]}:{ts[11:13]}:{ts[13:15]}Z"
        else:
            scraped_ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    else:
        scraped_ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    output = {
        'custodian_metadata': {
            'custodian_name': custodian_name,
            'custodian_slug': custodian_slug,
            **custodian_metadata,
        },
        'source_metadata': {
            'source_type': 'linkedin_company_people_page',
            'registered_timestamp': scraped_ts,
            'registration_method': 'manual_linkedin_browse',
            'staff_extracted': len(staff),
            'notes': f"Staff extracted from LinkedIn company People page. Raw register in {input_file.name}"
        },
        'staff': staff,
        'staff_analysis': analysis,
        'provenance': {
            'data_source': 'LINKEDIN_MANUAL_REGISTER',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': scraped_ts,
            'extraction_method': 'manual_browse_copy_paste',
            'raw_source_file': input_file.name,
            'processed_by': 'parse_custodian_staff.py',
            'processing_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        }
    }
    return output


def main():
    """CLI entry point: parse one raw staff file and write/print results."""
    parser = argparse.ArgumentParser(
        description='Parse LinkedIn company staff pages from raw manual register files.'
    )
    parser.add_argument('input_file', type=Path, help='Input raw text file')
    parser.add_argument('output_file', type=Path, help='Output JSON file')
    parser.add_argument('--custodian-name', required=True,
                        help='Name of the custodian organization')
    parser.add_argument('--custodian-slug', required=True,
                        help='Slug for generating staff IDs')
    parser.add_argument('--dry-run', action='store_true',
                        help='Parse but do not write output')
    args = parser.parse_args()

    if not args.input_file.exists():
        print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing staff from: {args.input_file}")
    staff, custodian_metadata = parse_staff_file(
        args.input_file, args.custodian_name, args.custodian_slug
    )
    print(f"Extracted {len(staff)} unique staff members")

    if custodian_metadata:
        print(f"\nCustodian Metadata:")
        for key, value in custodian_metadata.items():
            print(f" {key}: {value}")

    output = create_output(
        staff, custodian_metadata, args.custodian_name,
        args.custodian_slug, args.input_file,
    )

    analysis = output['staff_analysis']
    print(f"\nStaff Analysis:")
    print(f" Total staff: {analysis['total_staff_extracted']}")
    print(f" Heritage-relevant: {analysis['heritage_relevant_count']} ({analysis['heritage_relevant_percentage']}%)")
    print(f" By type: {analysis['staff_by_heritage_type']}")
    print(f" By degree: {analysis['staff_by_degree']}")
    print(f" By name type: {analysis['staff_by_name_type']}")
    if analysis['common_roles']:
        print(f" Common roles:")
        for role, count in list(analysis['common_roles'].items())[:5]:
            print(f" - {role}: {count}")

    if args.dry_run:
        print("\n[Dry run - not writing output]")
        print("\nSample staff (first 5):")
        for s in staff[:5]:
            print(f" - {s['name']} ({s['degree']})")
            print(f" Headline: {s.get('headline', 'N/A')[:60]}")
            print(f" Heritage: {s.get('heritage_relevant', False)} ({s.get('heritage_type', '-')})")
    else:
        args.output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
        print(f"\nWrote output to: {args.output_file}")


if __name__ == '__main__':
    main()