#!/usr/bin/env python3
|
|
"""
|
|
Extract complete LinkedIn staff data from saved company People page HTML files.
|
|
|
|
This script parses saved HTML files to extract complete staff profiles including:
|
|
- Name
|
|
- LinkedIn profile URL
|
|
- Headline/job title
|
|
- Connection degree
|
|
- Mutual connections
|
|
|
|
This replaces the need for MD file parsing - HTML contains ALL the data.
|
|
|
|
Usage:
|
|
python scripts/parse_linkedin_html.py <html_file> \
|
|
--custodian-name "Name" --custodian-slug "slug" \
|
|
--output staff.json
|
|
|
|
Example:
|
|
python scripts/parse_linkedin_html.py \
|
|
"data/custodian/person/manual_hc/Rijksmuseum_ People _ LinkedIn.html" \
|
|
--custodian-name "Rijksmuseum" \
|
|
--custodian-slug "rijksmuseum" \
|
|
--output data/custodian/person/rijksmuseum_staff.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import unicodedata
|
|
from collections import Counter
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from html.parser import HTMLParser
|
|
from urllib.parse import unquote
|
|
|
|
|
|
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
# Maps a single-letter heritage-type code to substrings matched (case-insensitively)
# against a profile headline in detect_heritage_type().
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

# Role keywords that disqualify a headline from heritage classification.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions
# These should never be classified as heritage-relevant
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - organizations that ARE heritage institutions
# Used to validate that 'D' (Digital) roles are actually at heritage orgs
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]

# LinkedIn status phrases that pollute name fields (extracted from img alt text)
# These should be removed from names and stored as metadata
LINKEDIN_STATUS_PHRASES = [
    ' is open to work',
    ' is hiring',
    ' is looking for new opportunities',
    ' is looking for opportunities',
    ' is actively looking',
    ' is available for work',
    ' open to work',
    ' - open to work',
    ' • Open to work',
    ' - Hiring',
    ' • Hiring',
]

# Known compound slugs that cannot be parsed by simple hyphen splitting
# These are manually verified name mappings
KNOWN_COMPOUND_SLUGS = {
    'jponjee': 'J. Ponjee',
    'sharellyemanuelson': 'Sharelly Emanuelson',
    'addieroelofsen': 'Addie Roelofsen',
    'adheliap': 'Adhelia P.',
    'anejanboomsma': 'Anejan Boomsma',
    'fredericlogghe': 'Frederic Logghe',
    'dirkjanheinen': 'Dirkjan Heinen',
}

# Dutch name particles that should remain lowercase when not at start of name
DUTCH_NAME_PARTICLES = {'van', 'de', 'den', 'der', 'het', 't', "'t"}
|
|
|
|
|
|
def clean_linkedin_status_from_name(name: str) -> tuple[str, str | None]:
    """
    Strip a LinkedIn availability phrase from a raw name string.

    LinkedIn appends phrases like "is open to work" to the alt text of
    profile photos; this separates the actual name from that marker.

    Args:
        name: Raw name possibly containing a LinkedIn status phrase

    Returns:
        Tuple of (clean_name, linkedin_status or None)

    Examples:
        "John Doe is open to work" -> ("John Doe", "open_to_work")
        "Jane Smith is hiring" -> ("Jane Smith", "hiring")
        "Bob Jones" -> ("Bob Jones", None)
    """
    if not name:
        return (name, None)

    lowered = name.lower()

    for phrase in LINKEDIN_STATUS_PHRASES:
        needle = phrase.lower()
        pos = lowered.find(needle)
        if pos == -1:
            continue

        # Everything before the phrase is the actual name.
        stripped = name[:pos].strip()

        # Map the matched phrase onto a normalized status value.
        if 'hiring' in needle:
            kind = 'hiring'
        elif any(tag in needle for tag in ('open to work', 'looking', 'available')):
            kind = 'open_to_work'
        else:
            kind = 'active'

        return (stripped, kind)

    return (name, None)
|
|
|
|
|
|
def slug_to_name(slug: str) -> tuple[str, bool]:
    """
    Derive a human-readable name from a LinkedIn profile slug.

    Used when the name extracted from HTML does not match the slug
    (e.g. name contamination: the logged-in user's name shows up for
    privacy-restricted profiles).

    Args:
        slug: LinkedIn profile slug (e.g. 'jan-van-der-berg-abc123')

    Returns:
        Tuple of (derived_name, is_reliable)
        - is_reliable is True when the slug had clear hyphen-separated parts
        - is_reliable is False for compound slugs without hyphens

    Examples:
        'jan-van-der-berg-abc123' -> ('Jan van der Berg', True)
        'charlotte-van-beek-55370314' -> ('Charlotte van Beek', True)
        'jponjee' -> ('J. Ponjee', True)   # Known compound slug
        'unknownslug' -> ('Unknown', False)  # Cannot parse
    """
    # Percent-decode first (handles %20 etc.).
    decoded = unquote(slug)

    # Manually curated compound slugs take precedence.
    known = KNOWN_COMPOUND_SLUGS.get(decoded)
    if known is not None:
        return (known, True)

    # Without hyphens there is no reliable way to split the parts.
    if '-' not in decoded:
        return ("Unknown", False)

    # Drop the trailing profile ID: 6+ hex chars or 5+ digits after a separator.
    trimmed = re.sub(r'[-_][\da-f]{6,}$', '', decoded)
    trimmed = re.sub(r'[-_]\d{5,}$', '', trimmed)

    pieces = [piece for piece in trimmed.split('-') if piece]
    if not pieces:
        return ("Unknown", False)

    # Title-case each piece; Dutch particles (van, de, der, ...) stay
    # lowercase unless they open the name.
    words = [
        piece.lower() if (index > 0 and piece.lower() in DUTCH_NAME_PARTICLES)
        else piece.capitalize()
        for index, piece in enumerate(pieces)
    ]
    return (' '.join(words), True)
|
|
|
|
|
|
def name_matches_slug(name: str, slug: str) -> bool:
    """
    Decide whether an extracted name plausibly belongs to a LinkedIn slug.

    Detects "name contamination": the logged-in user's name being extracted
    from the HTML instead of the actual profile owner's name.

    Args:
        name: Extracted name from HTML (e.g. 'Simon Kemper')
        slug: LinkedIn profile slug (e.g. 'jan-van-der-berg-abc123')

    Returns:
        True when the name appears to match the slug,
        False otherwise (possible contamination).

    Examples:
        name_matches_slug('Jan van der Berg', 'jan-van-der-berg-abc123') -> True
        name_matches_slug('Simon Kemper', 'jan-van-der-berg-abc123') -> False
    """
    if not name or not slug:
        return False

    # "LinkedIn Member" is the legitimate name of an anonymous profile.
    if name == 'LinkedIn Member':
        return True

    # Percent-decode and lowercase, then strip the trailing numeric/hex ID.
    decoded = unquote(slug).lower()
    trimmed = re.sub(r'[-_][\da-f]{6,}$', '', decoded)
    trimmed = re.sub(r'[-_]\d{5,}$', '', trimmed)

    # Normalize the name the same way slugs are built:
    # drop periods/apostrophes/backticks, collapse whitespace into hyphens.
    normalized = re.sub(r"[.'`]", '', name.lower())
    normalized = re.sub(r'\s+', '-', normalized)

    # Keep only meaningful tokens (2+ chars).
    tokens = [tok for tok in normalized.split('-') if len(tok) >= 2]
    if not tokens:
        return False

    # Primary contamination check: the first name must occur in the slug.
    return tokens[0] in trimmed
|
|
|
|
|
|
class LinkedInProfileCardParser(HTMLParser):
    """
    Parse LinkedIn profile cards from saved HTML.

    Each profile card has structure:
    - org-people-profile-card__profile-image-N (contains img with alt=name, href=profile_url)
    - artdeco-entity-lockup__title (contains name text and profile link)
    - artdeco-entity-lockup__badge (contains connection degree)
    - artdeco-entity-lockup__subtitle (contains headline)
    - Mutual connections text

    Anonymous "LinkedIn Member" profiles have a different structure:
    - org-people-profile-card__profile-image-N is on an <img> tag (NOT an <a> tag)
    - No href link (privacy-protected)
    - Name appears as "LinkedIn Member" in the title
    - Still have subtitle (headline) content

    NOTE: The "People you may know" h2 header in LinkedIn company pages is actually
    the section title for the associated members list, NOT a separate recommendations
    section. All profile cards under this header are real associated members.
    """

    def __init__(self):
        super().__init__()
        # Completed profile dicts, in document order.
        self.profiles: list[dict] = []
        # Fields accumulated for the card currently being parsed.
        self.current_profile: dict = {}

        # State tracking: which lockup section the parser is currently inside.
        self.in_profile_card = False
        self.in_title = False
        self.in_subtitle = False
        self.in_badge = False
        self.in_caption = False
        self.in_mutual = False

        # Text accumulated for the currently-open section.
        self.current_text = ""
        # Index parsed from the profile-image-N id; -1 before the first card.
        self.card_index = -1

        # For custodian metadata extraction: everything before the first
        # profile card counts as "header" text.
        self.custodian_metadata: dict = {}
        self.in_header = True
        self.header_texts: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Track section entry and pull name/URL data off card tags."""
        attrs_dict = dict(attrs)
        attr_id = attrs_dict.get('id') or ''
        attr_class = attrs_dict.get('class') or ''

        # Detect profile card start - can be on <a> tag (regular) OR <img> tag (anonymous)
        if 'org-people-profile-card__profile-image' in attr_id:
            self.in_profile_card = True
            self.in_header = False
            match = re.search(r'profile-image-(\d+)', attr_id)
            if match:
                new_index = int(match.group(1))
                # A new card index means the previous card is complete.
                if new_index != self.card_index:
                    # Save previous profile if exists
                    if self.current_profile.get('name'):
                        self.profiles.append(self.current_profile)
                    self.current_profile = {}
                    self.card_index = new_index

            # Extract URL from href (only on <a> tags - regular profiles)
            href = attrs_dict.get('href', '')
            if href and 'linkedin.com/in/' in href:
                slug = self._extract_slug(href)
                if slug:
                    self.current_profile['linkedin_slug'] = slug
                    self.current_profile['linkedin_profile_url'] = f"https://www.linkedin.com/in/{slug}"

            # If this is an <img> tag with the profile-image ID, it's likely an anonymous member
            # We'll capture this and the name will come from the title section as "LinkedIn Member"
            if tag == 'img':
                # Mark as potential anonymous (will be confirmed when we see "LinkedIn Member" in title)
                self.current_profile['_may_be_anonymous'] = True

        # Extract name from img alt (for regular profiles with named photos)
        if tag == 'img' and self.in_profile_card:
            alt = attrs_dict.get('alt', '')
            if alt and alt not in ('', 'photo', 'Profile photo'):
                # Clean LinkedIn status phrases from name
                clean_name, linkedin_status = clean_linkedin_status_from_name(alt)
                self.current_profile['name'] = clean_name
                if linkedin_status:
                    self.current_profile['linkedin_status'] = linkedin_status

        # Title section (contains name link or "LinkedIn Member" text)
        if 'artdeco-entity-lockup__title' in attr_class:
            self.in_title = True
            self.current_text = ""

        # Badge section (contains degree)
        if 'artdeco-entity-lockup__badge' in attr_class:
            self.in_badge = True
            self.current_text = ""

        # Subtitle section (contains headline)
        if 'artdeco-entity-lockup__subtitle' in attr_class:
            self.in_subtitle = True
            self.current_text = ""

        # Caption/mutual connections
        if 'artdeco-entity-lockup__caption' in attr_class or 'mutual' in attr_class.lower():
            self.in_mutual = True
            self.current_text = ""

        # Check for mutual connections in span
        if tag == 'span' and 'mutual' in attr_class.lower():
            self.in_mutual = True
            self.current_text = ""

    def handle_data(self, data: str) -> None:
        """Accumulate text for whichever section is currently open."""
        text = data.strip()
        if not text:
            return

        # Collect header texts for metadata
        if self.in_header:
            self.header_texts.append(text)

        if self.in_title:
            self.current_text += " " + text
        elif self.in_badge:
            self.current_text += " " + text
        elif self.in_subtitle:
            self.current_text += " " + text
        elif self.in_mutual:
            self.current_text += " " + text

    def handle_endtag(self, tag: str) -> None:
        """Close open sections and commit accumulated text to the profile."""
        if tag == 'div':
            if self.in_title:
                text = self.current_text.strip()
                # The img alt (if present) takes precedence over the title text.
                if text and 'name' not in self.current_profile:
                    # Clean up name
                    text = re.sub(r'\s+', ' ', text)
                    if len(text) > 1 and not text.startswith('View '):
                        # Clean LinkedIn status phrases from name
                        clean_name, linkedin_status = clean_linkedin_status_from_name(text)
                        self.current_profile['name'] = clean_name
                        if linkedin_status and 'linkedin_status' not in self.current_profile:
                            self.current_profile['linkedin_status'] = linkedin_status
                        # Check if this is "LinkedIn Member" (anonymous profile)
                        if clean_name == 'LinkedIn Member':
                            self.current_profile['is_anonymous'] = True
                self.in_title = False
                self.current_text = ""

            if self.in_badge:
                text = self.current_text.strip()
                degree = self._parse_degree(text)
                if degree:
                    self.current_profile['degree'] = degree
                self.in_badge = False
                self.current_text = ""

            if self.in_subtitle:
                text = self.current_text.strip()
                if text and len(text) > 2:
                    # Clean up headline
                    text = re.sub(r'\s+', ' ', text)
                    self.current_profile['headline'] = text
                self.in_subtitle = False
                self.current_text = ""

        # Mutual connections are committed on the closing span only.
        if tag == 'span' and self.in_mutual:
            text = self.current_text.strip()
            if text and 'mutual' in text.lower():
                self.current_profile['mutual_connections'] = text
            self.in_mutual = False
            self.current_text = ""

    def _extract_slug(self, url: str) -> Optional[str]:
        """Extract profile slug from URL."""
        match = re.search(r'linkedin\.com/in/([^?/]+)', url)
        if match:
            return match.group(1)
        return None

    def _parse_degree(self, text: str) -> Optional[str]:
        """Parse connection degree from text."""
        if '1st' in text:
            return '1st'
        if '2nd' in text:
            return '2nd'
        if '3rd' in text:
            # Collapse everything at 3rd degree or further into '3rd+'.
            return '3rd+'
        return None

    def finalize(self) -> list[dict]:
        """Finalize parsing and return all profiles."""
        # Save last profile
        if self.current_profile.get('name'):
            self.profiles.append(self.current_profile)

        # Parse custodian metadata from header
        self._parse_header_metadata()

        # Validate and fix names that may be contaminated
        self._validate_and_fix_names()

        return self.profiles

    def _validate_and_fix_names(self) -> None:
        """
        Validate extracted names against LinkedIn slugs and fix contamination.

        Name contamination occurs when saving LinkedIn HTML while logged in:
        privacy-restricted profiles may show the logged-in user's name/photo
        instead of the actual profile owner's info.

        Detection: If extracted name doesn't match the slug, it's likely contaminated.
        Fix: Derive the correct name from the slug.
        """
        for profile in self.profiles:
            name = profile.get('name', '')
            slug = profile.get('linkedin_slug', '')

            # Skip anonymous profiles (no slug)
            if not slug:
                continue

            # Skip "LinkedIn Member" - valid anonymous name
            if name == 'LinkedIn Member':
                continue

            # Check if the extracted name matches the slug
            if not name_matches_slug(name, slug):
                # Name contamination detected - derive correct name from slug
                derived_name, is_reliable = slug_to_name(slug)

                if is_reliable and derived_name != "Unknown":
                    # Record the correction
                    profile['_original_contaminated_name'] = name
                    profile['name'] = derived_name
                    profile['_name_derived_from_slug'] = True
                else:
                    # Could not reliably derive name - mark for review
                    profile['_name_may_be_contaminated'] = True
                    profile['_original_name'] = name

    def _parse_header_metadata(self) -> None:
        """Extract custodian metadata from header texts."""
        for text in self.header_texts:
            # Skip JSON blobs and very long texts (data artifacts)
            if text.startswith('{') or len(text) > 200:
                continue

            # Follower count
            match = re.match(r'^([\d,\.]+K?)\s*followers?$', text, re.IGNORECASE)
            if match:
                self.custodian_metadata['follower_count'] = match.group(1)
                continue

            # Employee count
            match = re.match(r'^([\d,\-]+)\s*employees?$', text, re.IGNORECASE)
            if match:
                self.custodian_metadata['employee_count'] = match.group(1)
                continue

            # Associated members
            match = re.match(r'^(\d+)\s*associated\s+members?$', text, re.IGNORECASE)
            if match:
                self.custodian_metadata['associated_members'] = int(match.group(1))
                continue

            # Industry - must be a clean standalone text, not embedded in JSON
            industry_keywords = ['Museums', 'Archives', 'Libraries', 'Historical Sites', 'Heritage', 'Zoos']
            if any(kw.lower() in text.lower() for kw in industry_keywords):
                # Ensure it's a clean industry text (not JSON or HTML)
                if not text.startswith('{') and not '<' in text and len(text) < 100:
                    if 'industry' not in self.custodian_metadata:
                        self.custodian_metadata['industry'] = text.strip()
                        continue

            # Location (City, Region) - first match wins
            match = re.match(r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', text)
            if match and 'location' not in self.custodian_metadata:
                self.custodian_metadata['location'] = {
                    'city': match.group(1),
                    'region': match.group(2)
                }
|
|
|
|
|
|
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """
    Detect whether a headline is heritage-relevant and, if so, which type.

    Two-stage classification:
    1. Check if organization is explicitly non-heritage (blocklist)
    2. Check if role/organization matches heritage patterns

    For 'D' (Digital) type, require BOTH a tech role AND a heritage organization.
    This prevents generic IT workers at banks/police from being classified as heritage.

    Returns:
        (is_heritage_relevant, type_code or None)
    """
    if not headline:
        return (False, None)

    text = headline.lower()

    # Stage 1: blocklisted organizations are never heritage-relevant.
    if any(org.lower() in text for org in NON_HERITAGE_ORGANIZATIONS):
        return (False, None)

    # Stage 2: blocklisted role keywords.
    if any(kw.lower() in text for kw in NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Stage 3: does the headline mention a heritage organization at all?
    at_heritage_org = any(
        org_kw.lower() in text for org_kw in HERITAGE_ORGANIZATION_KEYWORDS
    )

    # Typed keywords, most specific first. 'D' (Digital) is deliberately
    # excluded here and handled below with the heritage-org requirement.
    for code in ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'):
        if any(kw.lower() in text for kw in HERITAGE_KEYWORDS.get(code, [])):
            return (True, code)

    # 'D' (Digital) only counts when the person works at a heritage org.
    if at_heritage_org and any(
            kw.lower() in text for kw in HERITAGE_KEYWORDS.get('D', [])):
        return (True, 'D')

    # Generic heritage terms (relevant, but no specific type).
    generic_terms = (
        'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
        'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection',
    )
    if any(term in text for term in generic_terms):
        return (True, None)

    return (False, None)
|
|
|
|
|
|
def is_abbreviated_name(name: str) -> bool:
    """Return True if any part of *name* looks like an initial (e.g. 'J.' in 'J. Ponjee')."""
    for token in name.split():
        bare = token.rstrip('.')
        # Single letter, with or without a trailing period ('J' or 'J.').
        if bare.isalpha() and len(bare) <= 1:
            return True
        # Short dotted forms such as 'A.' (at most one char before the dot).
        if token.endswith('.') and len(token) <= 2:
            return True
    return False
|
|
|
|
|
|
def generate_staff_id(name: str, index: int, custodian_slug: str) -> str:
    """
    Generate a unique, ASCII-safe staff ID.

    Args:
        name: Person's display name (may contain diacritics/Unicode).
        index: Sequential position of this entry in the staff list.
        custodian_slug: Slug of the owning organization.

    Returns:
        ID of the form '<custodian_slug>_staff_<index:04d>_<name_slug>'.
    """
    # Strip diacritics: NFD-decompose, then drop combining marks ('Mn').
    normalized = unicodedata.normalize('NFD', name.lower())
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Collapse everything non-alphanumeric into single underscores.
    name_slug = re.sub(r'[^a-z0-9]+', '_', ascii_name)
    name_slug = re.sub(r'_+', '_', name_slug).strip('_')
    if len(name_slug) > 30:
        name_slug = name_slug[:30].rstrip('_')
    if not name_slug:
        # Fix: a name with no ASCII alphanumerics previously produced an ID
        # with a dangling trailing underscore ('..._staff_0001_').
        name_slug = 'unknown'
    return f"{custodian_slug}_staff_{index:04d}_{name_slug}"
|
|
|
|
|
|
def _classify_headline(headline: str, custodian_name: str) -> tuple[bool, Optional[str]]:
    """Classify heritage relevance of a headline; default to 'M' when no headline exists."""
    is_heritage, heritage_type = detect_heritage_type(headline)
    if not headline and custodian_name:
        # No headline to judge by: the person is listed on the custodian's
        # People page, so assume museum-type relevance.
        is_heritage = True
        heritage_type = 'M'
    return (is_heritage, heritage_type)


def _base_staff_entry(profile: dict, display_name: str, name_type: str,
                      index: int, custodian_name: str, custodian_slug: str) -> dict:
    """Build the fields common to every staff entry (anonymous, slug-less, regular)."""
    headline = profile.get('headline', '')
    is_heritage, heritage_type = _classify_headline(headline, custodian_name)
    return {
        'staff_id': generate_staff_id(display_name, index, custodian_slug),
        'name': display_name,
        'name_type': name_type,
        'degree': profile.get('degree', 'unknown'),
        'headline': headline,
        'mutual_connections': profile.get('mutual_connections', ''),
        'heritage_relevant': is_heritage,
        'heritage_type': heritage_type,
    }


def parse_html_file(filepath: Path, custodian_name: str, custodian_slug: str) -> dict[str, Any]:
    """
    Parse LinkedIn company People page HTML and extract all staff data.

    Handles:
    - Duplicate profile merging (same person with multiple LinkedIn accounts)
    - Anonymous "LinkedIn Member" entries (each counted separately)

    Args:
        filepath: Path to the saved People-page HTML file.
        custodian_name: Display name of the custodian organization.
        custodian_slug: Slug used when generating staff IDs.

    Returns:
        Complete staff JSON structure (custodian/source metadata, staff list,
        and aggregate analysis).
    """
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        html_content = f.read()

    # Parse HTML; a parser crash partway through still leaves usable profiles.
    parser = LinkedInProfileCardParser()
    try:
        parser.feed(html_content)
    except Exception as e:
        print(f"Warning: HTML parsing error: {e}", file=sys.stderr)

    raw_profiles = parser.finalize()
    custodian_metadata = parser.custodian_metadata

    # First pass: Group profiles by LinkedIn SLUG to detect duplicates.
    # The same profile may appear multiple times on a page (LinkedIn UI quirk).
    # We merge by slug, NOT by name, because different people can share a name.
    # BUT: do NOT merge "LinkedIn Member" (anonymous) - each is unique.
    slug_to_profiles: dict[str, list[dict]] = {}

    for profile in raw_profiles:
        name = profile.get('name', '').strip()
        slug = profile.get('linkedin_slug', '')
        is_anonymous = profile.get('is_anonymous', False) or name == 'LinkedIn Member'

        if not name:
            continue

        if is_anonymous:
            # Each anonymous profile gets a unique key (cannot deduplicate without slug).
            slug_to_profiles[f"_anonymous_{len(slug_to_profiles)}"] = [profile]
        elif slug:
            # Same slug = same person appearing multiple times.
            slug_to_profiles.setdefault(slug, []).append(profile)
        else:
            # No slug (shouldn't happen for non-anonymous) - use a unique key.
            slug_to_profiles[f"_no_slug_{len(slug_to_profiles)}"] = [profile]

    # Second pass: build staff list with merged duplicates.
    staff: list[dict] = []
    anonymous_count = 0
    duplicate_profiles_count = 0

    for slug_key, profiles in slug_to_profiles.items():
        primary = profiles[0]

        if slug_key.startswith('_anonymous_'):
            anonymous_count += 1
            display_name = f"LinkedIn Member #{anonymous_count}"
            staff.append(_base_staff_entry(
                primary, display_name, 'anonymous',
                len(staff), custodian_name, custodian_slug))
            continue

        if slug_key.startswith('_no_slug_'):
            # Profile without slug (rare edge case).
            name = primary.get('name', 'Unknown')
            name_type = 'abbreviated' if is_abbreviated_name(name) else 'full'
            staff.append(_base_staff_entry(
                primary, name, name_type,
                len(staff), custodian_name, custodian_slug))
            continue

        # Regular profile with slug - may have duplicates to merge.
        name = primary.get('name', slug_key)

        if primary.get('_name_derived_from_slug'):
            name_type = 'derived_from_slug'
        elif is_abbreviated_name(name):
            name_type = 'abbreviated'
        else:
            name_type = 'full'

        staff_entry = _base_staff_entry(
            primary, name, name_type,
            len(staff), custodian_name, custodian_slug)

        # Add primary LinkedIn URL.
        if primary.get('linkedin_profile_url'):
            staff_entry['linkedin_profile_url'] = primary['linkedin_profile_url']
            staff_entry['linkedin_slug'] = primary['linkedin_slug']

        # Add name correction metadata if the name was derived from the slug.
        if primary.get('_name_derived_from_slug'):
            staff_entry['name_correction'] = {
                'original_contaminated_name': primary.get('_original_contaminated_name', ''),
                'derived_from_slug': True,
                'correction_method': 'slug_to_name',
            }
        elif primary.get('_name_may_be_contaminated'):
            staff_entry['name_correction'] = {
                'may_be_contaminated': True,
                'original_name': primary.get('_original_name', name),
                'note': 'Name could not be reliably derived from slug - manual review needed',
            }

        # Same profile appearing multiple times counts as merged duplicates.
        if len(profiles) > 1:
            duplicate_profiles_count += len(profiles) - 1

        staff.append(staff_entry)

    # Build final output structure.
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # NOTE(review): '_pymk_cards_filtered' is never written by the parser, so
    # this is currently always 0 - kept for output-schema compatibility.
    pymk_filtered = custodian_metadata.get('_pymk_cards_filtered', 0)

    result = {
        'custodian_metadata': {
            'custodian_name': custodian_name,
            'custodian_slug': custodian_slug,
            'name': custodian_metadata.get('name', custodian_name),
            'industry': custodian_metadata.get('industry', ''),
            'location': custodian_metadata.get('location', {}),
            'follower_count': custodian_metadata.get('follower_count', ''),
            'associated_members': custodian_metadata.get('associated_members', 0),
        },
        'source_metadata': {
            'source_type': 'linkedin_company_people_page_html',
            'source_file': str(filepath.name),
            'registered_timestamp': timestamp,
            'registration_method': 'html_parsing',
            'staff_extracted': len(staff),
            'pymk_cards_filtered': pymk_filtered,
            'duplicate_profiles_merged': duplicate_profiles_count,
        },
        'staff': staff,
        'staff_analysis': {
            'total_staff_extracted': len(staff),
            'with_linkedin_url': sum(1 for s in staff if 'linkedin_profile_url' in s),
            'with_alternate_profiles': sum(1 for s in staff if 'alternate_profiles' in s),
            'anonymous_members': anonymous_count,
            'heritage_relevant_count': sum(1 for s in staff if s.get('heritage_relevant')),
            'staff_by_heritage_type': dict(Counter(
                s.get('heritage_type') for s in staff if s.get('heritage_type')
            )),
            'names_derived_from_slug': sum(
                1 for s in staff
                if s.get('name_correction', {}).get('derived_from_slug')
            ),
            'names_possibly_contaminated': sum(
                1 for s in staff
                if s.get('name_correction', {}).get('may_be_contaminated')
            ),
        }
    }

    return result
|
|
|
|
|
|
def main():
    """CLI entry point: parse a saved People page, print a summary, emit JSON."""
    arg_parser = argparse.ArgumentParser(
        description='Parse LinkedIn company People page HTML to extract staff data'
    )
    arg_parser.add_argument('html_file', type=Path, help='Path to saved HTML file')
    arg_parser.add_argument('--custodian-name', required=True, help='Name of the custodian organization')
    arg_parser.add_argument('--custodian-slug', required=True, help='Slug for staff ID generation')
    arg_parser.add_argument('--output', '-o', type=Path, help='Output JSON file path')
    args = arg_parser.parse_args()

    if not args.html_file.exists():
        print(f"Error: HTML file not found: {args.html_file}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing: {args.html_file}")
    result = parse_html_file(args.html_file, args.custodian_name, args.custodian_slug)

    analysis = result['staff_analysis']
    source_meta = result['source_metadata']

    # Extraction summary.
    print("\nExtraction Results:")
    print(f" Total staff: {analysis['total_staff_extracted']}")
    print(f" With LinkedIn URL: {analysis['with_linkedin_url']}")
    print(f" With alternate profiles: {analysis['with_alternate_profiles']}")
    print(f" Anonymous members: {analysis['anonymous_members']}")
    print(f" Heritage-relevant: {analysis['heritage_relevant_count']}")

    # Filtering / merging statistics.
    pymk_filtered = source_meta.get('pymk_cards_filtered', 0)
    duplicates_merged = source_meta.get('duplicate_profiles_merged', 0)
    if pymk_filtered > 0:
        print(f"\n 'People you may know' cards filtered: {pymk_filtered}")
    if duplicates_merged > 0:
        print(f" Duplicate profiles merged: {duplicates_merged}")

    # Name-correction statistics.
    names_derived = analysis.get('names_derived_from_slug', 0)
    names_contaminated = analysis.get('names_possibly_contaminated', 0)
    if names_derived > 0 or names_contaminated > 0:
        print("\n Name Corrections:")
        if names_derived > 0:
            print(f" Names derived from slug (contamination fixed): {names_derived}")
        if names_contaminated > 0:
            print(f" Names possibly contaminated (manual review needed): {names_contaminated}")

    # Compare against the member count LinkedIn reported, when available.
    expected = result['custodian_metadata'].get('associated_members', 0)
    if expected:
        extracted = analysis['total_staff_extracted']
        print(f"\n Expected (associated members): {expected}")
        print(f" Extracted: {extracted}")
        diff = extracted - expected
        if diff == 0:
            print(" Match: EXACT")
        elif diff > 0:
            print(f" Difference: +{diff} (more than expected)")
        else:
            print(f" Difference: {diff} (fewer than expected)")

    print(f"\n Heritage types: {analysis['staff_by_heritage_type']}")

    # Write to file when requested, otherwise dump JSON to stdout.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"\nSaved to: {args.output}")
    else:
        print(json.dumps(result, indent=2, ensure_ascii=False))

    return 0
|
|
|
|
|
|
# Script entry point: exit with main()'s return code.
if __name__ == '__main__':
    sys.exit(main())
|