- batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation
759 lines
31 KiB
Python
Executable file
759 lines
31 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
Extract complete LinkedIn staff data from saved company People page HTML files.

This script parses saved HTML files to extract complete staff profiles including:
- Name
- LinkedIn profile URL
- Headline/job title
- Connection degree
- Mutual connections

This replaces the need for Markdown (MD) file parsing - the HTML contains ALL the data.

Usage:
    python scripts/parse_linkedin_html.py <html_file> \
        --custodian-name "Name" --custodian-slug "slug" \
        --output staff.json

Example:
    python scripts/parse_linkedin_html.py \
        "data/custodian/person/manual_hc/Rijksmuseum_ People _ LinkedIn.html" \
        --custodian-name "Rijksmuseum" \
        --custodian-slug "rijksmuseum" \
        --output data/custodian/person/rijksmuseum_staff.json
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import unicodedata
|
|
from collections import Counter
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy.
#
# Maps a one-letter heritage type code to the headline keywords that indicate
# it. Keywords are compared case-insensitively as substrings of the headline,
# so Dutch compounds ("museumdocent") match their stem. A trailing space
# (e.g. 'KB ', 'IT ') is deliberate: it keeps short acronyms from matching
# inside longer words.
HERITAGE_KEYWORDS = {
    # Galleries
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    # Libraries
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    # Archives (including audiovisual archives)
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    # Museums
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    # Government bodies
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    # Research
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    # Corporate archives / company history
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    # Education
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    # Societies / associations
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    # Digital / technical roles
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}
|
|
|
|
# Role/sector keywords that mark a headline as NOT heritage-relevant.
# Compared case-insensitively as substrings of the headline; stems such as
# 'verpleeg' are intentional so that inflected forms match as well.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse',
]
|
|
|
|
# Organizations that are explicitly NOT heritage institutions.
# These should never be classified as heritage-relevant.
#
# Entries are lowercase and matched against the lowercased headline. A trailing
# space marks a short token that must not be matched as the beginning of a
# longer word (e.g. 'ing ' for the bank, 'ns ' for the railway company).
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    # NOTE: 'meta ' carries a trailing space on purpose - the bare token 'meta'
    # also matched "metadata", which is a core heritage job term.
    'google', 'microsoft', 'amazon', 'meta ', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]
|
|
|
|
# Heritage organization keywords - organizations that ARE heritage institutions.
# Used to validate that 'D' (Digital) roles are actually at heritage orgs.
# Entries are lowercase and compared as substrings of the lowercased headline.
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]
|
|
|
|
# LinkedIn status phrases that pollute name fields (extracted from img alt text)
|
|
# These should be removed from names and stored as metadata
|
|
LINKEDIN_STATUS_PHRASES = [
|
|
' is open to work',
|
|
' is hiring',
|
|
' is looking for new opportunities',
|
|
' is looking for opportunities',
|
|
' is actively looking',
|
|
' is available for work',
|
|
' open to work',
|
|
' - open to work',
|
|
' • Open to work',
|
|
' - Hiring',
|
|
' • Hiring',
|
|
]
|
|
|
|
|
|
def clean_linkedin_status_from_name(name: str) -> tuple[str, str | None]:
|
|
"""
|
|
Remove LinkedIn status phrases from name and return clean name + status.
|
|
|
|
Args:
|
|
name: Raw name possibly containing LinkedIn status
|
|
|
|
Returns:
|
|
Tuple of (clean_name, linkedin_status or None)
|
|
|
|
Examples:
|
|
"John Doe is open to work" -> ("John Doe", "open_to_work")
|
|
"Jane Smith is hiring" -> ("Jane Smith", "hiring")
|
|
"Bob Jones" -> ("Bob Jones", None)
|
|
"""
|
|
if not name:
|
|
return (name, None)
|
|
|
|
name_lower = name.lower()
|
|
|
|
for phrase in LINKEDIN_STATUS_PHRASES:
|
|
phrase_lower = phrase.lower()
|
|
if phrase_lower in name_lower:
|
|
# Find position and remove
|
|
idx = name_lower.find(phrase_lower)
|
|
clean_name = name[:idx].strip()
|
|
|
|
# Determine status type
|
|
if 'hiring' in phrase_lower:
|
|
status = 'hiring'
|
|
elif 'open to work' in phrase_lower or 'looking' in phrase_lower or 'available' in phrase_lower:
|
|
status = 'open_to_work'
|
|
else:
|
|
status = 'active'
|
|
|
|
return (clean_name, status)
|
|
|
|
return (name, None)
|
|
|
|
|
|
class LinkedInProfileCardParser(HTMLParser):
|
|
"""
|
|
Parse LinkedIn profile cards from saved HTML.
|
|
|
|
Each profile card has structure:
|
|
- org-people-profile-card__profile-image-N (contains img with alt=name, href=profile_url)
|
|
- artdeco-entity-lockup__title (contains name text and profile link)
|
|
- artdeco-entity-lockup__badge (contains connection degree)
|
|
- artdeco-entity-lockup__subtitle (contains headline)
|
|
- Mutual connections text
|
|
|
|
Anonymous "LinkedIn Member" profiles have a different structure:
|
|
- org-people-profile-card__profile-image-N is on an <img> tag (NOT an <a> tag)
|
|
- No href link (privacy-protected)
|
|
- Name appears as "LinkedIn Member" in the title
|
|
- Still have subtitle (headline) content
|
|
|
|
NOTE: The "People you may know" h2 header in LinkedIn company pages is actually
|
|
the section title for the associated members list, NOT a separate recommendations
|
|
section. All profile cards under this header are real associated members.
|
|
"""
|
|
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.profiles: list[dict] = []
|
|
self.current_profile: dict = {}
|
|
|
|
# State tracking
|
|
self.in_profile_card = False
|
|
self.in_title = False
|
|
self.in_subtitle = False
|
|
self.in_badge = False
|
|
self.in_caption = False
|
|
self.in_mutual = False
|
|
|
|
self.current_text = ""
|
|
self.card_index = -1
|
|
|
|
# For custodian metadata extraction
|
|
self.custodian_metadata: dict = {}
|
|
self.in_header = True
|
|
self.header_texts: list[str] = []
|
|
|
|
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
attrs_dict = dict(attrs)
|
|
attr_id = attrs_dict.get('id') or ''
|
|
attr_class = attrs_dict.get('class') or ''
|
|
|
|
# Detect profile card start - can be on <a> tag (regular) OR <img> tag (anonymous)
|
|
if 'org-people-profile-card__profile-image' in attr_id:
|
|
self.in_profile_card = True
|
|
self.in_header = False
|
|
match = re.search(r'profile-image-(\d+)', attr_id)
|
|
if match:
|
|
new_index = int(match.group(1))
|
|
if new_index != self.card_index:
|
|
# Save previous profile if exists
|
|
if self.current_profile.get('name'):
|
|
self.profiles.append(self.current_profile)
|
|
self.current_profile = {}
|
|
self.card_index = new_index
|
|
|
|
# Extract URL from href (only on <a> tags - regular profiles)
|
|
href = attrs_dict.get('href', '')
|
|
if href and 'linkedin.com/in/' in href:
|
|
slug = self._extract_slug(href)
|
|
if slug:
|
|
self.current_profile['linkedin_slug'] = slug
|
|
self.current_profile['linkedin_profile_url'] = f"https://www.linkedin.com/in/{slug}"
|
|
|
|
# If this is an <img> tag with the profile-image ID, it's likely an anonymous member
|
|
# We'll capture this and the name will come from the title section as "LinkedIn Member"
|
|
if tag == 'img':
|
|
# Mark as potential anonymous (will be confirmed when we see "LinkedIn Member" in title)
|
|
self.current_profile['_may_be_anonymous'] = True
|
|
|
|
# Extract name from img alt (for regular profiles with named photos)
|
|
if tag == 'img' and self.in_profile_card:
|
|
alt = attrs_dict.get('alt', '')
|
|
if alt and alt not in ('', 'photo', 'Profile photo'):
|
|
# Clean LinkedIn status phrases from name
|
|
clean_name, linkedin_status = clean_linkedin_status_from_name(alt)
|
|
self.current_profile['name'] = clean_name
|
|
if linkedin_status:
|
|
self.current_profile['linkedin_status'] = linkedin_status
|
|
|
|
# Title section (contains name link or "LinkedIn Member" text)
|
|
if 'artdeco-entity-lockup__title' in attr_class:
|
|
self.in_title = True
|
|
self.current_text = ""
|
|
|
|
# Badge section (contains degree)
|
|
if 'artdeco-entity-lockup__badge' in attr_class:
|
|
self.in_badge = True
|
|
self.current_text = ""
|
|
|
|
# Subtitle section (contains headline)
|
|
if 'artdeco-entity-lockup__subtitle' in attr_class:
|
|
self.in_subtitle = True
|
|
self.current_text = ""
|
|
|
|
# Caption/mutual connections
|
|
if 'artdeco-entity-lockup__caption' in attr_class or 'mutual' in attr_class.lower():
|
|
self.in_mutual = True
|
|
self.current_text = ""
|
|
|
|
# Check for mutual connections in span
|
|
if tag == 'span' and 'mutual' in attr_class.lower():
|
|
self.in_mutual = True
|
|
self.current_text = ""
|
|
|
|
def handle_data(self, data: str) -> None:
|
|
text = data.strip()
|
|
if not text:
|
|
return
|
|
|
|
# Collect header texts for metadata
|
|
if self.in_header:
|
|
self.header_texts.append(text)
|
|
|
|
if self.in_title:
|
|
self.current_text += " " + text
|
|
elif self.in_badge:
|
|
self.current_text += " " + text
|
|
elif self.in_subtitle:
|
|
self.current_text += " " + text
|
|
elif self.in_mutual:
|
|
self.current_text += " " + text
|
|
|
|
def handle_endtag(self, tag: str) -> None:
|
|
if tag == 'div':
|
|
if self.in_title:
|
|
text = self.current_text.strip()
|
|
if text and 'name' not in self.current_profile:
|
|
# Clean up name
|
|
text = re.sub(r'\s+', ' ', text)
|
|
if len(text) > 1 and not text.startswith('View '):
|
|
# Clean LinkedIn status phrases from name
|
|
clean_name, linkedin_status = clean_linkedin_status_from_name(text)
|
|
self.current_profile['name'] = clean_name
|
|
if linkedin_status and 'linkedin_status' not in self.current_profile:
|
|
self.current_profile['linkedin_status'] = linkedin_status
|
|
# Check if this is "LinkedIn Member" (anonymous profile)
|
|
if clean_name == 'LinkedIn Member':
|
|
self.current_profile['is_anonymous'] = True
|
|
self.in_title = False
|
|
self.current_text = ""
|
|
|
|
if self.in_badge:
|
|
text = self.current_text.strip()
|
|
degree = self._parse_degree(text)
|
|
if degree:
|
|
self.current_profile['degree'] = degree
|
|
self.in_badge = False
|
|
self.current_text = ""
|
|
|
|
if self.in_subtitle:
|
|
text = self.current_text.strip()
|
|
if text and len(text) > 2:
|
|
# Clean up headline
|
|
text = re.sub(r'\s+', ' ', text)
|
|
self.current_profile['headline'] = text
|
|
self.in_subtitle = False
|
|
self.current_text = ""
|
|
|
|
if tag == 'span' and self.in_mutual:
|
|
text = self.current_text.strip()
|
|
if text and 'mutual' in text.lower():
|
|
self.current_profile['mutual_connections'] = text
|
|
self.in_mutual = False
|
|
self.current_text = ""
|
|
|
|
def _extract_slug(self, url: str) -> Optional[str]:
|
|
"""Extract profile slug from URL."""
|
|
match = re.search(r'linkedin\.com/in/([^?/]+)', url)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
def _parse_degree(self, text: str) -> Optional[str]:
|
|
"""Parse connection degree from text."""
|
|
if '1st' in text:
|
|
return '1st'
|
|
if '2nd' in text:
|
|
return '2nd'
|
|
if '3rd' in text:
|
|
return '3rd+'
|
|
return None
|
|
|
|
def finalize(self) -> list[dict]:
|
|
"""Finalize parsing and return all profiles."""
|
|
# Save last profile
|
|
if self.current_profile.get('name'):
|
|
self.profiles.append(self.current_profile)
|
|
|
|
# Parse custodian metadata from header
|
|
self._parse_header_metadata()
|
|
|
|
return self.profiles
|
|
|
|
def _parse_header_metadata(self) -> None:
|
|
"""Extract custodian metadata from header texts."""
|
|
for text in self.header_texts:
|
|
# Skip JSON blobs and very long texts (data artifacts)
|
|
if text.startswith('{') or len(text) > 200:
|
|
continue
|
|
|
|
# Follower count
|
|
match = re.match(r'^([\d,\.]+K?)\s*followers?$', text, re.IGNORECASE)
|
|
if match:
|
|
self.custodian_metadata['follower_count'] = match.group(1)
|
|
continue
|
|
|
|
# Employee count
|
|
match = re.match(r'^([\d,\-]+)\s*employees?$', text, re.IGNORECASE)
|
|
if match:
|
|
self.custodian_metadata['employee_count'] = match.group(1)
|
|
continue
|
|
|
|
# Associated members
|
|
match = re.match(r'^(\d+)\s*associated\s+members?$', text, re.IGNORECASE)
|
|
if match:
|
|
self.custodian_metadata['associated_members'] = int(match.group(1))
|
|
continue
|
|
|
|
# Industry - must be a clean standalone text, not embedded in JSON
|
|
industry_keywords = ['Museums', 'Archives', 'Libraries', 'Historical Sites', 'Heritage', 'Zoos']
|
|
if any(kw.lower() in text.lower() for kw in industry_keywords):
|
|
# Ensure it's a clean industry text (not JSON or HTML)
|
|
if not text.startswith('{') and not '<' in text and len(text) < 100:
|
|
if 'industry' not in self.custodian_metadata:
|
|
self.custodian_metadata['industry'] = text.strip()
|
|
continue
|
|
|
|
# Location (City, Region)
|
|
match = re.match(r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', text)
|
|
if match and 'location' not in self.custodian_metadata:
|
|
self.custodian_metadata['location'] = {
|
|
'city': match.group(1),
|
|
'region': match.group(2)
|
|
}
|
|
|
|
|
|
def _blocklisted_org_in_headline(headline_lower: str, org: str) -> bool:
    """
    Return True if the blocklisted organization name occurs in the headline.

    The name must start at a word boundary, so short tokens no longer fire
    inside unrelated words ('ns ' in "collections manager", 'action' in
    "transaction"). A trailing space in the blocklist entry additionally
    requires the name to END at a word boundary; unlike a literal trailing
    space this also matches at the end of the headline or before punctuation.
    """
    needle = org.lower()
    must_end_word = needle != needle.rstrip()
    needle = needle.strip()
    if not needle:
        return False
    pattern = r'(?<!\w)' + re.escape(needle) + (r'(?!\w)' if must_end_word else '')
    return re.search(pattern, headline_lower) is not None


def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
    """
    Detect if a headline is heritage-relevant and what type.

    Classification steps, in order:
    1. Organization blocklist (NON_HERITAGE_ORGANIZATIONS) -> not heritage
    2. Non-heritage role keywords (NON_HERITAGE_KEYWORDS) -> not heritage
    3. Typed heritage keywords (HERITAGE_KEYWORDS), first matching type wins
    4. 'D' (Digital) keywords, ONLY if the headline also names a heritage
       organization (HERITAGE_ORGANIZATION_KEYWORDS)
    5. Generic heritage terms -> heritage-relevant without a specific type

    For 'D' (Digital) type, require BOTH a tech role AND a heritage organization.
    This prevents generic IT workers at banks/police from being classified as heritage.

    Args:
        headline: LinkedIn headline / job title text; may be empty.

    Returns:
        Tuple (is_heritage_relevant, heritage_type_code or None).
    """
    if not headline:
        return (False, None)

    headline_lower = headline.lower()

    # Stage 1: Check for non-heritage organizations (blocklist)
    if any(_blocklisted_org_in_headline(headline_lower, org)
           for org in NON_HERITAGE_ORGANIZATIONS):
        return (False, None)

    # Stage 2: Check for non-heritage role indicators
    if any(keyword.lower() in headline_lower for keyword in NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Stage 3: Check if this is a heritage organization
    is_heritage_org = any(
        org_keyword.lower() in headline_lower
        for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS
    )

    # Check heritage keywords by type (order matters - more specific first).
    # 'C' must precede 'A': "corporate archive" and "bedrijfsarchief" contain
    # the 'A' keywords "archive"/"archief" and would otherwise never reach 'C'.
    # 'D' (Digital) is checked last and requires heritage org validation.
    type_order = ['C', 'A', 'M', 'L', 'G', 'S', 'O', 'R', 'E']  # D removed from main loop

    for heritage_type in type_order:
        keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
        if any(keyword.lower() in headline_lower for keyword in keywords):
            return (True, heritage_type)

    # Special handling for 'D' (Digital) - ONLY if at a heritage organization
    if is_heritage_org:
        digital_keywords = HERITAGE_KEYWORDS.get('D', [])
        if any(keyword.lower() in headline_lower for keyword in digital_keywords):
            return (True, 'D')

    # Generic heritage terms (without specific type)
    generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
               'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection']
    if any(keyword in headline_lower for keyword in generic):
        return (True, None)

    return (False, None)
|
|
|
|
|
|
def is_abbreviated_name(name: str) -> bool:
    """
    Check if name contains abbreviations.

    A whitespace-separated part counts as an abbreviation when it is
    - a single letter, with or without trailing dots ("J", "J."),
    - at most two characters long and ending in a dot, or
    - a run of dotted initials ("J.R.R.", "J.P.").

    Args:
        name: Person name; an empty string yields False.

    Returns:
        True if at least one part of the name is abbreviated.
    """
    for part in name.split():
        clean_part = part.rstrip('.')
        if len(clean_part) <= 1 and clean_part.isalpha():
            return True
        if part.endswith('.') and len(part) <= 2:
            return True
        # Several initials written together, e.g. "J.P." in "J.P. de Vries".
        if re.fullmatch(r'(?:[^\W\d_]\.){2,}', part):
            return True
    return False
|
|
|
|
|
|
def generate_staff_id(name: str, index: int, custodian_slug: str) -> str:
    """
    Generate unique staff ID.

    The ID has the form ``<custodian_slug>_staff_<index:04d>_<name_slug>`` where
    name_slug is the lowercased name with diacritics removed, every run of
    characters outside [a-z0-9] replaced by one underscore, and the result
    limited to 30 characters.

    Args:
        name: Display name of the staff member.
        index: Position of the entry; zero-padded to four digits.
        custodian_slug: Slug of the custodian organization, used as prefix.

    Returns:
        The staff ID string. If the name contains no ASCII letters or digits
        the name part is empty and the ID ends with an underscore.
    """
    # NFD splits accented letters into base letter + combining mark ('Mn'),
    # so dropping the marks turns "é" into "e".
    decomposed = unicodedata.normalize('NFD', name.lower())
    ascii_name = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    # The '+' already collapses runs, so no separate de-duplication is needed.
    name_slug = re.sub(r'[^a-z0-9]+', '_', ascii_name).strip('_')
    if len(name_slug) > 30:
        name_slug = name_slug[:30].rstrip('_')
    return f"{custodian_slug}_staff_{index:04d}_{name_slug}"
|
|
|
|
|
|
def _merge_duplicate_profiles(profiles: list[dict]) -> dict:
    """Merge occurrences of one profile: first occurrence wins, later ones fill empty fields."""
    merged = dict(profiles[0])
    for duplicate in profiles[1:]:
        for key, value in duplicate.items():
            if value and not merged.get(key):
                merged[key] = value
    return merged


def _build_staff_entry(
    profile: dict,
    display_name: str,
    name_type: str,
    index: int,
    custodian_name: str,
    custodian_slug: str,
    default_heritage_type: str,
) -> dict[str, Any]:
    """Build the common part of a staff record from a parsed profile card."""
    headline = profile.get('headline', '')
    is_heritage, heritage_type = detect_heritage_type(headline)
    # Without a headline there is nothing to classify; the person is listed on
    # the custodian's own page, so fall back to the custodian's type.
    if not headline and custodian_name:
        is_heritage = True
        heritage_type = default_heritage_type

    return {
        'staff_id': generate_staff_id(display_name, index, custodian_slug),
        'name': display_name,
        'name_type': name_type,
        'degree': profile.get('degree', 'unknown'),
        'headline': headline,
        'mutual_connections': profile.get('mutual_connections', ''),
        'heritage_relevant': is_heritage,
        'heritage_type': heritage_type,
    }


def parse_html_file(
    filepath: Path,
    custodian_name: str,
    custodian_slug: str,
    default_heritage_type: str = 'M',
) -> dict[str, Any]:
    """
    Parse LinkedIn company People page HTML and extract all staff data.

    Handles:
    - Duplicate profile merging (same profile appearing several times on the page,
      identified by LinkedIn slug)
    - Anonymous "LinkedIn Member" entries (each counted separately)

    Args:
        filepath: Path to the saved HTML file (a plain string is accepted too).
        custodian_name: Name of the custodian organization.
        custodian_slug: Slug used as prefix for generated staff IDs.
        default_heritage_type: Heritage type assigned to staff without a
            headline; defaults to 'M' as before.

    Returns:
        Complete staff JSON structure with the keys 'custodian_metadata',
        'source_metadata', 'staff' and 'staff_analysis'.

    Raises:
        OSError: If the file cannot be read.
    """
    filepath = Path(filepath)
    html_content = filepath.read_text(encoding='utf-8', errors='replace')

    # Parse HTML
    parser = LinkedInProfileCardParser()
    try:
        parser.feed(html_content)
        parser.close()  # flush text still buffered by the parser
    except Exception as e:
        # Deliberately best-effort: keep whatever was parsed before the error.
        print(f"Warning: HTML parsing error: {e}", file=sys.stderr)

    raw_profiles = parser.finalize()
    custodian_metadata = parser.custodian_metadata

    # First pass: Group profiles by LinkedIn SLUG to detect duplicates.
    # The same profile may appear multiple times on a page (LinkedIn UI quirk).
    # We merge by slug, NOT by name, because different people can have the same name.
    # BUT: Do NOT merge "LinkedIn Member" (anonymous) - each is unique.
    slug_to_profiles: dict[str, list[dict]] = {}

    for profile in raw_profiles:
        name = (profile.get('name') or '').strip()
        if not name:
            continue

        slug = profile.get('linkedin_slug', '')
        is_anonymous = profile.get('is_anonymous', False) or name == 'LinkedIn Member'

        if is_anonymous:
            # Each anonymous profile gets a unique key (cannot deduplicate without slug).
            # The dict only grows, so its current size is a unique suffix.
            slug_to_profiles[f"_anonymous_{len(slug_to_profiles)}"] = [profile]
        elif slug:
            # Deduplicate by slug - same slug = same person appearing multiple times
            slug_to_profiles.setdefault(slug, []).append(profile)
        else:
            # No slug (shouldn't happen for non-anonymous) - use unique key
            slug_to_profiles[f"_no_slug_{len(slug_to_profiles)}"] = [profile]

    # Second pass: Build staff list with merged duplicates
    staff: list[dict] = []
    anonymous_count = 0
    duplicate_profiles_count = 0

    for slug_key, profiles in slug_to_profiles.items():
        has_real_slug = False

        if slug_key.startswith('_anonymous_'):
            # Anonymous profile
            profile = profiles[0]
            anonymous_count += 1
            display_name = f"LinkedIn Member #{anonymous_count}"
            name_type = 'anonymous'
        elif slug_key.startswith('_no_slug_'):
            # Profile without slug (rare edge case)
            profile = profiles[0]
            display_name = profile.get('name', 'Unknown')
            name_type = 'abbreviated' if is_abbreviated_name(display_name) else 'full'
        else:
            # Regular profile with slug - may have duplicates to merge
            # (same profile appearing multiple times on page)
            has_real_slug = True
            profile = _merge_duplicate_profiles(profiles)
            display_name = profile.get('name', slug_key)
            name_type = 'abbreviated' if is_abbreviated_name(display_name) else 'full'
            duplicate_profiles_count += len(profiles) - 1

        staff_entry = _build_staff_entry(
            profile, display_name, name_type, len(staff),
            custodian_name, custodian_slug, default_heritage_type,
        )

        # Add primary LinkedIn URL
        if has_real_slug and profile.get('linkedin_profile_url'):
            staff_entry['linkedin_profile_url'] = profile['linkedin_profile_url']
            staff_entry['linkedin_slug'] = profile['linkedin_slug']

        staff.append(staff_entry)

    # Build final output structure
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # PYMK filtered count. NOTE(review): nothing visible in this file sets
    # '_pymk_cards_filtered', so this is always 0; kept for output compatibility.
    pymk_filtered = custodian_metadata.get('_pymk_cards_filtered', 0)

    result = {
        'custodian_metadata': {
            'custodian_name': custodian_name,
            'custodian_slug': custodian_slug,
            'name': custodian_metadata.get('name', custodian_name),
            'industry': custodian_metadata.get('industry', ''),
            'location': custodian_metadata.get('location', {}),
            'follower_count': custodian_metadata.get('follower_count', ''),
            'associated_members': custodian_metadata.get('associated_members', 0),
        },
        'source_metadata': {
            'source_type': 'linkedin_company_people_page_html',
            'source_file': str(filepath.name),
            'registered_timestamp': timestamp,
            'registration_method': 'html_parsing',
            'staff_extracted': len(staff),
            'pymk_cards_filtered': pymk_filtered,
            'duplicate_profiles_merged': duplicate_profiles_count,
        },
        'staff': staff,
        'staff_analysis': {
            'total_staff_extracted': len(staff),
            'with_linkedin_url': sum(1 for s in staff if 'linkedin_profile_url' in s),
            'with_alternate_profiles': sum(1 for s in staff if 'alternate_profiles' in s),
            'anonymous_members': anonymous_count,
            'heritage_relevant_count': sum(1 for s in staff if s.get('heritage_relevant')),
            'staff_by_heritage_type': dict(Counter(
                s.get('heritage_type') for s in staff if s.get('heritage_type')
            )),
        }
    }

    return result
|
|
|
|
|
|
def main(argv: Optional[list[str]] = None) -> int:
    """
    Command-line entry point: parse one saved People page and emit staff JSON.

    The JSON is written to --output if given, otherwise to stdout. The
    human-readable summary goes to stdout when --output is used and to stderr
    otherwise, so that stdout contains nothing but the JSON document and can
    be piped into other tools.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Returns:
        0 on success. Exits with status 1 if the HTML file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Parse LinkedIn company People page HTML to extract staff data'
    )
    parser.add_argument('html_file', type=Path, help='Path to saved HTML file')
    parser.add_argument('--custodian-name', required=True, help='Name of the custodian organization')
    parser.add_argument('--custodian-slug', required=True, help='Slug for staff ID generation')
    parser.add_argument('--output', '-o', type=Path, help='Output JSON file path')

    args = parser.parse_args(argv)

    if not args.html_file.exists():
        print(f"Error: HTML file not found: {args.html_file}", file=sys.stderr)
        sys.exit(1)

    # Keep stdout clean for the JSON document when no output file is given.
    report_stream = sys.stdout if args.output else sys.stderr

    def report(message: str) -> None:
        print(message, file=report_stream)

    report(f"Parsing: {args.html_file}")
    result = parse_html_file(args.html_file, args.custodian_name, args.custodian_slug)
    analysis = result['staff_analysis']

    # Print summary
    report("\nExtraction Results:")
    report(f" Total staff: {analysis['total_staff_extracted']}")
    report(f" With LinkedIn URL: {analysis['with_linkedin_url']}")
    report(f" With alternate profiles: {analysis['with_alternate_profiles']}")
    report(f" Anonymous members: {analysis['anonymous_members']}")
    report(f" Heritage-relevant: {analysis['heritage_relevant_count']}")

    # Show filtering/merging stats
    pymk_filtered = result['source_metadata'].get('pymk_cards_filtered', 0)
    duplicates_merged = result['source_metadata'].get('duplicate_profiles_merged', 0)
    if pymk_filtered > 0:
        report(f"\n 'People you may know' cards filtered: {pymk_filtered}")
    if duplicates_merged > 0:
        report(f" Duplicate profiles merged: {duplicates_merged}")

    # Compare against the member count LinkedIn itself shows on the page.
    expected = result['custodian_metadata'].get('associated_members', 0)
    if expected:
        extracted = analysis['total_staff_extracted']
        report(f"\n Expected (associated members): {expected}")
        report(f" Extracted: {extracted}")
        diff = extracted - expected
        if diff == 0:
            report(" Match: EXACT")
        elif diff > 0:
            report(f" Difference: +{diff} (more than expected)")
        else:
            report(f" Difference: {diff} (fewer than expected)")

    report(f"\n Heritage types: {analysis['staff_by_heritage_type']}")

    # Save output
    if args.output:
        # Create missing parent directories instead of failing on open().
        args.output.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        report(f"\nSaved to: {args.output}")
    else:
        # Print to stdout
        print(json.dumps(result, indent=2, ensure_ascii=False))

    return 0
|
|
|
|
|
|
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == '__main__':
    sys.exit(main())
|