#!/usr/bin/env python3 """ Extract complete LinkedIn staff data from saved company People page HTML files. This script parses saved HTML files to extract complete staff profiles including: - Name - LinkedIn profile URL - Headline/job title - Connection degree - Mutual connections This replaces the need for MD file parsing - HTML contains ALL the data. Usage: python scripts/parse_linkedin_html.py \ --custodian-name "Name" --custodian-slug "slug" \ --output staff.json Example: python scripts/parse_linkedin_html.py \ "data/custodian/person/manual_hc/Rijksmuseum_ People _ LinkedIn.html" \ --custodian-name "Rijksmuseum" \ --custodian-slug "rijksmuseum" \ --output data/custodian/person/rijksmuseum_staff.json """ import argparse import json import re import sys import unicodedata from collections import Counter from datetime import datetime, timezone from pathlib import Path from typing import Any, Optional from html.parser import HTMLParser # Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy HERITAGE_KEYWORDS = { 'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'], 'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'], 'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief', 'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'], 'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder', 'collectiespecialist', 'collectie'], 'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'], 'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO', 'documentatie', 'documentation', 'kenniscentrum', 'historicus'], 'C': ['corporate archive', 'bedrijfsarchief', 'company history'], 'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor', 'associate professor', 'hoogleraar', 'educatie', 'educator'], 'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'], 'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer', 'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'], } NON_HERITAGE_KEYWORDS = [ 'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting', 'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical', 'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking', 'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse' ] class LinkedInProfileCardParser(HTMLParser): """ Parse LinkedIn profile cards from saved HTML. Each profile card has structure: - org-people-profile-card__profile-image-N (contains img with alt=name, href=profile_url) - artdeco-entity-lockup__title (contains name text and profile link) - artdeco-entity-lockup__badge (contains connection degree) - artdeco-entity-lockup__subtitle (contains headline) - Mutual connections text Anonymous "LinkedIn Member" profiles have a different structure: - org-people-profile-card__profile-image-N is on an tag (NOT an tag) - No href link (privacy-protected) - Name appears as "LinkedIn Member" in the title - Still have subtitle (headline) content NOTE: The "People you may know" h2 header in LinkedIn company pages is actually the section title for the associated members list, NOT a separate recommendations section. All profile cards under this header are real associated members. """ def __init__(self): super().__init__() self.profiles: list[dict] = [] self.current_profile: dict = {} # State tracking self.in_profile_card = False self.in_title = False self.in_subtitle = False self.in_badge = False self.in_caption = False self.in_mutual = False self.current_text = "" self.card_index = -1 # For custodian metadata extraction self.custodian_metadata: dict = {} self.in_header = True self.header_texts: list[str] = [] def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: attrs_dict = dict(attrs) attr_id = attrs_dict.get('id', '') attr_class = attrs_dict.get('class', '') # Detect profile card start - can be on tag (regular) OR tag (anonymous) if 'org-people-profile-card__profile-image' in attr_id: self.in_profile_card = True self.in_header = False match = re.search(r'profile-image-(\d+)', attr_id) if match: new_index = int(match.group(1)) if new_index != self.card_index: # Save previous profile if exists if self.current_profile.get('name'): self.profiles.append(self.current_profile) self.current_profile = {} self.card_index = new_index # Extract URL from href (only on tags - regular profiles) href = attrs_dict.get('href', '') if href and 'linkedin.com/in/' in href: slug = self._extract_slug(href) if slug: self.current_profile['linkedin_slug'] = slug self.current_profile['linkedin_profile_url'] = f"https://www.linkedin.com/in/{slug}" # If this is an tag with the profile-image ID, it's likely an anonymous member # We'll capture this and the name will come from the title section as "LinkedIn Member" if tag == 'img': # Mark as potential anonymous (will be confirmed when we see "LinkedIn Member" in title) self.current_profile['_may_be_anonymous'] = True # Extract name from img alt (for regular profiles with named photos) if tag == 'img' and self.in_profile_card: alt = attrs_dict.get('alt', '') if alt and alt not in ('', 'photo', 'Profile photo'): self.current_profile['name'] = alt # Title section (contains name link or "LinkedIn Member" text) if 'artdeco-entity-lockup__title' in attr_class: self.in_title = True self.current_text = "" # Badge section (contains degree) if 'artdeco-entity-lockup__badge' in attr_class: self.in_badge = True self.current_text = "" # Subtitle section (contains headline) if 'artdeco-entity-lockup__subtitle' in attr_class: self.in_subtitle = True self.current_text = "" # Caption/mutual connections if 'artdeco-entity-lockup__caption' in attr_class or 'mutual' in attr_class.lower(): self.in_mutual = True self.current_text = "" # Check for mutual connections in span if tag == 'span' and 'mutual' in attr_class.lower(): self.in_mutual = True self.current_text = "" def handle_data(self, data: str) -> None: text = data.strip() if not text: return # Collect header texts for metadata if self.in_header: self.header_texts.append(text) if self.in_title: self.current_text += " " + text elif self.in_badge: self.current_text += " " + text elif self.in_subtitle: self.current_text += " " + text elif self.in_mutual: self.current_text += " " + text def handle_endtag(self, tag: str) -> None: if tag == 'div': if self.in_title: text = self.current_text.strip() if text and 'name' not in self.current_profile: # Clean up name text = re.sub(r'\s+', ' ', text) if len(text) > 1 and not text.startswith('View '): self.current_profile['name'] = text # Check if this is "LinkedIn Member" (anonymous profile) if text == 'LinkedIn Member': self.current_profile['is_anonymous'] = True self.in_title = False self.current_text = "" if self.in_badge: text = self.current_text.strip() degree = self._parse_degree(text) if degree: self.current_profile['degree'] = degree self.in_badge = False self.current_text = "" if self.in_subtitle: text = self.current_text.strip() if text and len(text) > 2: # Clean up headline text = re.sub(r'\s+', ' ', text) self.current_profile['headline'] = text self.in_subtitle = False self.current_text = "" if tag == 'span' and self.in_mutual: text = self.current_text.strip() if text and 'mutual' in text.lower(): self.current_profile['mutual_connections'] = text self.in_mutual = False self.current_text = "" def _extract_slug(self, url: str) -> Optional[str]: """Extract profile slug from URL.""" match = re.search(r'linkedin\.com/in/([^?/]+)', url) if match: return match.group(1) return None def _parse_degree(self, text: str) -> Optional[str]: """Parse connection degree from text.""" if '1st' in text: return '1st' if '2nd' in text: return '2nd' if '3rd' in text: return '3rd+' return None def finalize(self) -> list[dict]: """Finalize parsing and return all profiles.""" # Save last profile if self.current_profile.get('name'): self.profiles.append(self.current_profile) # Parse custodian metadata from header self._parse_header_metadata() return self.profiles def _parse_header_metadata(self) -> None: """Extract custodian metadata from header texts.""" for text in self.header_texts: # Skip JSON blobs and very long texts (data artifacts) if text.startswith('{') or len(text) > 200: continue # Follower count match = re.match(r'^([\d,\.]+K?)\s*followers?$', text, re.IGNORECASE) if match: self.custodian_metadata['follower_count'] = match.group(1) continue # Employee count match = re.match(r'^([\d,\-]+)\s*employees?$', text, re.IGNORECASE) if match: self.custodian_metadata['employee_count'] = match.group(1) continue # Associated members match = re.match(r'^(\d+)\s*associated\s+members?$', text, re.IGNORECASE) if match: self.custodian_metadata['associated_members'] = int(match.group(1)) continue # Industry - must be a clean standalone text, not embedded in JSON industry_keywords = ['Museums', 'Archives', 'Libraries', 'Historical Sites', 'Heritage', 'Zoos'] if any(kw.lower() in text.lower() for kw in industry_keywords): # Ensure it's a clean industry text (not JSON or HTML) if not text.startswith('{') and not '<' in text and len(text) < 100: if 'industry' not in self.custodian_metadata: self.custodian_metadata['industry'] = text.strip() continue # Location (City, Region) match = re.match(r'^([A-Z][a-zA-Zéèêëïöüá\-]+),\s*([A-Z][a-zA-Zéèêëïöüá\-\s]+)$', text) if match and 'location' not in self.custodian_metadata: self.custodian_metadata['location'] = { 'city': match.group(1), 'region': match.group(2) } def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]: """Detect if a headline is heritage-relevant and what type.""" if not headline: return (False, None) headline_lower = headline.lower() # Check non-heritage first for keyword in NON_HERITAGE_KEYWORDS: if keyword.lower() in headline_lower: return (False, None) # Check heritage keywords by type type_order = ['A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C'] for heritage_type in type_order: keywords = HERITAGE_KEYWORDS.get(heritage_type, []) for keyword in keywords: if keyword.lower() in headline_lower: return (True, heritage_type) # Generic heritage terms generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema', 'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation'] for keyword in generic: if keyword in headline_lower: return (True, None) return (False, None) def is_abbreviated_name(name: str) -> bool: """Check if name contains abbreviations.""" parts = name.split() for part in parts: clean_part = part.rstrip('.') if len(clean_part) <= 1 and clean_part.isalpha(): return True if part.endswith('.') and len(part) <= 2: return True return False def generate_staff_id(name: str, index: int, custodian_slug: str) -> str: """Generate unique staff ID.""" normalized = unicodedata.normalize('NFD', name.lower()) ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') name_slug = re.sub(r'[^a-z0-9]+', '_', ascii_name) name_slug = re.sub(r'_+', '_', name_slug).strip('_') if len(name_slug) > 30: name_slug = name_slug[:30].rstrip('_') return f"{custodian_slug}_staff_{index:04d}_{name_slug}" def parse_html_file(filepath: Path, custodian_name: str, custodian_slug: str) -> dict[str, Any]: """ Parse LinkedIn company People page HTML and extract all staff data. Handles: - Duplicate profile merging (same person with multiple LinkedIn accounts) - Anonymous "LinkedIn Member" entries (each counted separately) Returns complete staff JSON structure. """ with open(filepath, 'r', encoding='utf-8', errors='replace') as f: html_content = f.read() # Parse HTML parser = LinkedInProfileCardParser() try: parser.feed(html_content) except Exception as e: print(f"Warning: HTML parsing error: {e}", file=sys.stderr) raw_profiles = parser.finalize() custodian_metadata = parser.custodian_metadata # First pass: Group profiles by LinkedIn SLUG to detect duplicates # The same profile may appear multiple times on a page (LinkedIn UI quirk) # We merge by slug, NOT by name, because different people can have the same name # BUT: Do NOT merge "LinkedIn Member" (anonymous) - each is unique slug_to_profiles: dict[str, list[dict]] = {} for profile in raw_profiles: name = profile.get('name', '').strip() slug = profile.get('linkedin_slug', '') is_anonymous = profile.get('is_anonymous', False) or name == 'LinkedIn Member' if not name: continue if is_anonymous: # Each anonymous profile gets a unique key (cannot deduplicate without slug) unique_key = f"_anonymous_{len(slug_to_profiles)}" slug_to_profiles[unique_key] = [profile] elif slug: # Deduplicate by slug - same slug = same person appearing multiple times if slug not in slug_to_profiles: slug_to_profiles[slug] = [] slug_to_profiles[slug].append(profile) else: # No slug (shouldn't happen for non-anonymous) - use unique key unique_key = f"_no_slug_{len(slug_to_profiles)}" slug_to_profiles[unique_key] = [profile] # Second pass: Build staff list with merged duplicates staff: list[dict] = [] anonymous_count = 0 duplicate_profiles_count = 0 for slug_key, profiles in slug_to_profiles.items(): if slug_key.startswith('_anonymous_'): # Anonymous profile profile = profiles[0] anonymous_count += 1 display_name = f"LinkedIn Member #{anonymous_count}" name_type = 'anonymous' headline = profile.get('headline', '') is_heritage, heritage_type = detect_heritage_type(headline) if not headline and custodian_name: is_heritage = True heritage_type = 'M' staff_entry = { 'staff_id': generate_staff_id(display_name, len(staff), custodian_slug), 'name': display_name, 'name_type': name_type, 'degree': profile.get('degree', 'unknown'), 'headline': headline, 'mutual_connections': profile.get('mutual_connections', ''), 'heritage_relevant': is_heritage, 'heritage_type': heritage_type, } staff.append(staff_entry) elif slug_key.startswith('_no_slug_'): # Profile without slug (rare edge case) profile = profiles[0] name = profile.get('name', 'Unknown') if is_abbreviated_name(name): name_type = 'abbreviated' else: name_type = 'full' headline = profile.get('headline', '') is_heritage, heritage_type = detect_heritage_type(headline) if not headline and custodian_name: is_heritage = True heritage_type = 'M' staff_entry = { 'staff_id': generate_staff_id(name, len(staff), custodian_slug), 'name': name, 'name_type': name_type, 'degree': profile.get('degree', 'unknown'), 'headline': headline, 'mutual_connections': profile.get('mutual_connections', ''), 'heritage_relevant': is_heritage, 'heritage_type': heritage_type, } staff.append(staff_entry) else: # Regular profile with slug - may have duplicates to merge # (same profile appearing multiple times on page) primary = profiles[0] name = primary.get('name', slug_key) # Determine name type if is_abbreviated_name(name): name_type = 'abbreviated' else: name_type = 'full' headline = primary.get('headline', '') is_heritage, heritage_type = detect_heritage_type(headline) if not headline and custodian_name: is_heritage = True heritage_type = 'M' staff_entry = { 'staff_id': generate_staff_id(name, len(staff), custodian_slug), 'name': name, 'name_type': name_type, 'degree': primary.get('degree', 'unknown'), 'headline': headline, 'mutual_connections': primary.get('mutual_connections', ''), 'heritage_relevant': is_heritage, 'heritage_type': heritage_type, } # Add primary LinkedIn URL if primary.get('linkedin_profile_url'): staff_entry['linkedin_profile_url'] = primary['linkedin_profile_url'] staff_entry['linkedin_slug'] = primary['linkedin_slug'] # If same profile appeared multiple times, count as duplicates merged if len(profiles) > 1: duplicate_profiles_count += len(profiles) - 1 staff.append(staff_entry) # Build final output structure timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') # Calculate PYMK filtered count pymk_filtered = custodian_metadata.get('_pymk_cards_filtered', 0) result = { 'custodian_metadata': { 'custodian_name': custodian_name, 'custodian_slug': custodian_slug, 'name': custodian_metadata.get('name', custodian_name), 'industry': custodian_metadata.get('industry', ''), 'location': custodian_metadata.get('location', {}), 'follower_count': custodian_metadata.get('follower_count', ''), 'associated_members': custodian_metadata.get('associated_members', 0), }, 'source_metadata': { 'source_type': 'linkedin_company_people_page_html', 'source_file': str(filepath.name), 'registered_timestamp': timestamp, 'registration_method': 'html_parsing', 'staff_extracted': len(staff), 'pymk_cards_filtered': pymk_filtered, 'duplicate_profiles_merged': duplicate_profiles_count, }, 'staff': staff, 'staff_analysis': { 'total_staff_extracted': len(staff), 'with_linkedin_url': sum(1 for s in staff if 'linkedin_profile_url' in s), 'with_alternate_profiles': sum(1 for s in staff if 'alternate_profiles' in s), 'anonymous_members': anonymous_count, 'heritage_relevant_count': sum(1 for s in staff if s.get('heritage_relevant')), 'staff_by_heritage_type': dict(Counter( s.get('heritage_type') for s in staff if s.get('heritage_type') )), } } return result def main(): parser = argparse.ArgumentParser( description='Parse LinkedIn company People page HTML to extract staff data' ) parser.add_argument('html_file', type=Path, help='Path to saved HTML file') parser.add_argument('--custodian-name', required=True, help='Name of the custodian organization') parser.add_argument('--custodian-slug', required=True, help='Slug for staff ID generation') parser.add_argument('--output', '-o', type=Path, help='Output JSON file path') args = parser.parse_args() if not args.html_file.exists(): print(f"Error: HTML file not found: {args.html_file}", file=sys.stderr) sys.exit(1) print(f"Parsing: {args.html_file}") result = parse_html_file(args.html_file, args.custodian_name, args.custodian_slug) # Print summary print(f"\nExtraction Results:") print(f" Total staff: {result['staff_analysis']['total_staff_extracted']}") print(f" With LinkedIn URL: {result['staff_analysis']['with_linkedin_url']}") print(f" With alternate profiles: {result['staff_analysis']['with_alternate_profiles']}") print(f" Anonymous members: {result['staff_analysis']['anonymous_members']}") print(f" Heritage-relevant: {result['staff_analysis']['heritage_relevant_count']}") # Show filtering/merging stats pymk_filtered = result['source_metadata'].get('pymk_cards_filtered', 0) duplicates_merged = result['source_metadata'].get('duplicate_profiles_merged', 0) if pymk_filtered > 0: print(f"\n 'People you may know' cards filtered: {pymk_filtered}") if duplicates_merged > 0: print(f" Duplicate profiles merged: {duplicates_merged}") expected = result['custodian_metadata'].get('associated_members', 0) if expected: extracted = result['staff_analysis']['total_staff_extracted'] print(f"\n Expected (associated members): {expected}") print(f" Extracted: {extracted}") diff = extracted - expected if diff == 0: print(f" Match: EXACT") elif diff > 0: print(f" Difference: +{diff} (more than expected)") else: print(f" Difference: {diff} (fewer than expected)") print(f"\n Heritage types: {result['staff_analysis']['staff_by_heritage_type']}") # Save output if args.output: with open(args.output, 'w', encoding='utf-8') as f: json.dump(result, f, indent=2, ensure_ascii=False) print(f"\nSaved to: {args.output}") else: # Print to stdout print(json.dumps(result, indent=2, ensure_ascii=False)) return 0 if __name__ == '__main__': sys.exit(main())