#!/usr/bin/env python3 """ Extract LinkedIn profile URLs from saved LinkedIn company People page HTML files. This script parses saved HTML files to extract name → profile URL mappings, which can then be used to enrich staff data parsed from markdown files. The HTML contains profile cards with structure like: Person Name Person Name Usage: python scripts/extract_linkedin_urls_from_html.py [--output json_file] Example: python scripts/extract_linkedin_urls_from_html.py \ "data/custodian/person/manual_hc/Rijksmuseum_ People _ LinkedIn.html" \ --output data/custodian/person/rijksmuseum_profile_urls.json """ import argparse import json import re import sys from collections import defaultdict from pathlib import Path from typing import Any from html.parser import HTMLParser from urllib.parse import urlparse, parse_qs, unquote class LinkedInProfileExtractor(HTMLParser): """ HTML parser to extract LinkedIn profile URLs and associated names. """ def __init__(self): super().__init__() self.profiles: dict[str, dict] = {} # url_slug -> {name, full_url, ...} self.current_href = None self.current_name = None self.in_link = False self.link_text = "" # Track all name-url associations self.name_to_urls: dict[str, list[str]] = defaultdict(list) self.url_to_names: dict[str, list[str]] = defaultdict(list) def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: attrs_dict = dict(attrs) if tag == 'a': href = attrs_dict.get('href', '') if href and 'linkedin.com/in/' in href: self.in_link = True self.current_href = href self.link_text = "" # Extract name from aria-label if available aria_label = attrs_dict.get('aria-label', '') if aria_label: # "View Kelly Davis' profile" -> "Kelly Davis" match = re.match(r"View (.+?)'s profile", aria_label) if match: self.current_name = match.group(1) self._record_association(self.current_name, href) elif tag == 'img': # Images have alt text with names alt = attrs_dict.get('alt', '') if alt and self.current_href: # Don't use generic alt text if alt.lower() not in ('profile photo', 'photo', 'image', ''): self._record_association(alt, self.current_href) def handle_data(self, data: str) -> None: if self.in_link: self.link_text += data.strip() def handle_endtag(self, tag: str) -> None: if tag == 'a' and self.in_link: # Record link text as name if self.link_text and self.current_href: # Clean up the name name = self.link_text.strip() if name and len(name) > 1 and not name.isdigit(): self._record_association(name, self.current_href) self.in_link = False self.current_href = None self.link_text = "" def _record_association(self, name: str, url: str) -> None: """Record a name-URL association.""" if not name or not url: return # Extract the clean slug from URL slug = extract_slug_from_url(url) if not slug: return # Clean name name = name.strip() if not name or len(name) < 2: return # Record both directions self.name_to_urls[name].append(slug) self.url_to_names[slug].append(name) # Store in profiles dict (will be deduplicated later) if slug not in self.profiles: self.profiles[slug] = { 'slug': slug, 'full_url': f"https://www.linkedin.com/in/{slug}", 'names': set(), 'is_aco_id': slug.startswith('ACo'), } self.profiles[slug]['names'].add(name) def extract_slug_from_url(url: str) -> str | None: """ Extract the profile slug from a LinkedIn URL. Handles: - https://www.linkedin.com/in/username - https://www.linkedin.com/in/username?miniProfileUrn=... - /in/username (relative URL) """ # Handle relative URLs if url.startswith('/in/'): url = f"https://www.linkedin.com{url}" try: parsed = urlparse(url) path = parsed.path # Extract from /in/username match = re.match(r'/in/([^/?]+)', path) if match: return match.group(1) except Exception: pass return None def parse_html_file(filepath: Path) -> dict[str, Any]: """ Parse an HTML file and extract profile URL mappings. Returns a dict with: - profiles: dict[slug] -> {slug, full_url, names, is_aco_id} - name_to_slug: dict[name] -> slug (best match) - stats: extraction statistics """ with open(filepath, 'r', encoding='utf-8', errors='replace') as f: html_content = f.read() # Parse with our custom parser parser = LinkedInProfileExtractor() try: parser.feed(html_content) except Exception as e: print(f"Warning: HTML parsing error: {e}", file=sys.stderr) # Also do regex extraction as backup # Pattern for profile URLs url_pattern = r'linkedin\.com/in/([a-zA-Z0-9_-]+)' regex_slugs = set(re.findall(url_pattern, html_content)) # Add any regex-found slugs not in parser results for slug in regex_slugs: if slug not in parser.profiles: parser.profiles[slug] = { 'slug': slug, 'full_url': f"https://www.linkedin.com/in/{slug}", 'names': set(), 'is_aco_id': slug.startswith('ACo'), } # Build name -> slug mapping (prefer non-ACo slugs) name_to_slug: dict[str, str] = {} for name, slugs in parser.name_to_urls.items(): # Get unique slugs unique_slugs = list(set(slugs)) # Prefer non-ACo slugs non_aco = [s for s in unique_slugs if not s.startswith('ACo')] if non_aco: name_to_slug[name] = non_aco[0] elif unique_slugs: name_to_slug[name] = unique_slugs[0] # Convert sets to lists for JSON serialization profiles_serializable = {} for slug, data in parser.profiles.items(): profiles_serializable[slug] = { **data, 'names': list(data['names']) } # Compute stats total_profiles = len(parser.profiles) aco_profiles = len([s for s in parser.profiles if s.startswith('ACo')]) named_profiles = len([p for p in parser.profiles.values() if p['names']]) return { 'profiles': profiles_serializable, 'name_to_slug': name_to_slug, 'slug_to_names': {slug: list(names) for slug, names in parser.url_to_names.items()}, 'stats': { 'total_profiles': total_profiles, 'clean_slugs': total_profiles - aco_profiles, 'aco_ids': aco_profiles, 'profiles_with_names': named_profiles, 'unique_names_found': len(name_to_slug), } } def normalize_name_for_matching(name: str) -> str: """Normalize a name for fuzzy matching.""" import unicodedata # NFD decomposition and remove diacritics normalized = unicodedata.normalize('NFD', name.lower()) ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') # Remove extra whitespace ascii_name = ' '.join(ascii_name.split()) return ascii_name def match_staff_to_urls(staff_json_path: Path, url_data: dict) -> dict[str, Any]: """ Match existing staff entries to extracted URLs. Returns enrichment data with: - matched: staff entries with URL matches - unmatched_staff: staff entries without URL matches - unmatched_urls: URLs without staff matches """ with open(staff_json_path, 'r', encoding='utf-8') as f: staff_data = json.load(f) staff_list = staff_data.get('staff', []) name_to_slug = url_data.get('name_to_slug', {}) slug_to_names = url_data.get('slug_to_names', {}) # Build normalized name lookup normalized_lookup: dict[str, str] = {} for name, slug in name_to_slug.items(): norm_name = normalize_name_for_matching(name) normalized_lookup[norm_name] = slug matched = [] unmatched_staff = [] used_slugs = set() for staff in staff_list: name = staff.get('name', '') norm_name = normalize_name_for_matching(name) # Try exact match first slug = name_to_slug.get(name) # Try normalized match if not slug: slug = normalized_lookup.get(norm_name) if slug: staff_enriched = { **staff, 'linkedin_profile_url': f"https://www.linkedin.com/in/{slug}", 'linkedin_slug': slug, } matched.append(staff_enriched) used_slugs.add(slug) else: unmatched_staff.append(staff) # Find URLs without matches all_slugs = set(url_data.get('profiles', {}).keys()) unmatched_urls = [] for slug in all_slugs - used_slugs: profile = url_data['profiles'].get(slug, {}) unmatched_urls.append({ 'slug': slug, 'names': profile.get('names', []), 'is_aco_id': profile.get('is_aco_id', False), }) return { 'matched': matched, 'unmatched_staff': unmatched_staff, 'unmatched_urls': unmatched_urls, 'match_stats': { 'total_staff': len(staff_list), 'matched_count': len(matched), 'unmatched_staff_count': len(unmatched_staff), 'unmatched_url_count': len(unmatched_urls), 'match_rate': len(matched) / len(staff_list) if staff_list else 0, } } def main(): parser = argparse.ArgumentParser( description='Extract LinkedIn profile URLs from saved HTML files' ) parser.add_argument('html_file', type=Path, help='Path to saved HTML file') parser.add_argument('--output', '-o', type=Path, help='Output JSON file path') parser.add_argument('--staff-json', type=Path, help='Optional: Staff JSON file to enrich with URLs') parser.add_argument('--enrich-output', type=Path, help='Output path for enriched staff JSON') args = parser.parse_args() if not args.html_file.exists(): print(f"Error: HTML file not found: {args.html_file}", file=sys.stderr) sys.exit(1) print(f"Parsing HTML file: {args.html_file}") url_data = parse_html_file(args.html_file) # Print stats stats = url_data['stats'] print(f"\nExtraction Results:") print(f" Total profiles found: {stats['total_profiles']}") print(f" Clean slugs: {stats['clean_slugs']}") print(f" ACo IDs: {stats['aco_ids']}") print(f" Profiles with names: {stats['profiles_with_names']}") print(f" Unique names found: {stats['unique_names_found']}") # Save URL extraction results if args.output: with open(args.output, 'w', encoding='utf-8') as f: json.dump(url_data, f, indent=2, ensure_ascii=False) print(f"\nSaved URL data to: {args.output}") # Enrich staff data if provided if args.staff_json: if not args.staff_json.exists(): print(f"Error: Staff JSON not found: {args.staff_json}", file=sys.stderr) sys.exit(1) print(f"\nMatching staff to URLs...") match_results = match_staff_to_urls(args.staff_json, url_data) match_stats = match_results['match_stats'] print(f"\nMatching Results:") print(f" Total staff: {match_stats['total_staff']}") print(f" Matched: {match_stats['matched_count']} ({match_stats['match_rate']:.1%})") print(f" Unmatched staff: {match_stats['unmatched_staff_count']}") print(f" Unmatched URLs: {match_stats['unmatched_url_count']}") # Show some unmatched staff names if match_results['unmatched_staff'][:5]: print(f"\n Sample unmatched staff:") for staff in match_results['unmatched_staff'][:5]: print(f" - {staff.get('name')}") # Save enriched data if args.enrich_output: with open(args.staff_json, 'r', encoding='utf-8') as f: original_data = json.load(f) # Create enriched version enriched_data = { **original_data, 'staff': match_results['matched'] + match_results['unmatched_staff'], 'url_enrichment_stats': match_stats, } with open(args.enrich_output, 'w', encoding='utf-8') as f: json.dump(enriched_data, f, indent=2, ensure_ascii=False) print(f"\nSaved enriched staff data to: {args.enrich_output}") return 0 if __name__ == '__main__': sys.exit(main())