#!/usr/bin/env python3
"""
Extract LinkedIn profile URLs from saved LinkedIn company People page HTML files.
This script parses saved HTML files to extract name → profile URL mappings,
which can then be used to enrich staff data parsed from markdown files.
The HTML contains profile cards: anchor tags whose href points at a
/in/<slug> profile URL, with the person's name carried in the visible link
text, the card image's alt text, or an aria-label like "View Jane Doe's profile".
Usage:
python scripts/extract_linkedin_urls_from_html.py [--output json_file]
Example:
python scripts/extract_linkedin_urls_from_html.py \
"data/custodian/person/manual_hc/Rijksmuseum_ People _ LinkedIn.html" \
--output data/custodian/person/rijksmuseum_profile_urls.json
"""
import argparse
import json
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any
from html.parser import HTMLParser
from urllib.parse import urlparse, parse_qs, unquote
class LinkedInProfileExtractor(HTMLParser):
    """
    HTML parser to extract LinkedIn profile URLs and associated names.

    Names are harvested from three places on a profile card:
      - the anchor's aria-label (e.g. "View Jane Doe's profile"),
      - the alt text of <img> tags seen while a profile link is current,
      - the anchor's visible link text.

    After feeding HTML, results are available in:
      - ``profiles``: slug -> {slug, full_url, names (set), is_aco_id}
      - ``name_to_urls`` / ``url_to_names``: raw name<->slug associations
    """
    def __init__(self):
        super().__init__()
        self.profiles: dict[str, dict] = {}  # url_slug -> {name, full_url, ...}
        self.current_href: str | None = None  # href of most recent profile link
        self.current_name: str | None = None
        self.in_link = False  # True while inside a linkedin.com/in/ anchor
        self.link_text = ""   # accumulated text of the current anchor
        # Track all name-url associations (both directions)
        self.name_to_urls: dict[str, list[str]] = defaultdict(list)
        self.url_to_names: dict[str, list[str]] = defaultdict(list)
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)
        if tag == 'a':
            href = attrs_dict.get('href', '')
            if href and 'linkedin.com/in/' in href:
                self.in_link = True
                self.current_href = href
                self.link_text = ""
                # Extract name from aria-label if available
                aria_label = attrs_dict.get('aria-label', '')
                if aria_label:
                    # "View Kelly Davis' profile" -> "Kelly Davis"
                    # BUGFIX: the apostrophe may be straight (') or curly (’),
                    # and a possessive of a name ending in "s" carries no
                    # trailing "s" ("Kelly Davis' profile"), so the "s" must
                    # be optional. The old pattern required a literal "'s".
                    match = re.match(r"View (.+?)['\u2019]s? profile", aria_label)
                    if match:
                        self.current_name = match.group(1)
                        self._record_association(self.current_name, href)
        elif tag == 'img':
            # Images have alt text with names
            alt = attrs_dict.get('alt', '')
            if alt and self.current_href:
                # Don't use generic alt text
                if alt.lower() not in ('profile photo', 'photo', 'image', ''):
                    self._record_association(alt, self.current_href)
    def handle_data(self, data: str) -> None:
        if self.in_link:
            self.link_text += data.strip()
    def handle_endtag(self, tag: str) -> None:
        if tag == 'a' and self.in_link:
            # Record link text as name
            if self.link_text and self.current_href:
                # Clean up the name
                name = self.link_text.strip()
                # Skip single characters and bare numbers (badge counts etc.)
                if name and len(name) > 1 and not name.isdigit():
                    self._record_association(name, self.current_href)
            self.in_link = False
            self.current_href = None
            self.link_text = ""
    def _record_association(self, name: str, url: str) -> None:
        """Record a name-URL association (both directions) for *url*'s slug."""
        if not name or not url:
            return
        # Extract the clean slug from URL
        slug = extract_slug_from_url(url)
        if not slug:
            return
        # Clean name
        name = name.strip()
        if not name or len(name) < 2:
            return
        # Record both directions
        self.name_to_urls[name].append(slug)
        self.url_to_names[slug].append(name)
        # Store in profiles dict (will be deduplicated later)
        if slug not in self.profiles:
            self.profiles[slug] = {
                'slug': slug,
                'full_url': f"https://www.linkedin.com/in/{slug}",
                'names': set(),
                # 'ACo...' slugs are LinkedIn's opaque member IDs, not vanity URLs
                'is_aco_id': slug.startswith('ACo'),
            }
        self.profiles[slug]['names'].add(name)
def extract_slug_from_url(url: str) -> str | None:
"""
Extract the profile slug from a LinkedIn URL.
Handles:
- https://www.linkedin.com/in/username
- https://www.linkedin.com/in/username?miniProfileUrn=...
- /in/username (relative URL)
"""
# Handle relative URLs
if url.startswith('/in/'):
url = f"https://www.linkedin.com{url}"
try:
parsed = urlparse(url)
path = parsed.path
# Extract from /in/username
match = re.match(r'/in/([^/?]+)', path)
if match:
return match.group(1)
except Exception:
pass
return None
def parse_html_file(filepath: Path) -> dict[str, Any]:
    """
    Parse a saved LinkedIn HTML page and extract profile URL mappings.

    Returns a dict with:
    - profiles: dict[slug] -> {slug, full_url, names, is_aco_id}
    - name_to_slug: dict[name] -> slug (best match, non-ACo preferred)
    - slug_to_names: dict[slug] -> list of names seen for that slug
    - stats: extraction statistics
    """
    html_content = filepath.read_text(encoding='utf-8', errors='replace')

    # Run the structural parser; a malformed document degrades to a warning
    # because the regex fallback below still recovers bare slugs.
    extractor = LinkedInProfileExtractor()
    try:
        extractor.feed(html_content)
    except Exception as e:
        print(f"Warning: HTML parsing error: {e}", file=sys.stderr)

    # Regex fallback: pick up any profile slugs the HTML parser missed.
    fallback_slugs = set(re.findall(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', html_content))
    for slug in fallback_slugs:
        extractor.profiles.setdefault(slug, {
            'slug': slug,
            'full_url': f"https://www.linkedin.com/in/{slug}",
            'names': set(),
            'is_aco_id': slug.startswith('ACo'),
        })

    # Pick one slug per name, preferring human-readable (non-ACo) slugs.
    name_to_slug: dict[str, str] = {}
    for name, slugs in extractor.name_to_urls.items():
        candidates = list(set(slugs))
        readable = [s for s in candidates if not s.startswith('ACo')]
        chosen = readable or candidates
        if chosen:
            name_to_slug[name] = chosen[0]

    # Name sets become lists so the result is JSON-serializable.
    profiles_serializable = {
        slug: {**data, 'names': list(data['names'])}
        for slug, data in extractor.profiles.items()
    }

    total_profiles = len(extractor.profiles)
    aco_profiles = sum(1 for s in extractor.profiles if s.startswith('ACo'))
    named_profiles = sum(1 for p in extractor.profiles.values() if p['names'])

    return {
        'profiles': profiles_serializable,
        'name_to_slug': name_to_slug,
        'slug_to_names': {slug: list(names) for slug, names in extractor.url_to_names.items()},
        'stats': {
            'total_profiles': total_profiles,
            'clean_slugs': total_profiles - aco_profiles,
            'aco_ids': aco_profiles,
            'profiles_with_names': named_profiles,
            'unique_names_found': len(name_to_slug),
        }
    }
def normalize_name_for_matching(name: str) -> str:
    """Lowercase *name*, strip diacritics, and collapse internal whitespace."""
    import unicodedata
    # NFD splits accented characters into base char + combining mark;
    # dropping category 'Mn' (combining marks) leaves the plain base chars.
    decomposed = unicodedata.normalize('NFD', name.lower())
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    return ' '.join(stripped.split())
def match_staff_to_urls(staff_json_path: Path, url_data: dict) -> dict[str, Any]:
    """
    Match existing staff entries to extracted URLs.

    Returns enrichment data with:
    - matched: staff entries with URL matches (enriched with
      linkedin_profile_url / linkedin_slug)
    - unmatched_staff: staff entries without URL matches
    - unmatched_urls: URLs without staff matches
    - match_stats: summary counts and match rate
    """
    with open(staff_json_path, 'r', encoding='utf-8') as fh:
        staff_payload = json.load(fh)

    staff_list = staff_payload.get('staff', [])
    name_to_slug = url_data.get('name_to_slug', {})

    # Secondary lookup keyed on diacritic-/case-/whitespace-insensitive names.
    normalized_lookup: dict[str, str] = {
        normalize_name_for_matching(raw): slug
        for raw, slug in name_to_slug.items()
    }

    matched: list = []
    unmatched_staff: list = []
    used_slugs: set = set()
    for entry in staff_list:
        staff_name = entry.get('name', '')
        # Exact match first, then the normalized fallback.
        slug = name_to_slug.get(staff_name)
        if not slug:
            slug = normalized_lookup.get(normalize_name_for_matching(staff_name))
        if slug:
            matched.append({
                **entry,
                'linkedin_profile_url': f"https://www.linkedin.com/in/{slug}",
                'linkedin_slug': slug,
            })
            used_slugs.add(slug)
        else:
            unmatched_staff.append(entry)

    # Any extracted profile not claimed by a staff entry is reported back.
    profiles = url_data.get('profiles', {})
    unmatched_urls = []
    for slug in set(profiles.keys()) - used_slugs:
        profile = profiles.get(slug, {})
        unmatched_urls.append({
            'slug': slug,
            'names': profile.get('names', []),
            'is_aco_id': profile.get('is_aco_id', False),
        })

    return {
        'matched': matched,
        'unmatched_staff': unmatched_staff,
        'unmatched_urls': unmatched_urls,
        'match_stats': {
            'total_staff': len(staff_list),
            'matched_count': len(matched),
            'unmatched_staff_count': len(unmatched_staff),
            'unmatched_url_count': len(unmatched_urls),
            'match_rate': len(matched) / len(staff_list) if staff_list else 0,
        }
    }
def main():
    """CLI entry point: extract URLs from HTML, optionally enrich staff JSON."""
    arg_parser = argparse.ArgumentParser(
        description='Extract LinkedIn profile URLs from saved HTML files'
    )
    arg_parser.add_argument('html_file', type=Path, help='Path to saved HTML file')
    arg_parser.add_argument('--output', '-o', type=Path, help='Output JSON file path')
    arg_parser.add_argument('--staff-json', type=Path,
                            help='Optional: Staff JSON file to enrich with URLs')
    arg_parser.add_argument('--enrich-output', type=Path,
                            help='Output path for enriched staff JSON')
    args = arg_parser.parse_args()

    if not args.html_file.exists():
        print(f"Error: HTML file not found: {args.html_file}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing HTML file: {args.html_file}")
    url_data = parse_html_file(args.html_file)

    # Report extraction statistics.
    stats = url_data['stats']
    print("\nExtraction Results:")
    print(f"  Total profiles found: {stats['total_profiles']}")
    print(f"  Clean slugs: {stats['clean_slugs']}")
    print(f"  ACo IDs: {stats['aco_ids']}")
    print(f"  Profiles with names: {stats['profiles_with_names']}")
    print(f"  Unique names found: {stats['unique_names_found']}")

    # Persist the raw URL extraction results when requested.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(url_data, f, indent=2, ensure_ascii=False)
        print(f"\nSaved URL data to: {args.output}")

    # Optional staff enrichment pass.
    if args.staff_json:
        if not args.staff_json.exists():
            print(f"Error: Staff JSON not found: {args.staff_json}", file=sys.stderr)
            sys.exit(1)

        print(f"\nMatching staff to URLs...")
        match_results = match_staff_to_urls(args.staff_json, url_data)

        match_stats = match_results['match_stats']
        print(f"\nMatching Results:")
        print(f"  Total staff: {match_stats['total_staff']}")
        print(f"  Matched: {match_stats['matched_count']} ({match_stats['match_rate']:.1%})")
        print(f"  Unmatched staff: {match_stats['unmatched_staff_count']}")
        print(f"  Unmatched URLs: {match_stats['unmatched_url_count']}")

        # Show a sample of staff that could not be matched.
        sample = match_results['unmatched_staff'][:5]
        if sample:
            print(f"\n  Sample unmatched staff:")
            for entry in sample:
                print(f"    - {entry.get('name')}")

        # Write the merged (matched + unmatched) staff list when requested.
        if args.enrich_output:
            with open(args.staff_json, 'r', encoding='utf-8') as f:
                original_data = json.load(f)
            enriched_data = {
                **original_data,
                'staff': match_results['matched'] + match_results['unmatched_staff'],
                'url_enrichment_stats': match_stats,
            }
            with open(args.enrich_output, 'w', encoding='utf-8') as f:
                json.dump(enriched_data, f, indent=2, ensure_ascii=False)
            print(f"\nSaved enriched staff data to: {args.enrich_output}")

    return 0
if __name__ == '__main__':
    sys.exit(main())