#!/usr/bin/env python3 """ Extract company metadata from LinkedIn About page HTML files. Extracts: - Website URL - Industry - Company size/employee count - Headquarters location - Description Usage: python scripts/extract_about_page_data.py [--output-dir DIR] """ import argparse import json import re import sys import unicodedata from datetime import datetime, timezone from pathlib import Path from typing import Any from html.parser import HTMLParser def extract_custodian_name(filename: str) -> str | None: """Extract custodian name from LinkedIn About filename. Filename format: "(N) Custodian Name_ About _ LinkedIn.html" """ match = re.match(r'^\(\d+\)\s*(.+?)_\s*About\s*_\s*LinkedIn\.html$', filename) if match: name = match.group(1).strip() # Clean up underscores that LinkedIn uses instead of colons name = name.replace('_ ', ': ').replace(' _', ':') return name return None def generate_slug(name: str) -> str: """Generate URL-safe slug from custodian name.""" normalized = unicodedata.normalize('NFD', name.lower()) ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') slug = re.sub(r'[^a-z0-9]+', '-', ascii_name) slug = re.sub(r'-+', '-', slug).strip('-') return slug def is_valid_url(url: str) -> bool: """Validate URL is properly formatted and not garbage.""" if not url: return False # Reject URLs with newlines or escape sequences if '\n' in url or '\\n' in url or '\r' in url or '\\r' in url: return False # Reject suspiciously long URLs if len(url) > 150: return False # Must be a properly formatted URL with valid TLD if not re.match(r'^https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(/[^\s<>"\']*)?$', url): return False return True def extract_website_url(html_content: str) -> str | None: """Extract the company website URL from About page HTML. Uses multiple strategies: 1. Look for href in anchor tags pointing to external sites 2. Look for JSON patterns with website URLs 3. Fall back to general URL extraction with strict validation """ # URLs to exclude (LinkedIn, CDN, social, analytics, etc.) exclude_patterns = [ 'linkedin', 'licdn', 'w3.org', 'bing.com', 'google.com', 'facebook.com', 'twitter.com', 'instagram.com', 'youtube.com', 'tiktok.com', 'pinterest.com', 'tumblr.com', 'reddit.com', 'schema.org', 'cloudflare', 'analytics', 'tracking', 'leerob.com', 'cdn.', '.svg', '.png', '.jpg', '.gif', '.css', '.js', '.woff', 'fonts.googleapis', 'googletagmanager', 'doubleclick', 'adsense', 'microsoft.com', 'apple.com', 'amazon.com', 'github.com', 'openai.com', 'anthropic.com', ] def is_excluded(url: str) -> bool: url_lower = url.lower() return any(pattern in url_lower for pattern in exclude_patterns) # Strategy 1: Extract from href attributes in anchor tags (most reliable) # Pattern: href="http://www.example.com/" or href="https://example.org" href_pattern = re.compile(r'href="(https?://[^"]+)"') href_matches = href_pattern.findall(html_content) for url in href_matches: if is_excluded(url): continue # Clean trailing slash for consistency url = url.rstrip('/') if is_valid_url(url): return url # Strategy 2: Look for JSON patterns like "websiteUrl":"..." or "website":"..." json_url_patterns = [ r'"websiteUrl"\s*:\s*"(https?://[^"]+)"', r'"website"\s*:\s*"(https?://[^"]+)"', r'"companyUrl"\s*:\s*"(https?://[^"]+)"', r'"homepageUrl"\s*:\s*"(https?://[^"]+)"', ] for pattern in json_url_patterns: matches = re.findall(pattern, html_content) for url in matches: if is_excluded(url): continue url = url.rstrip('/') if is_valid_url(url): return url # Strategy 3: General URL extraction (last resort, with strict validation) url_pattern = re.compile(r'https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?') all_urls = url_pattern.findall(html_content) candidate_urls = [] for url in all_urls: if is_excluded(url): continue # Skip URLs ending with media extensions if url.lower().endswith(('.svg', '.png', '.jpg', '.jpeg', '.gif', '.css', '.js', '.woff', '.woff2')): continue url = url.rstrip('/') if is_valid_url(url): candidate_urls.append(url) if not candidate_urls: return None # Return the first unique URL (most likely the company website) seen = set() for url in candidate_urls: if url not in seen: seen.add(url) return url return None def extract_about_data(html_file: Path) -> dict[str, Any] | None: """Extract metadata from a LinkedIn About page HTML file.""" custodian_name = extract_custodian_name(html_file.name) if not custodian_name: return None try: with open(html_file, 'r', encoding='utf-8', errors='ignore') as f: html_content = f.read() except Exception as e: print(f"Error reading {html_file}: {e}") return None slug = generate_slug(custodian_name) website_url = extract_website_url(html_content) # Extract other metadata if present in JSON-LD or structured data industry = None employee_count = None headquarters = None description = None # Try to find industry from JSON patterns industry_match = re.search(r'"industry"[:\s]*"([^"]+)"', html_content) if industry_match: industry = industry_match.group(1) # Try to find employee count employee_match = re.search(r'"employeeCount"[:\s]*"?(\d+)', html_content) if employee_match: employee_count = int(employee_match.group(1)) # Try to find headquarters hq_match = re.search(r'"headquarters"[:\s]*"([^"]+)"', html_content) if hq_match: headquarters = hq_match.group(1) timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') return { 'custodian_name': custodian_name, 'custodian_slug': slug, 'website_url': website_url, 'industry': industry, 'employee_count': employee_count, 'headquarters': headquarters, 'description': description, 'source_metadata': { 'source_type': 'linkedin_company_about_page_html', 'source_file': html_file.name, 'extraction_timestamp': timestamp, } } def main(): parser = argparse.ArgumentParser( description='Extract metadata from LinkedIn About page HTML files' ) parser.add_argument( '--manual-dir', type=Path, default=Path('data/custodian/person/affiliated/manual'), help='Directory containing LinkedIn HTML files' ) parser.add_argument( '--output-dir', type=Path, default=Path('data/custodian/person/affiliated/about_data'), help='Directory to save extracted data' ) parser.add_argument( '--limit', '-l', type=int, default=None, help='Limit number of files to process' ) args = parser.parse_args() if not args.manual_dir.exists(): print(f"Error: Manual directory not found: {args.manual_dir}", file=sys.stderr) sys.exit(1) args.output_dir.mkdir(parents=True, exist_ok=True) # Find all About page HTML files about_files = sorted(args.manual_dir.glob('*About*LinkedIn.html')) print(f"Found {len(about_files)} About page HTML files") if args.limit: about_files = about_files[:args.limit] print(f"Limited to {len(about_files)} files") results = [] websites_found = 0 for html_file in about_files: data = extract_about_data(html_file) if data: results.append(data) if data['website_url']: websites_found += 1 print(f"✓ {data['custodian_name']}: {data['website_url']}") else: print(f"✗ {data['custodian_name']}: No website found") # Save all results timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ') output_file = args.output_dir / f'about_data_{timestamp}.json' with open(output_file, 'w', encoding='utf-8') as f: json.dump({ 'timestamp': timestamp, 'total_processed': len(results), 'websites_found': websites_found, 'data': results, }, f, indent=2, ensure_ascii=False) print(f"\n{'='*60}") print(f"EXTRACTION COMPLETE") print(f" Total processed: {len(results)}") print(f" Websites found: {websites_found}") print(f" Output: {output_file}") return 0 if __name__ == '__main__': sys.exit(main())