glam/scripts/extract_about_page_data.py
2025-12-16 20:27:39 +01:00

277 lines
8.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Extract company metadata from LinkedIn About page HTML files.
Extracts:
- Website URL
- Industry
- Company size/employee count
- Headquarters location
- Description (placeholder — always emitted as null; extraction not yet implemented)
Usage:
python scripts/extract_about_page_data.py [--output-dir DIR]
"""
import argparse
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from html.parser import HTMLParser
def extract_custodian_name(filename: str) -> str | None:
"""Extract custodian name from LinkedIn About filename.
Filename format: "(N) Custodian Name_ About _ LinkedIn.html"
"""
match = re.match(r'^\(\d+\)\s*(.+?)_\s*About\s*_\s*LinkedIn\.html$', filename)
if match:
name = match.group(1).strip()
# Clean up underscores that LinkedIn uses instead of colons
name = name.replace('_ ', ': ').replace(' _', ':')
return name
return None
def generate_slug(name: str) -> str:
    """Return a URL-safe, ASCII-only slug derived from *name*.

    Lowercases the input, strips diacritics via NFD decomposition, then
    collapses every run of non-alphanumeric characters into a single hyphen.
    """
    decomposed = unicodedata.normalize('NFD', name.lower())
    # Drop the combining marks (Unicode category 'Mn') left by decomposition,
    # turning e.g. "é" into plain "e".
    ascii_only = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    hyphenated = re.sub(r'[^a-z0-9]+', '-', ascii_only)
    collapsed = re.sub(r'-+', '-', hyphenated)
    return collapsed.strip('-')
def is_valid_url(url: str) -> bool:
    """Return True when *url* looks like a clean, plausible http(s) URL.

    Rejects empty strings, URLs containing newline/carriage-return characters
    (real or as literal escape text), over-long URLs, and anything that is not
    a well-formed http(s) URL with a plausible TLD.
    """
    if not url:
        return False
    # Scraped HTML sometimes leaks literal "\n" escape text into URL strings,
    # so the two-character sequences are rejected alongside the real characters.
    for junk in ('\n', '\\n', '\r', '\\r'):
        if junk in url:
            return False
    # Anything this long is almost certainly tracking garbage, not a homepage.
    if len(url) > 150:
        return False
    well_formed = re.match(
        r'^https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(/[^\s<>"\']*)?$',
        url,
    )
    return well_formed is not None
def extract_website_url(html_content: str) -> str | None:
    """Extract the company website URL from About page HTML.

    Tries three strategies in order of reliability:
    1. href attributes in anchor tags pointing to external sites
    2. JSON patterns with website URLs ("websiteUrl", "website", ...)
    3. General URL extraction with strict validation (last resort)

    Returns the first acceptable URL with its trailing slash stripped,
    or None when nothing survives the filters.
    """
    # URLs to exclude (LinkedIn, CDN, social, analytics, etc.)
    exclude_patterns = [
        'linkedin', 'licdn', 'w3.org', 'bing.com', 'google.com',
        'facebook.com', 'twitter.com', 'instagram.com', 'youtube.com',
        'tiktok.com', 'pinterest.com', 'tumblr.com', 'reddit.com',
        'schema.org', 'cloudflare', 'analytics', 'tracking', 'leerob.com',
        'cdn.', '.svg', '.png', '.jpg', '.gif', '.css', '.js', '.woff',
        'fonts.googleapis', 'googletagmanager', 'doubleclick', 'adsense',
        'microsoft.com', 'apple.com', 'amazon.com', 'github.com',
        'openai.com', 'anthropic.com',
    ]

    def is_excluded(url: str) -> bool:
        url_lower = url.lower()
        return any(pattern in url_lower for pattern in exclude_patterns)

    def accept(url: str) -> str | None:
        """Return the cleaned URL when it passes all filters, else None."""
        if is_excluded(url):
            return None
        # Clean trailing slash for consistency across sources.
        url = url.rstrip('/')
        return url if is_valid_url(url) else None

    # Strategy 1: href attributes in anchor tags (most reliable).
    for url in re.findall(r'href="(https?://[^"]+)"', html_content):
        cleaned = accept(url)
        if cleaned:
            return cleaned

    # Strategy 2: JSON patterns like "websiteUrl":"..." or "website":"...".
    json_url_patterns = [
        r'"websiteUrl"\s*:\s*"(https?://[^"]+)"',
        r'"website"\s*:\s*"(https?://[^"]+)"',
        r'"companyUrl"\s*:\s*"(https?://[^"]+)"',
        r'"homepageUrl"\s*:\s*"(https?://[^"]+)"',
    ]
    for pattern in json_url_patterns:
        for url in re.findall(pattern, html_content):
            cleaned = accept(url)
            if cleaned:
                return cleaned

    # Strategy 3: general URL extraction (last resort, strict validation).
    url_pattern = re.compile(
        r'https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?'
    )
    for url in url_pattern.findall(html_content):
        # Skip direct links to static assets.
        if url.lower().endswith(
            ('.svg', '.png', '.jpg', '.jpeg', '.gif', '.css', '.js', '.woff', '.woff2')
        ):
            continue
        cleaned = accept(url)
        if cleaned:
            # NOTE: the previous version collected all candidates and then ran
            # a "first unique URL" loop over a fresh `seen` set — which always
            # returned the first candidate anyway. Returning directly here is
            # the same behavior without the dead dedup code.
            return cleaned

    return None
def extract_about_data(html_file: Path) -> dict[str, Any] | None:
    """Extract metadata from a LinkedIn About page HTML file.

    Returns a record dict with the custodian name/slug, any metadata found
    in the page's embedded JSON, and source provenance — or None when the
    filename does not parse or the file cannot be read.
    """
    custodian_name = extract_custodian_name(html_file.name)
    if custodian_name is None:
        return None

    try:
        with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()
    except Exception as e:
        print(f"Error reading {html_file}: {e}")
        return None

    def first_group(pattern: str) -> str | None:
        """First capture group of *pattern* in the page, or None."""
        m = re.search(pattern, html_content)
        return m.group(1) if m else None

    # Metadata embedded as JSON key/value pairs in the page, when present.
    industry = first_group(r'"industry"[:\s]*"([^"]+)"')
    headquarters = first_group(r'"headquarters"[:\s]*"([^"]+)"')
    raw_count = first_group(r'"employeeCount"[:\s]*"?(\d+)')
    employee_count = int(raw_count) if raw_count is not None else None

    return {
        'custodian_name': custodian_name,
        'custodian_slug': generate_slug(custodian_name),
        'website_url': extract_website_url(html_content),
        'industry': industry,
        'employee_count': employee_count,
        'headquarters': headquarters,
        # Description extraction is not implemented; key kept for schema parity.
        'description': None,
        'source_metadata': {
            'source_type': 'linkedin_company_about_page_html',
            'source_file': html_file.name,
            'extraction_timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        },
    }
def main():
    """CLI entry point: scan the manual dir, extract metadata, dump JSON.

    Returns 0 on success; exits 1 when the input directory is missing.
    """
    parser = argparse.ArgumentParser(
        description='Extract metadata from LinkedIn About page HTML files'
    )
    parser.add_argument(
        '--manual-dir',
        type=Path,
        default=Path('data/custodian/person/affiliated/manual'),
        help='Directory containing LinkedIn HTML files',
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        default=Path('data/custodian/person/affiliated/about_data'),
        help='Directory to save extracted data',
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of files to process',
    )
    args = parser.parse_args()

    if not args.manual_dir.exists():
        print(f"Error: Manual directory not found: {args.manual_dir}", file=sys.stderr)
        sys.exit(1)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Find all About page HTML files
    about_files = sorted(args.manual_dir.glob('*About*LinkedIn.html'))
    print(f"Found {len(about_files)} About page HTML files")
    if args.limit:
        about_files = about_files[:args.limit]
        print(f"Limited to {len(about_files)} files")

    results = []
    websites_found = 0
    for html_file in about_files:
        data = extract_about_data(html_file)
        if not data:
            # Filename did not parse or the file was unreadable; skip silently
            # (extract_about_data already printed any read error).
            continue
        results.append(data)
        if data['website_url']:
            websites_found += 1
            print(f"{data['custodian_name']}: {data['website_url']}")
        else:
            print(f"{data['custodian_name']}: No website found")

    # Save all results
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = args.output_dir / f'about_data_{timestamp}.json'
    payload = {
        'timestamp': timestamp,
        'total_processed': len(results),
        'websites_found': websites_found,
        'data': results,
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*60}")
    print(f"EXTRACTION COMPLETE")
    print(f" Total processed: {len(results)}")
    print(f" Websites found: {websites_found}")
    print(f" Output: {output_file}")
    return 0
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == '__main__':
    sys.exit(main())