#!/usr/bin/env python3
"""
Extract company metadata from LinkedIn About page HTML files.

Extracts:
- Website URL
- Industry
- Company size/employee count
- Headquarters location
- Description

Usage:
    python scripts/extract_about_page_data.py [--output-dir DIR]
"""

import argparse
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from html.parser import HTMLParser
from pathlib import Path
from typing import Any


def extract_custodian_name(filename: str) -> str | None:
|
|
"""Extract custodian name from LinkedIn About filename.
|
|
|
|
Filename format: "(N) Custodian Name_ About _ LinkedIn.html"
|
|
"""
|
|
match = re.match(r'^\(\d+\)\s*(.+?)_\s*About\s*_\s*LinkedIn\.html$', filename)
|
|
if match:
|
|
name = match.group(1).strip()
|
|
# Clean up underscores that LinkedIn uses instead of colons
|
|
name = name.replace('_ ', ': ').replace(' _', ':')
|
|
return name
|
|
return None
|
|
|
|
|
|
def generate_slug(name: str) -> str:
    """Return a lowercase, ASCII-only, hyphen-separated slug for *name*."""
    # Decompose accented characters, then drop the combining marks (Mn)
    # so "é" becomes plain "e".
    decomposed = unicodedata.normalize('NFD', name.lower())
    ascii_only = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    # Collapse every run of non-alphanumerics into a single hyphen, then
    # trim hyphens from both ends.
    hyphenated = re.sub(r'[^a-z0-9]+', '-', ascii_only)
    hyphenated = re.sub(r'-+', '-', hyphenated)
    return hyphenated.strip('-')


def is_valid_url(url: str) -> bool:
    """Validate that *url* is a plausible, well-formed http(s) URL."""
    if not url:
        return False
    # Literal newlines/CRs — or their escaped textual forms — indicate
    # scraped junk rather than a real link.
    for marker in ('\n', '\\n', '\r', '\\r'):
        if marker in url:
            return False
    # Anything this long is almost certainly not a company homepage.
    if len(url) > 150:
        return False
    # Require scheme + plausible hostname with a real TLD, optionally
    # followed by a path free of whitespace and markup characters.
    return bool(re.match(r'^https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(/[^\s<>"\']*)?$', url))


def extract_website_url(html_content: str) -> str | None:
    """Extract the company website URL from About page HTML.

    Tries three strategies, from most to least reliable:

    1. href attributes in anchor tags pointing to external sites
    2. JSON fields ("websiteUrl", "website", ...) embedded in the page
    3. Any URL-shaped text in the document, with strict validation

    Parameters:
        html_content: Raw HTML of the LinkedIn About page.

    Returns:
        The first URL that passes the exclusion list and is_valid_url(),
        with any trailing slash removed, or None when nothing plausible
        is found.
    """
    # Substrings that mark a URL as definitely NOT the company homepage:
    # LinkedIn itself, CDNs, social networks, analytics, media assets, etc.
    exclude_patterns = [
        'linkedin', 'licdn', 'w3.org', 'bing.com', 'google.com',
        'facebook.com', 'twitter.com', 'instagram.com', 'youtube.com',
        'tiktok.com', 'pinterest.com', 'tumblr.com', 'reddit.com',
        'schema.org', 'cloudflare', 'analytics', 'tracking', 'leerob.com',
        'cdn.', '.svg', '.png', '.jpg', '.gif', '.css', '.js', '.woff',
        'fonts.googleapis', 'googletagmanager', 'doubleclick', 'adsense',
        'microsoft.com', 'apple.com', 'amazon.com', 'github.com',
        'openai.com', 'anthropic.com',
    ]

    def is_excluded(url: str) -> bool:
        url_lower = url.lower()
        return any(pattern in url_lower for pattern in exclude_patterns)

    def first_valid(urls) -> str | None:
        # Shared filter for all three strategies: skip excluded URLs,
        # strip the trailing slash for consistency, and return the first
        # candidate that validates.
        for url in urls:
            if is_excluded(url):
                continue
            url = url.rstrip('/')
            if is_valid_url(url):
                return url
        return None

    # Strategy 1: href attributes in anchor tags (most reliable).
    href_matches = re.findall(r'href="(https?://[^"]+)"', html_content)
    found = first_valid(href_matches)
    if found:
        return found

    # Strategy 2: JSON payload fields like "websiteUrl":"https://...".
    json_url_patterns = [
        r'"websiteUrl"\s*:\s*"(https?://[^"]+)"',
        r'"website"\s*:\s*"(https?://[^"]+)"',
        r'"companyUrl"\s*:\s*"(https?://[^"]+)"',
        r'"homepageUrl"\s*:\s*"(https?://[^"]+)"',
    ]
    for pattern in json_url_patterns:
        found = first_valid(re.findall(pattern, html_content))
        if found:
            return found

    # Strategy 3: any URL-shaped text (last resort, strictly validated).
    url_pattern = re.compile(r'https?://[a-zA-Z0-9][a-zA-Z0-9._-]*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?')
    media_suffixes = ('.svg', '.png', '.jpg', '.jpeg', '.gif', '.css', '.js', '.woff', '.woff2')
    candidates = (
        url for url in url_pattern.findall(html_content)
        # Media assets are never the company homepage.
        if not url.lower().endswith(media_suffixes)
    )
    # BUG FIX: the previous "first unique URL" loop maintained a `seen`
    # set that could never skip anything — on the first iteration the URL
    # was always unseen and immediately returned. The set was dead code;
    # the actual behavior (return the first acceptable candidate) is
    # preserved here without the misleading bookkeeping.
    return first_valid(candidates)


def extract_about_data(html_file: Path) -> dict[str, Any] | None:
    """Extract metadata from a LinkedIn About page HTML file.

    Parameters:
        html_file: Path to a saved "(N) Name_ About _ LinkedIn.html" page.

    Returns:
        A dict with custodian identity, website, industry, employee
        count, headquarters, description, and source metadata — or None
        when the filename is not a recognizable About page or the file
        cannot be read.
    """
    custodian_name = extract_custodian_name(html_file.name)
    if not custodian_name:
        return None

    try:
        with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()
    except OSError as e:
        # Narrowed from a blanket `except Exception`: only I/O errors are
        # expected here (decode errors are suppressed by errors='ignore').
        print(f"Error reading {html_file}: {e}")
        return None

    slug = generate_slug(custodian_name)
    website_url = extract_website_url(html_content)

    # Structured fields LinkedIn embeds as JSON in the page payload.
    industry = None
    employee_count = None
    headquarters = None
    description = None

    industry_match = re.search(r'"industry"[:\s]*"([^"]+)"', html_content)
    if industry_match:
        industry = industry_match.group(1)

    employee_match = re.search(r'"employeeCount"[:\s]*"?(\d+)', html_content)
    if employee_match:
        employee_count = int(employee_match.group(1))

    hq_match = re.search(r'"headquarters"[:\s]*"([^"]+)"', html_content)
    if hq_match:
        headquarters = hq_match.group(1)

    # BUG FIX: `description` was initialized (and promised by the module
    # docstring) but never extracted, so it was always None in the output.
    # Pull it from the standard meta description tag, falling back to the
    # OpenGraph variant.
    # NOTE(review): assumes name=/property= appears before content= inside
    # the tag — confirm against saved pages if descriptions come back empty.
    for desc_pattern in (
        r'<meta[^>]*\bname="description"[^>]*\bcontent="([^"]*)"',
        r'<meta[^>]*\bproperty="og:description"[^>]*\bcontent="([^"]*)"',
    ):
        desc_match = re.search(desc_pattern, html_content)
        if desc_match and desc_match.group(1).strip():
            description = desc_match.group(1).strip()
            break

    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    return {
        'custodian_name': custodian_name,
        'custodian_slug': slug,
        'website_url': website_url,
        'industry': industry,
        'employee_count': employee_count,
        'headquarters': headquarters,
        'description': description,
        'source_metadata': {
            'source_type': 'linkedin_company_about_page_html',
            'source_file': html_file.name,
            'extraction_timestamp': timestamp,
        }
    }


def main():
    """CLI entry point: extract metadata from every About page HTML file.

    Reads LinkedIn About pages from --manual-dir, writes a single
    timestamped JSON report into --output-dir, prints a per-file progress
    line plus a summary, and returns 0 (used as the process exit code).
    Exits with status 1 if --manual-dir does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Extract metadata from LinkedIn About page HTML files'
    )
    parser.add_argument(
        '--manual-dir',
        type=Path,
        default=Path('data/custodian/person/affiliated/manual'),
        help='Directory containing LinkedIn HTML files'
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        default=Path('data/custodian/person/affiliated/about_data'),
        help='Directory to save extracted data'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of files to process'
    )

    args = parser.parse_args()

    if not args.manual_dir.exists():
        print(f"Error: Manual directory not found: {args.manual_dir}", file=sys.stderr)
        sys.exit(1)

    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Find all About page HTML files; sorted for deterministic ordering.
    about_files = sorted(args.manual_dir.glob('*About*LinkedIn.html'))
    print(f"Found {len(about_files)} About page HTML files")

    # BUG FIX: compare against None so an explicit `--limit 0` is honored
    # (the previous truthiness test silently ignored a zero limit).
    if args.limit is not None:
        about_files = about_files[:args.limit]
        print(f"Limited to {len(about_files)} files")

    results = []
    websites_found = 0

    for html_file in about_files:
        data = extract_about_data(html_file)
        if data is None:
            # Unparseable filename or unreadable file — already reported.
            continue
        results.append(data)
        if data['website_url']:
            websites_found += 1
            print(f"✓ {data['custodian_name']}: {data['website_url']}")
        else:
            print(f"✗ {data['custodian_name']}: No website found")

    # Save all results in one timestamped report.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = args.output_dir / f'about_data_{timestamp}.json'

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': timestamp,
            'total_processed': len(results),
            'websites_found': websites_found,
            'data': results,
        }, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*60}")
    # Was a placeholder-less f-string; a plain literal says the same thing.
    print("EXTRACTION COMPLETE")
    print(f"  Total processed: {len(results)}")
    print(f"  Websites found: {websites_found}")
    print(f"  Output: {output_file}")

    return 0


# Script entry point: run main() and propagate its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())