glam/scripts/linkedin_h1_fast.py
2025-12-30 03:43:31 +01:00

308 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Fast LinkedIn H1 Name Extraction
This is a FAST version that:
1. Extracts H1 institution names from HTML files
2. Cleans filenames properly (removes macOS resource forks, periods, parentheses)
3. Creates custodian YAML files with basic metadata
4. Does NOT extract detailed staff (too slow for 3335 files)
This solves the critical issues:
- Name extraction from H1 tags (not filenames)
- Proper filename cleaning
Usage:
python scripts/linkedin_h1_fast.py \
--input-dir /path/to/html/files \
--output-dir data/custodian/
"""
# Standard-library imports.
import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Third-party imports: fail fast with an actionable install hint rather
# than a bare ImportError traceback, since this is an operator-run script.
try:
    from bs4 import BeautifulSoup
except ImportError:
    print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
    sys.exit(1)
try:
    import yaml
except ImportError:
    print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
    sys.exit(1)
def clean_filename_to_slug(filename: str) -> str:
    """
    Clean a LinkedIn HTML filename into a URL-safe slug.

    Handles:
    - macOS resource fork prefixes ("._")
    - leading download counters in parentheses, optionally preceded by any
      run of periods/underscores: "(7)", ".(15)", "._(15)", "__(15)"
    - extra spaces and underscores
    - the " People _ LinkedIn.html" suffix

    Examples:
    "._(15) Gemeente Enkhuizen_ People _ LinkedIn.html"
    -> "gemeente-enkhuizen"
    "(7) ADVN _ archief voor nationale bewegingen_ People _ LinkedIn.html"
    -> "advn-archief-voor-nationale-bewegingen"
    """
    # Strip the LinkedIn export suffix and the extension.
    name = filename.replace(' People _ LinkedIn.html', '')
    name = name.replace('.html', '')
    # Strip the macOS resource fork prefix (._).
    if name.startswith('._'):
        name = name[2:]
    # Strip a leading "(NN)" download counter together with any run of
    # periods/underscores before it. This single pattern subsumes the two
    # overlapping regexes previously used (r'^\.?\_?\(\d+\)\s*' and
    # r'^\._*\(\d+\)\s*') and also covers prefixes like "__(15)".
    name = re.sub(r'^[._]*\(\d+\)\s*', '', name)
    # Trim stray separators and collapse internal whitespace.
    name = name.strip('_ ')
    name = re.sub(r'\s+', ' ', name)
    # Lowercase, then replace every run of non-alphanumerics with a single
    # hyphen; strip hyphens that end up at the edges.
    slug = re.sub(r'[^a-z0-9]+', '-', name.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')
    return slug
def extract_h1_name(html_content: str) -> Optional[str]:
    """
    Pull the institution name out of the page's first H1 tag.

    LinkedIn renders the H1 as "Organization Name | LinkedIn"; only the
    part before the pipe separator is wanted. Returns None when no H1 is
    present or its text is empty.
    """
    heading = BeautifulSoup(html_content, 'html.parser').find('h1')
    if heading is None:
        return None
    raw_text = heading.get_text().strip()
    # Keep only the part before " | LinkedIn" when the separator exists.
    name = raw_text.split(' | ')[0].strip() if ' | ' in raw_text else raw_text
    # Normalize any leftover pipes and whitespace runs to single spaces.
    name = re.sub(r'\s*\|\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    return name or None
def process_single_file(html_path: Path, output_dir: Path) -> dict:
    """
    Process a single LinkedIn "People" page HTML file.

    Extracts the institution name from the H1 tag (falling back to the
    cleaned filename), scrapes lightweight metadata (follower count,
    associated members, profile-card count), and returns a result dict
    containing the custodian data ready to be dumped as YAML.

    Args:
        html_path: Path to the LinkedIn HTML export.
        output_dir: Unused here; kept for interface compatibility with the
            caller (which writes the YAML file itself).

    Returns:
        dict with keys: status, slug, filename, custodian_name,
        custodian_data.
    """
    # Read HTML; errors='replace' tolerates mis-encoded exports.
    with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
        html_content = f.read()

    # Preferred source: the page's H1 tag.
    h1_name = extract_h1_name(html_content)
    name_from_h1 = bool(h1_name)
    if not name_from_h1:
        # Fallback: derive a readable name from the filename, using the
        # same cleanup steps as clean_filename_to_slug (minus slugging).
        filename_clean = html_path.name.replace(' People _ LinkedIn.html', '')
        filename_clean = filename_clean.replace('.html', '')
        if filename_clean.startswith('._'):
            filename_clean = filename_clean[2:]
        # Strip a leading "(NN)" counter with any ./_ run before it.
        filename_clean = re.sub(r'^[._]*\(\d+\)\s*', '', filename_clean)
        filename_clean = re.sub(r'\s+', ' ', filename_clean).strip()
        h1_name = filename_clean

    # Generate slug from the original filename.
    slug = clean_filename_to_slug(html_path.name)

    # Lightweight metadata scraped with regexes (no DOM walk, for speed).
    # Look for follower count (e.g. "86K followers").
    follower_count = ''
    follower_match = re.search(r'(\d+K?)\s+followers?', html_content, re.IGNORECASE)
    if follower_match:
        follower_count = follower_match.group(1)

    # Look for associated members (e.g. "120 associated members").
    associated_members = 0
    member_match = re.search(r'(\d+)\s+associated\s+members?', html_content, re.IGNORECASE)
    if member_match:
        associated_members = int(member_match.group(1))

    # Rough staff count: occurrences of LinkedIn's profile-card CSS class.
    profile_count = len(re.findall(r'org-people-profile-card', html_content))

    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    custodian_data = {
        'ghcid_current': f"NL-XX-XXX-PENDING-{slug.upper()}",
        'institution_type': 'MUSEUM',  # Default, can be refined later
        # BUG FIX: this dict previously defined 'custodian_name' twice — a
        # bare string immediately overwritten by the mapping below. Only
        # the nested mapping is kept (it is what main() reads).
        'custodian_name': {
            'emic_name': h1_name,
            'english_name': None,
            # Record the real provenance: previously name_source was
            # hard-coded to 'linkedin_html_h1' even when the filename
            # fallback was used, which skewed main()'s statistics.
            'name_verified': name_from_h1,
            'name_source': 'linkedin_html_h1' if name_from_h1 else 'filename_fallback',
        },
        'linkedin_enrichment': {
            'source_file': html_path.name,
            'extraction_date': timestamp,
            'follower_count': follower_count,
            'associated_members': associated_members,
            'profile_cards_detected': profile_count,
            'source_type': 'linkedin_company_people_page_html',
            'extraction_method': 'h1_name_extraction_only',
        },
        'provenance': {
            'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': timestamp,
            'extraction_method': 'Fast H1 name extraction',
            'confidence_score': 0.90,
            'notes': f'H1 institution name extracted from HTML. Profile cards detected: {profile_count}. Detailed staff extraction not performed due to performance constraints.',
        },
    }
    return {
        'status': 'success',
        'slug': slug,
        'filename': html_path.name,
        'custodian_name': h1_name,
        'custodian_data': custodian_data,
    }
def main():
    """CLI entry point.

    Walks --input-dir for *.html files, writes one custodian YAML per file
    into --output-dir, prints summary statistics, and saves a JSON report
    to reports/linkedin_h1_fast_report.json. Per-file errors are counted
    and logged, not fatal. Returns 0 on completion.
    """
    parser = argparse.ArgumentParser(
        description='Fast LinkedIn H1 name extraction - solves name extraction issues'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                        help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                        help='Output directory for custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit processing to first N files (0 = all)')
    args = parser.parse_args()
    if not args.input_dir.exists():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)
    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)
    # Get all HTML files, sorted for a deterministic processing order
    html_files = sorted(args.input_dir.glob('*.html'))
    if args.limit > 0:
        html_files = html_files[:args.limit]
    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"This will extract H1 names and create custodian YAMLs")
    # Rough estimate assuming ~1 file/second
    print(f"Estimated time: ~{len(html_files)} seconds (~{len(html_files)//60} minutes)")
    # Statistics accumulated across all files
    stats = {
        'total': len(html_files),
        'success': 0,
        'errors': 0,
        'name_from_h1': 0,
        'name_from_filename': 0,
        'with_profiles': 0,
        'total_profiles_detected': 0,
    }
    # Process files one by one; a failure on one file never aborts the run
    for i, html_path in enumerate(html_files, 1):
        try:
            # Lightweight progress indicator every 100 files
            if i % 100 == 0:
                print(f"Progress: [{i}/{len(html_files)}]", end='\r')
            result = process_single_file(html_path, args.output_dir)
            if result['status'] == 'success':
                stats['success'] += 1
                stats['total_profiles_detected'] += result['custodian_data'].get('linkedin_enrichment', {}).get('profile_cards_detected', 0)
                # Save custodian YAML, one file per institution slug
                custodian_file = args.output_dir / f"{result['slug']}.yaml"
                with open(custodian_file, 'w', encoding='utf-8') as f:
                    yaml.dump(result['custodian_data'], f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                # Track name source (H1 tag vs filename fallback)
                if 'linkedin_html_h1' in result['custodian_data'].get('custodian_name', {}).get('name_source', ''):
                    stats['name_from_h1'] += 1
                else:
                    stats['name_from_filename'] += 1
            elif result['status'] == 'error':
                # NOTE(review): process_single_file as written never returns
                # status 'error'; failures surface via the except below, so
                # this branch appears unreachable — confirm before removing.
                stats['errors'] += 1
                print(f"Error: {result['filename']}: {result.get('error')}", file=sys.stderr)
        except Exception as e:
            # Catch-all so a single malformed file cannot kill the batch
            stats['errors'] += 1
            print(f"Error: {html_path.name}: {e}", file=sys.stderr)
    print(f"\nProcessing complete!")
    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print(f"\nStatistics:")
    print(f"  Total HTML files: {stats['total']}")
    print(f"  Successfully processed: {stats['success']}")
    print(f"  Errors: {stats['errors']}")
    print(f"  Names from H1: {stats['name_from_h1']}")
    print(f"  Names from filename: {stats['name_from_filename']}")
    print(f"  Total profiles detected: {stats['total_profiles_detected']}")
    print(f"\nOutput directory: {args.output_dir}")
    # Save processing report (path is relative to the current working
    # directory, not to --output-dir)
    report = {
        'processing_date': datetime.now(timezone.utc).isoformat(),
        'input_directory': str(args.input_dir),
        'output_directory': str(args.output_dir),
        'statistics': stats,
    }
    report_file = Path('reports/linkedin_h1_fast_report.json')
    report_file.parent.mkdir(parents=True, exist_ok=True)
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\nReport saved to: {report_file}")
    return 0
# Script entry point: propagate main()'s return value as the exit code.
if __name__ == '__main__':
    sys.exit(main())