#!/usr/bin/env python3
|
|
"""
|
|
Fast LinkedIn H1 Name Extraction
|
|
|
|
This is a FAST version that:
|
|
1. Extracts H1 institution names from HTML files
|
|
2. Cleans filenames properly (removes macOS resource forks, periods, parentheses)
|
|
3. Creates custodian YAML files with basic metadata
|
|
4. Does NOT extract detailed staff (too slow for 3335 files)
|
|
|
|
This solves the critical issues:
|
|
- Name extraction from H1 tags (not filenames)
|
|
- Proper filename cleaning
|
|
|
|
Usage:
|
|
python scripts/linkedin_h1_fast.py \
|
|
--input-dir /path/to/html/files \
|
|
--output-dir data/custodian/
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
|
|
def clean_filename_to_slug(filename: str) -> str:
    """
    Turn a raw LinkedIn HTML filename into a URL-safe slug.

    Strips, in order:
    - the " People _ LinkedIn.html" / ".html" suffixes
    - a macOS resource-fork prefix ("._")
    - a leading "(N)" download counter, optionally decorated with "." / "_"
    - surrounding underscores and spaces

    then lowercases and hyphenates whatever remains.

    Examples:
        "._(15) Gemeente Enkhuizen_ People _ LinkedIn.html"
            -> "gemeente-enkhuizen"
        "(7) ADVN _ archief voor nationale bewegingen_ People _ LinkedIn.html"
            -> "advn-archief-voor-nationale-bewegingen"
    """
    # Drop the LinkedIn page suffixes.
    base = filename.replace(' People _ LinkedIn.html', '').replace('.html', '')

    # Drop a macOS resource-fork prefix when present.
    base = base[2:] if base.startswith('._') else base

    # Drop a leading "(N)" counter, with any "."/"_" decoration before it.
    base = re.sub(r'^\.?\_?\(\d+\)\s*', '', base)
    base = re.sub(r'^\._*\(\d+\)\s*', '', base)

    # Trim stray underscores/spaces, collapse internal whitespace runs.
    base = re.sub(r'\s+', ' ', base.strip('_ '))

    # Lowercase; replace every non-alphanumeric run with a single hyphen.
    slug = re.sub(r'[^a-z0-9]+', '-', base.lower())
    return re.sub(r'-+', '-', slug).strip('-')
|
|
|
|
|
|
def extract_h1_name(html_content: str) -> Optional[str]:
    """
    Pull the institution name out of the page's first H1 tag.

    LinkedIn renders the H1 as "Organization Name | LinkedIn"; the part
    before the " | " separator is the name we want.

    Returns None when no H1 is present or its text is empty.
    """
    heading = BeautifulSoup(html_content, 'html.parser').find('h1')
    if heading is None:
        return None

    text = heading.get_text().strip()

    # Keep only the part before " | LinkedIn" when the separator exists.
    name = text.split(' | ')[0].strip() if ' | ' in text else text

    # Normalize any leftover pipes and whitespace runs into single spaces.
    name = re.sub(r'\s*\|\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)

    return name or None
|
|
|
|
|
|
def process_single_file(html_path: Path, output_dir: Path) -> dict:
    """
    Process a single LinkedIn "People" HTML page into a custodian record.

    The institution name is taken from the page's H1 tag when possible,
    otherwise derived from the (cleaned) filename. A handful of cheap
    regex probes collect follower / member / profile-card counts; no full
    DOM traversal is performed (this is the fast path).

    Args:
        html_path: Path of the saved LinkedIn HTML file to read.
        output_dir: Destination directory. Unused here — the caller writes
            the YAML file — kept for interface compatibility.

    Returns:
        On success: {'status': 'success', 'slug', 'filename',
        'custodian_name', 'custodian_data'}.
        On read failure: {'status': 'error', 'filename', 'error'}.
    """
    try:
        with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except OSError as e:
        # BUG FIX: previously a read failure raised out of this function,
        # so the caller's 'error'-result branch was unreachable. Report it
        # as a structured error result instead.
        return {'status': 'error', 'filename': html_path.name, 'error': str(e)}

    # Prefer the name embedded in the page itself.
    h1_name = extract_h1_name(html_content)
    name_from_h1 = bool(h1_name)

    if not name_from_h1:
        # Fallback: derive a readable name from the filename.
        h1_name = _fallback_name_from_filename(html_path.name)

    # URL-safe slug used by the caller for the output YAML filename.
    slug = clean_filename_to_slug(html_path.name)

    # Cheap metadata probes (plain regex; no DOM walk for speed).
    follower_count = ''
    follower_match = re.search(r'(\d+K?)\s+followers?', html_content, re.IGNORECASE)
    if follower_match:
        follower_count = follower_match.group(1)

    associated_members = 0
    member_match = re.search(r'(\d+)\s+associated\s+members?', html_content, re.IGNORECASE)
    if member_match:
        associated_members = int(member_match.group(1))

    # Rough staff estimate: count profile-card markers in the markup.
    profile_count = len(re.findall(r'org-people-profile-card', html_content))

    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # BUG FIX: the original dict literal declared 'custodian_name' twice
    # (first as a plain string, then as a mapping) — Python silently kept
    # only the mapping, so the duplicate is removed. 'name_source' and
    # 'name_verified' were also hard-coded to the H1 values even when the
    # filename fallback was used, which made the caller's
    # H1-vs-filename statistics meaningless; they now reflect reality.
    custodian_data = {
        'ghcid_current': f"NL-XX-XXX-PENDING-{slug.upper()}",
        'institution_type': 'MUSEUM',  # Default, can be refined later
        'custodian_name': {
            'emic_name': h1_name,
            'english_name': None,
            'name_verified': name_from_h1,
            'name_source': 'linkedin_html_h1' if name_from_h1 else 'filename_fallback',
        },
        'linkedin_enrichment': {
            'source_file': html_path.name,
            'extraction_date': timestamp,
            'follower_count': follower_count,
            'associated_members': associated_members,
            'profile_cards_detected': profile_count,
            'source_type': 'linkedin_company_people_page_html',
            'extraction_method': 'h1_name_extraction_only',
        },
        'provenance': {
            'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': timestamp,
            'extraction_method': 'Fast H1 name extraction',
            'confidence_score': 0.90,
            'notes': f'H1 institution name extracted from HTML. Profile cards detected: {profile_count}. Detailed staff extraction not performed due to performance constraints.',
        },
    }

    return {
        'status': 'success',
        'slug': slug,
        'filename': html_path.name,
        'custodian_name': h1_name,
        'custodian_data': custodian_data,
    }


def _fallback_name_from_filename(filename: str) -> str:
    """
    Derive a human-readable institution name from an HTML filename.

    Mirrors the prefix/suffix stripping done by clean_filename_to_slug but
    keeps the original casing and spacing (no slugification).
    """
    name = filename.replace(' People _ LinkedIn.html', '')
    name = name.replace('.html', '')
    if name.startswith('._'):
        name = name[2:]
    name = re.sub(r'^\.?\_?\(\d+\)\s*', '', name)
    name = re.sub(r'^\._*\(\d+\)\s*', '', name)
    return re.sub(r'\s+', ' ', name).strip()
|
|
|
|
|
|
def main():
    """
    CLI entry point: scan a directory of LinkedIn HTML files, extract the
    H1 institution name for each, write one custodian YAML per file into
    --output-dir, and save a JSON processing report.

    Returns 0 on completion; per-file failures are counted, not fatal.
    """
    parser = argparse.ArgumentParser(
        description='Fast LinkedIn H1 name extraction - solves name extraction issues'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                        help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                        help='Output directory for custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit processing to first N files (0 = all)')

    args = parser.parse_args()

    # Validate the input directory before doing any work.
    if not args.input_dir.exists():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Get all HTML files (sorted for a deterministic processing order).
    html_files = sorted(args.input_dir.glob('*.html'))

    if args.limit > 0:
        html_files = html_files[:args.limit]

    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"This will extract H1 names and create custodian YAMLs")
    # NOTE(review): estimate assumes ~1 second per file — confirm.
    print(f"Estimated time: ~{len(html_files)} seconds (~{len(html_files)//60} minutes)")

    # Statistics accumulated across the whole run.
    stats = {
        'total': len(html_files),
        'success': 0,
        'errors': 0,
        'name_from_h1': 0,        # name taken from the HTML H1 tag
        'name_from_filename': 0,  # name derived from the filename fallback
        'with_profiles': 0,       # NOTE(review): never incremented below — dead stat?
        'total_profiles_detected': 0,
    }

    # Process files one by one; a single bad file must not abort the batch.
    for i, html_path in enumerate(html_files, 1):
        try:
            # Lightweight progress indicator (overwrites the same line).
            if i % 100 == 0:
                print(f"Progress: [{i}/{len(html_files)}]", end='\r')

            result = process_single_file(html_path, args.output_dir)

            if result['status'] == 'success':
                stats['success'] += 1
                stats['total_profiles_detected'] += result['custodian_data'].get('linkedin_enrichment', {}).get('profile_cards_detected', 0)

                # Save custodian YAML (one file per institution, keyed by slug;
                # same-slug files overwrite each other).
                custodian_file = args.output_dir / f"{result['slug']}.yaml"
                with open(custodian_file, 'w', encoding='utf-8') as f:
                    yaml.dump(result['custodian_data'], f, allow_unicode=True, default_flow_style=False, sort_keys=False)

                # Track name source
                if 'linkedin_html_h1' in result['custodian_data'].get('custodian_name', {}).get('name_source', ''):
                    stats['name_from_h1'] += 1
                else:
                    stats['name_from_filename'] += 1

            elif result['status'] == 'error':
                stats['errors'] += 1
                print(f"Error: {result['filename']}: {result.get('error')}", file=sys.stderr)

        except Exception as e:
            # Catch-all keeps the batch running on unexpected per-file errors.
            stats['errors'] += 1
            print(f"Error: {html_path.name}: {e}", file=sys.stderr)

    print(f"\nProcessing complete!")

    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print(f"\nStatistics:")
    print(f" Total HTML files: {stats['total']}")
    print(f" Successfully processed: {stats['success']}")
    print(f" Errors: {stats['errors']}")
    print(f" Names from H1: {stats['name_from_h1']}")
    print(f" Names from filename: {stats['name_from_filename']}")
    print(f" Total profiles detected: {stats['total_profiles_detected']}")
    print(f"\nOutput directory: {args.output_dir}")

    # Save processing report (machine-readable summary of this run).
    report = {
        'processing_date': datetime.now(timezone.utc).isoformat(),
        'input_directory': str(args.input_dir),
        'output_directory': str(args.output_dir),
        'statistics': stats,
    }

    # NOTE(review): report path is relative to the current working directory,
    # not to --output-dir — confirm that is intentional.
    report_file = Path('reports/linkedin_h1_fast_report.json')
    report_file.parent.mkdir(parents=True, exist_ok=True)
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {report_file}")

    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return code as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|