glam/scripts/batch_parse_linkedin_simple.py
2025-12-30 03:43:31 +01:00

130 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""
Simple batch processor for LinkedIn HTML files.
Runs parse_linkedin_html.py on all HTML files in manual directory.
Creates staff JSON files in bu/ directory.
"""
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional
# Directory paths
# Source: manually saved LinkedIn "People" HTML pages (external KINGSTON volume).
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# Destination for the generated per-institution staff JSON files.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/bu")
def extract_institution_name_from_filename(filename: str) -> Optional[str]:
    """Extract the institution name from a LinkedIn "People" page filename.

    Strips the trailing ``.html`` suffix, the browser-added
    ``"... People _ LinkedIn"`` title suffix, an unread-count prefix such as
    ``"(3) "``, stray leading commas/underscores, and collapses whitespace.

    Args:
        filename: Path or bare filename of a saved LinkedIn HTML page.

    Returns:
        The cleaned institution name, or None if nothing remains.
    """
    name = Path(filename).name
    # Remove only a trailing '.html' suffix. The previous
    # name.replace('.html', '') also deleted '.html' occurring mid-name.
    if name.endswith('.html'):
        name = name[:-len('.html')]
    name = re.sub(r'_?People _ LinkedIn$', '', name)  # browser title suffix
    name = re.sub(r'^\(\d+\)\s*', '', name)           # "(3) " unread-count prefix
    name = re.sub(r'^,\s*', '', name)                 # stray leading comma
    name = re.sub(r'\s+', ' ', name).strip()          # collapse whitespace
    name = name.strip('_')                            # leading/trailing underscores
    name = name.strip()
    return name if name else None
def generate_slug_from_name(name: str) -> str:
    """Turn an institution name into a lowercase, hyphen-separated slug.

    Args:
        name: Human-readable institution name.

    Returns:
        URL-friendly slug containing only [a-z0-9] and single hyphens.
    """
    lowered = name.lower()
    # Keep only letters, digits, whitespace and hyphens ...
    cleaned = re.sub(r'[^a-z0-9\s-]', '', lowered)
    # ... then squeeze each run of separators into a single hyphen.
    hyphenated = re.sub(r'[\s-]+', '-', cleaned)
    return hyphenated.strip('-')
def main():
    """Batch-process every HTML file in MANUAL_DIR.

    For each file: derive the institution name and slug from the filename,
    then run parse_linkedin_html.py in a subprocess to write a timestamped
    staff JSON file into OUTPUT_DIR. Per-file failures are counted, never
    fatal; progress is printed every 100 files and a summary at the end.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    html_files = sorted(MANUAL_DIR.glob("*.html"))
    print(f"Found {len(html_files)} HTML files")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"\nStarting processing...")
    print()
    stats = {
        'total': len(html_files),
        'processed': 0,
        'errors': 0,
        'total_staff': 0,
    }
    for i, html_file in enumerate(html_files, 1):
        print(f"[{i}/{len(html_files)}] {html_file.name}", end='\r')
        institution_name = extract_institution_name_from_filename(html_file.name)
        if not institution_name:
            stats['errors'] += 1
            continue
        slug = generate_slug_from_name(institution_name)
        # UTC timestamp in the filename keeps repeated runs from clobbering output.
        timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
        output_file = OUTPUT_DIR / f"{slug}_staff_{timestamp}.json"
        try:
            # List-form argv (shell=False) — no shell quoting issues with
            # institution names containing spaces/commas.
            result = subprocess.run(
                [
                    sys.executable,
                    "/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
                    str(html_file),
                    "--custodian-name", institution_name,
                    "--custodian-slug", slug,
                    "--output", str(output_file)
                ],
                capture_output=True,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                stats['processed'] += 1
                # Load result to get staff count
                try:
                    with open(output_file, 'r') as f:
                        data = json.load(f)
                    staff_count = len(data.get('staff', []))
                    stats['total_staff'] += staff_count
                except (OSError, ValueError):
                    # Staff count is best-effort: a missing or malformed JSON
                    # file must not abort the batch. (json.JSONDecodeError is
                    # a ValueError subclass.) Previously a bare `except:` that
                    # also swallowed KeyboardInterrupt/SystemExit.
                    pass
            else:
                stats['errors'] += 1
        except subprocess.TimeoutExpired:
            stats['errors'] += 1
        except Exception as e:
            print(f"ERROR: {e}", file=sys.stderr)
            stats['errors'] += 1
        if i % 100 == 0:
            print()
            print(f"Progress: {i}/{len(html_files)}")
            print(f"  Processed: {stats['processed']}, Errors: {stats['errors']}")
            print()
    print()
    print("="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"Total files: {stats['total']}")
    print(f"Processed: {stats['processed']}")
    print(f"Errors: {stats['errors']}")
    print(f"Total staff extracted: {stats['total_staff']}")
    print("="*60)
# Entry point: run the batch only when executed as a script, not on import.
if __name__ == '__main__':
    main()