#!/usr/bin/env python3
"""
Simple batch processor for LinkedIn HTML files.

Runs parse_linkedin_html.py on all HTML files in the manual directory.
Creates staff JSON files in the bu/ directory.
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, Optional
|
|
|
|
|
|
# Directory paths: manually saved LinkedIn "People" pages go in, per-institution
# staff JSON files come out.
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/bu")
|
|
|
|
|
|
def extract_institution_name_from_filename(filename: str) -> Optional[str]:
    """Extract the institution name from a LinkedIn "People" page filename.

    Saved pages look like ``(3) British Museum_People _ LinkedIn.html``; this
    strips the browser duplicate counter, the LinkedIn page suffix, and stray
    punctuation, leaving the bare institution name.

    Args:
        filename: File name or path of the saved HTML page.

    Returns:
        The cleaned institution name, or None if nothing remains.
    """
    name = Path(filename).name
    # Strip only a trailing ".html": str.replace('.html', '') would also
    # delete the substring if it appeared mid-name.
    name = re.sub(r'\.html$', '', name)
    name = re.sub(r'_?People _ LinkedIn$', '', name)  # LinkedIn page suffix
    name = re.sub(r'^\(\d+\)\s*', '', name)           # "(3) " duplicate-download marker
    name = re.sub(r'^,\s*', '', name)                 # stray leading comma
    name = re.sub(r'\s+', ' ', name)                  # collapse runs of whitespace
    # Trim surrounding spaces, then underscores, then any spaces they exposed.
    name = name.strip().strip('_').strip()
    return name if name else None
|
|
|
|
|
|
def generate_slug_from_name(name: str) -> str:
    """Turn an institution name into a lowercase, hyphen-separated slug."""
    # Lowercase first, then drop everything except letters, digits,
    # whitespace, and hyphens.
    cleaned = re.sub(r'[^a-z0-9\s-]', '', name.lower())
    # Collapse each run of whitespace/hyphens into a single hyphen.
    hyphenated = re.sub(r'[\s-]+', '-', cleaned)
    return hyphenated.strip('-')
|
|
|
|
|
|
def main():
    """Run parse_linkedin_html.py over every HTML file in MANUAL_DIR.

    Writes one ``<slug>_staff_<timestamp>.json`` file per institution into
    OUTPUT_DIR and prints a running progress line plus a final summary of
    processed/error counts and total staff extracted.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    html_files = sorted(MANUAL_DIR.glob("*.html"))

    print(f"Found {len(html_files)} HTML files")
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nStarting processing...")
    print()

    stats = {
        'total': len(html_files),
        'processed': 0,
        'errors': 0,
        'total_staff': 0,
    }

    for i, html_file in enumerate(html_files, 1):
        # '\r' keeps the progress on one line, overwritten each iteration.
        print(f"[{i}/{len(html_files)}] {html_file.name}", end='\r')

        institution_name = extract_institution_name_from_filename(html_file.name)
        if not institution_name:
            stats['errors'] += 1
            continue

        slug = generate_slug_from_name(institution_name)
        # UTC timestamp keeps output filenames unique and lexically sortable.
        timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
        output_file = OUTPUT_DIR / f"{slug}_staff_{timestamp}.json"

        try:
            result = subprocess.run(
                [
                    sys.executable,
                    "/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
                    str(html_file),
                    "--custodian-name", institution_name,
                    "--custodian-slug", slug,
                    "--output", str(output_file)
                ],
                capture_output=True,
                text=True,
                timeout=30
            )

            if result.returncode == 0:
                stats['processed'] += 1
                # Best effort: read the parser's output back to count staff.
                # A missing or malformed file only skips the count -- it does
                # not count as a processing error.
                try:
                    with open(output_file, 'r') as f:
                        data = json.load(f)
                    stats['total_staff'] += len(data.get('staff', []))
                except (OSError, ValueError):
                    # ValueError covers json.JSONDecodeError; was a bare
                    # except, which also swallowed KeyboardInterrupt.
                    pass
            else:
                stats['errors'] += 1

        except subprocess.TimeoutExpired:
            stats['errors'] += 1
        except Exception as e:
            print(f"ERROR: {e}", file=sys.stderr)
            stats['errors'] += 1

        # Periodic checkpoint so long runs show cumulative numbers.
        if i % 100 == 0:
            print()
            print(f"Progress: {i}/{len(html_files)}")
            print(f"  Processed: {stats['processed']}, Errors: {stats['errors']}")
            print()

    print()
    print("="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"Total files: {stats['total']}")
    print(f"Processed: {stats['processed']}")
    print(f"Errors: {stats['errors']}")
    print(f"Total staff extracted: {stats['total_staff']}")
    print("="*60)
|
|
|
|
|
if __name__ == '__main__':  # Allow importing this module without side effects.
    main()
|