#!/usr/bin/env python3
"""Simple batch processor for LinkedIn HTML files.

Runs parse_linkedin_html.py on all HTML files in the manual directory.
Creates staff JSON files in the bu/ directory.
"""
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional

# Directory paths
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/bu")

# Parser invoked once per HTML file; hoisted out of the loop since it never changes.
PARSER_SCRIPT = "/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py"


def extract_institution_name_from_filename(filename: str) -> Optional[str]:
    """Extract the institution name from a LinkedIn People HTML filename.

    Strips the ``.html`` suffix, the trailing ``People _ LinkedIn`` marker,
    a leading browser duplicate counter like ``(2)``, a leading comma, and
    stray underscores/whitespace.

    Returns:
        The cleaned institution name, or ``None`` if nothing remains.
    """
    name = Path(filename).name
    name = name.replace('.html', '')
    name = re.sub(r'_?People _ LinkedIn$', '', name)
    name = re.sub(r'^\(\d+\)\s*', '', name)  # e.g. "(2) " from duplicate downloads
    name = re.sub(r'^,\s*', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    name = name.rstrip('_')
    name = name.lstrip('_')  # Remove leading underscores
    name = name.strip()
    return name if name else None


def generate_slug_from_name(name: str) -> str:
    """Generate a URL-friendly slug (lowercase, hyphen-separated) from *name*."""
    slug = name.lower()
    slug = re.sub(r'[^a-z0-9\s-]', '', slug)
    slug = re.sub(r'[\s-]+', '-', slug)
    slug = slug.strip('-')
    return slug


def _count_staff(output_file: Path) -> int:
    """Return the number of 'staff' entries in *output_file*, or 0 on failure.

    Best-effort: a missing or malformed result file only skews the summary
    count, so errors are deliberately not propagated — but only file/JSON
    errors are swallowed, not arbitrary exceptions.
    """
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return len(data.get('staff', []))
    except (OSError, json.JSONDecodeError):
        return 0


def main():
    """Process all HTML files in MANUAL_DIR and print a summary."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    html_files = sorted(MANUAL_DIR.glob("*.html"))
    print(f"Found {len(html_files)} HTML files")
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nStarting processing...")
    print()

    stats = {
        'total': len(html_files),
        'processed': 0,
        'errors': 0,
        'total_staff': 0,
    }

    for i, html_file in enumerate(html_files, 1):
        print(f"[{i}/{len(html_files)}] {html_file.name}", end='\r')

        institution_name = extract_institution_name_from_filename(html_file.name)
        if not institution_name:
            stats['errors'] += 1
            continue

        slug = generate_slug_from_name(institution_name)
        # Timestamp makes the output filename unique per run.
        timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
        output_file = OUTPUT_DIR / f"{slug}_staff_{timestamp}.json"

        try:
            result = subprocess.run(
                [
                    sys.executable,
                    PARSER_SCRIPT,
                    str(html_file),
                    "--custodian-name", institution_name,
                    "--custodian-slug", slug,
                    "--output", str(output_file)
                ],
                capture_output=True,
                text=True,
                timeout=30  # a single stuck parse must not stall the whole batch
            )

            if result.returncode == 0:
                stats['processed'] += 1
                # Load result to get staff count (best-effort).
                stats['total_staff'] += _count_staff(output_file)
            else:
                stats['errors'] += 1

        except subprocess.TimeoutExpired:
            stats['errors'] += 1
        except Exception as e:
            print(f"ERROR: {e}", file=sys.stderr)
            stats['errors'] += 1

        # Periodic progress report on its own lines (the per-file line uses '\r').
        if i % 100 == 0:
            print()
            print(f"Progress: {i}/{len(html_files)}")
            print(f"  Processed: {stats['processed']}, Errors: {stats['errors']}")
            print()

    print()
    print("="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"Total files: {stats['total']}")
    print(f"Processed: {stats['processed']}")
    print(f"Errors: {stats['errors']}")
    print(f"Total staff extracted: {stats['total_staff']}")
    print("="*60)


if __name__ == '__main__':
    main()