#!/usr/bin/env python3
"""
Batch process LinkedIn organization HTML files.

Parses organization staff/member pages and outputs JSON files with
extracted profiles.
"""

import re
import json
import subprocess
from pathlib import Path
from datetime import datetime, timezone


def extract_org_name_from_filename(filename: str) -> tuple[str, str]:
    """Extract organization name and generate slug from filename.

    Args:
        filename: e.g., "(8) Eye Filmmuseum_ People _ LinkedIn.html"

    Returns:
        tuple of (org_name, slug)
    """
    # Remove "(8) " prefix if present
    name = re.sub(r'^\(\d+\)\s*', '', filename)
    # Remove "_ People _ LinkedIn.html" suffix
    name = re.sub(r'_\s*People\s*_\s*LinkedIn\.html$', '', name)
    # Clean up underscores used as colons in filenames
    name = name.replace('_', ':').strip()
    # Handle cases like "ACP: ICA- Archival..."
    name = re.sub(r':\s*:', ':', name)
    name = name.strip(':').strip()

    # Generate slug
    slug = name.lower()
    slug = re.sub(r'[^a-z0-9]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')

    # Truncate slug to reasonable length
    if len(slug) > 50:
        slug = slug[:50].rstrip('-')

    return name, slug


def process_file(html_path: Path, output_dir: Path) -> dict:
    """Process a single HTML file.

    Args:
        html_path: Path to HTML file
        output_dir: Directory for output JSON

    Returns:
        dict with processing results
    """
    org_name, slug = extract_org_name_from_filename(html_path.name)
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = output_dir / f"{slug}_staff_{timestamp}.json"

    # Run the parser
    cmd = [
        'python', 'scripts/parse_linkedin_html.py',
        str(html_path),
        '--custodian-name', org_name,
        '--custodian-slug', slug,
        '--output', str(output_file)
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Parse results
    extracted_count = 0
    expected_count = None

    if result.returncode == 0 and output_file.exists():
        with open(output_file, 'r') as f:
            data = json.load(f)
        # Count staff from the staff array directly
        extracted_count = len(data.get('staff', []))
        expected_count = data.get('custodian_metadata', {}).get('associated_members')

    return {
        'org_name': org_name,
        'slug': slug,
        'html_file': html_path.name,
        'output_file': output_file.name if output_file.exists() else None,
        'extracted_count': extracted_count,
        'expected_count': expected_count,
        'variance': (extracted_count - expected_count) if expected_count is not None else None,
        'success': result.returncode == 0,
        'stderr': result.stderr if result.returncode != 0 else None
    }


def main():
    """Process all HTML files in the manual directory."""
    input_dir = Path('data/custodian/person/affiliated/manual')
    output_dir = Path('data/custodian/person/affiliated/parsed')

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Find all HTML files
    html_files = sorted(input_dir.glob('*.html'))
    print(f"Found {len(html_files)} HTML files to process\n")
    print("=" * 100)

    results = []
    for i, html_file in enumerate(html_files, 1):
        print(f"\n[{i}/{len(html_files)}] Processing: {html_file.name[:60]}...")
        result = process_file(html_file, output_dir)
        results.append(result)

        if result['success']:
            variance_str = ""
            if result['variance'] is not None:
                variance_str = f" (variance: {result['variance']:+d})"
            print(f"  OK: {result['extracted_count']} extracted, "
                  f"{result['expected_count']} expected{variance_str}")
        else:
            print(f"  FAILED: {result['stderr'][:100] if result['stderr'] else 'Unknown error'}")

    # Summary
    print("\n" + "=" * 100)
    print("\nSUMMARY")
    print("=" * 100)

    # Table header
    print(f"\n{'Organization':<45} {'Expected':>10} {'Extracted':>10} {'Variance':>10} {'Status':>10}")
    print("-" * 90)

    total_expected = 0
    total_extracted = 0
    success_count = 0

    for r in results:
        org_display = r['org_name'][:43] + '..' if len(r['org_name']) > 45 else r['org_name']
        exp = str(r['expected_count']) if r['expected_count'] is not None else 'N/A'
        ext = str(r['extracted_count'])
        var = f"{r['variance']:+d}" if r['variance'] is not None else 'N/A'
        status = "OK" if r['success'] else "FAILED"
        print(f"{org_display:<45} {exp:>10} {ext:>10} {var:>10} {status:>10}")

        if r['expected_count'] is not None:
            total_expected += r['expected_count']
            total_extracted += r['extracted_count']
        if r['success']:
            success_count += 1

    print("-" * 90)
    print(f"{'TOTAL':<45} {total_expected:>10} {total_extracted:>10} "
          f"{total_extracted - total_expected:>+10}")
    print(f"\nProcessed: {success_count}/{len(results)} files successfully")

    # Save results
    results_file = output_dir / (
        f"batch_results_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json"
    )
    with open(results_file, 'w') as f:
        json.dump({
            'processed_at': datetime.now(timezone.utc).isoformat(),
            'total_files': len(results),
            'success_count': success_count,
            'total_expected': total_expected,
            'total_extracted': total_extracted,
            'results': results
        }, f, indent=2)

    print(f"\nResults saved to: {results_file}")


if __name__ == '__main__':
    main()
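
# Note on the parser contract: this batch script only assumes two things about
# the JSON written by scripts/parse_linkedin_html.py -- a top-level "staff"
# list (one entry per extracted profile) and a "custodian_metadata" object
# carrying an "associated_members" count. The sketch below is inferred from
# the fields read in process_file(), not from a documented schema; the member
# count and profile keys are illustrative placeholders:
#
#   {
#     "custodian_metadata": {"associated_members": 128},
#     "staff": [{"...": "one object per extracted profile"}]
#   }
#
# Any other keys in the parser output are ignored here.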