#!/usr/bin/env python3
"""Batch processor for LinkedIn HTML files - DATA COLLECTION PHASE

This script:
1. Processes all HTML files in manual directory
2. Runs parse_linkedin_html.py for each
3. Creates staff JSON files in bu/ directory
4. Generates summary report

NOTE: Name extraction refinement (using full "about" description instead of
abbreviated names) is a POST-PROCESSING step.
"""

import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

# Directory paths
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/bu")
SUMMARY_FILE = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/batch_processing_summary.json")

# Parser invoked once per HTML file (runs under the same interpreter).
PARSER_SCRIPT = "/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py"

# Per-file subprocess timeout, seconds.
PARSE_TIMEOUT = 30


def extract_institution_name_from_filename(filename: str) -> str:
    """Extract institution name from LinkedIn People HTML filename.

    Removes:
    - Leading number in parentheses: (10), (15), (16)
    - Trailing " People _ LinkedIn" (with optional preceding underscore)
    - Trailing ".html" (suffix only — a name containing ".html" internally
      is preserved, unlike a blanket str.replace)
    - Leading commas
    - Leading/trailing underscores and redundant whitespace

    Args:
        filename: Path or bare filename of the downloaded HTML page.

    Returns:
        Clean institution name (may be empty if nothing remains).
    """
    name = Path(filename).name
    # removesuffix (not replace) so only the extension is stripped.
    name = name.removesuffix('.html')
    name = re.sub(r'_?People _ LinkedIn$', '', name)
    name = re.sub(r'^\(\d+\)\s*', '', name)
    name = re.sub(r'^,\s*', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    # Underscores may remain at either end after the substitutions above.
    name = name.strip('_')
    return name.strip()


def generate_slug_from_name(name: str) -> str:
    """Generate URL-friendly slug (lowercase, hyphen-separated) from institution name."""
    slug = name.lower()
    slug = re.sub(r'[^a-z0-9\s-]', '', slug)  # drop punctuation
    slug = re.sub(r'[\s-]+', '-', slug)       # collapse runs of spaces/hyphens
    return slug.strip('-')


def process_single_file(html_file: Path, index: int, total: int) -> dict:
    """Run the LinkedIn parser over one HTML file and return a result summary.

    Args:
        html_file: HTML page to parse.
        index: 1-based position in the batch (currently unused; reserved for
            per-file progress display).
        total: Total files in the batch (currently unused).

    Returns:
        Dict with a 'status' key ('success', 'skipped', 'error', 'timeout',
        'exception', or 'parsed_error') plus context fields; on success also
        a 'stats' sub-dict of extraction counts pulled from the output JSON.
    """
    institution_name = extract_institution_name_from_filename(html_file.name)
    if not institution_name or len(institution_name) < 3:
        return {
            'status': 'skipped',
            'file': html_file.name,
            'reason': f'Invalid name extracted: "{institution_name}"',
        }

    slug = generate_slug_from_name(institution_name)
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = OUTPUT_DIR / f"{slug}_staff_{timestamp}.json"

    try:
        result = subprocess.run(
            [
                sys.executable,
                PARSER_SCRIPT,
                str(html_file),
                "--custodian-name", institution_name,
                "--custodian-slug", slug,
                "--output", str(output_file),
            ],
            capture_output=True,
            text=True,
            timeout=PARSE_TIMEOUT,
        )
        if result.returncode != 0:
            return {
                'status': 'error',
                'file': html_file.name,
                'institution_name': institution_name,
                'reason': result.stderr[:200] if result.stderr else 'Unknown error',
                'returncode': result.returncode,
            }

        # Load result to extract statistics
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            staff_analysis = data.get('staff_analysis', {})
            custodian_metadata = data.get('custodian_metadata', {})
            source_metadata = data.get('source_metadata', {})
            return {
                'status': 'success',
                'file': html_file.name,
                'institution_name': institution_name,
                'slug': slug,
                'output_file': output_file.name,
                'stats': {
                    'total_staff': staff_analysis.get('total_staff_extracted', 0),
                    'with_linkedin_url': staff_analysis.get('with_linkedin_url', 0),
                    'with_alternate_profiles': staff_analysis.get('with_alternate_profiles', 0),
                    'anonymous_members': staff_analysis.get('anonymous_members', 0),
                    'heritage_relevant': staff_analysis.get('heritage_relevant_count', 0),
                    'heritage_types': staff_analysis.get('staff_by_heritage_type', {}),
                    'associated_members': custodian_metadata.get('associated_members', 0),
                    'pymk_filtered': source_metadata.get('pymk_cards_filtered', 0),
                    'duplicates_merged': source_metadata.get('duplicate_profiles_merged', 0),
                    'industry': custodian_metadata.get('industry', 'Unknown'),
                    'follower_count': custodian_metadata.get('follower_count', 'Unknown'),
                },
            }
        except Exception as e:
            return {
                'status': 'parsed_error',
                'file': html_file.name,
                'institution_name': institution_name,
                'reason': f'Failed to load output JSON: {e}',
            }
    except subprocess.TimeoutExpired:
        return {
            'status': 'timeout',
            'file': html_file.name,
            'institution_name': institution_name,
            'reason': 'Processing timeout (30s)',
        }
    except Exception as e:
        return {
            'status': 'exception',
            'file': html_file.name,
            'institution_name': institution_name,
            'reason': str(e),
        }


def _accumulate(stats: dict, result: dict) -> None:
    """Fold one per-file result dict into the running batch statistics."""
    status = result['status']
    if status == 'success':
        stats['processed'] += 1
        file_stats = result.get('stats', {})
        stats['total_staff'] += file_stats.get('total_staff', 0)
        stats['with_linkedin_url'] += file_stats.get('with_linkedin_url', 0)
        stats['heritage_relevant'] += file_stats.get('heritage_relevant', 0)
        stats['anonymous_members'] += file_stats.get('anonymous_members', 0)
        # Track heritage types
        for htype, count in file_stats.get('heritage_types', {}).items():
            stats['custodians_by_heritage_type'][htype] += count
    elif status in ('error', 'timeout', 'exception', 'parsed_error'):
        stats['errors'] += 1
        stats['errors_list'].append({
            'file': result['file'],
            'status': status,
            'reason': result.get('reason', ''),
        })
    elif status == 'skipped':
        stats['skipped'] += 1


def main() -> int:
    """Main batch processing function."""
    # Was called twice in the original — once is sufficient.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    html_files = sorted(MANUAL_DIR.glob("*.html"))

    print("="*70)
    print("LINKEDIN MANUAL DIRECTORY - BATCH DATA COLLECTION")
    print("="*70)
    print(f"\nInput directory: {MANUAL_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Total HTML files to process: {len(html_files)}")
    print(f"\nStarting batch processing at: {datetime.now(timezone.utc).isoformat()}")
    print()

    # Track statistics
    all_results = []
    stats = {
        'total_files': len(html_files),
        'processed': 0,
        'errors': 0,
        'skipped': 0,
        'total_staff': 0,
        'with_linkedin_url': 0,
        'heritage_relevant': 0,
        'anonymous_members': 0,
        'custodians_by_heritage_type': defaultdict(int),
        'errors_list': [],
    }

    # Process each file
    for i, html_file in enumerate(html_files, 1):
        result = process_single_file(html_file, i, len(html_files))
        all_results.append(result)
        _accumulate(stats, result)

        # Progress reporting (every 100 files)
        if i % 100 == 0:
            progress = (i / len(html_files)) * 100
            print(f"[{i:4d}/{len(html_files)}] {progress:5.1f}% - {result['status']:7s} - {result.get('institution_name', 'N/A')}")

    # Final report
    print()
    print("="*70)
    print("BATCH PROCESSING COMPLETE")
    print("="*70)
    print(f"\nTotal files: {stats['total_files']}")
    print(f"Successfully processed: {stats['processed']}")
    print(f"Skipped: {stats['skipped']}")
    print(f"Errors: {stats['errors']}")
    print()
    print(f"Total staff extracted: {stats['total_staff']}")
    print(f"Staff with LinkedIn URLs: {stats['with_linkedin_url']}")
    print(f"Heritage-relevant staff: {stats['heritage_relevant']}")
    print(f"Anonymous members: {stats['anonymous_members']}")
    print()
    print("Staff by heritage type:")
    for htype in sorted(stats['custodians_by_heritage_type'].keys()):
        count = stats['custodians_by_heritage_type'][htype]
        print(f" {htype}: {count}")
    print()

    if stats['errors'] > 0:
        print("\nFirst 20 errors:")
        for err in stats['errors_list'][:20]:
            print(f" [{err['status']:7s}] {err['file']} - {err['reason'][:80]}")
        if len(stats['errors_list']) > 20:
            print(f" ... and {len(stats['errors_list']) - 20} more errors")
        print()

    # Save detailed summary
    summary = {
        'processing_timestamp': datetime.now(timezone.utc).isoformat(),
        'summary_stats': stats,
        'all_results': all_results,
    }
    with open(SUMMARY_FILE, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print(f"\nDetailed summary saved to: {SUMMARY_FILE}")
    print(f"Staff JSON files saved to: {OUTPUT_DIR}")
    print("="*70)
    return 0


if __name__ == '__main__':
    sys.exit(main())