# glam/scripts/batch_parse_linkedin_manual_v2.py
# 2025-12-30 03:43:31 +01:00
# 267 lines
# 9.5 KiB
# Python
#!/usr/bin/env python3
"""
Batch processor for LinkedIn HTML files - DATA COLLECTION PHASE
This script:
1. Processes all HTML files in manual directory
2. Runs parse_linkedin_html.py for each
3. Creates staff JSON files in bu/ directory
4. Generates summary report
NOTE: Name extraction refinement (using full "about" description instead
of abbreviated names) is a POST-PROCESSING step.
"""
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from collections import defaultdict
# Directory paths
# NOTE(review): input lives on an external volume while output is local —
# confirm the KINGSTON volume is mounted before running.
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# Destination for the per-institution staff JSON files produced per HTML page.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/bu")
# Machine-readable run report written at the end of main().
SUMMARY_FILE = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/batch_processing_summary.json")
def extract_institution_name_from_filename(filename: str) -> str:
    """Extract institution name from LinkedIn People HTML filename.

    Removes:
    - Leading number in parentheses: (10), (15), (16)
    - Trailing " People _ LinkedIn"
    - Trailing ".html"
    - Leading commas
    - Leading/trailing underscores

    Args:
        filename: File name (or full path) of the saved LinkedIn page.

    Returns:
        Clean institution name; may be '' if nothing remains after cleaning.
    """
    name = Path(filename).name
    # Strip only a TRAILING ".html" extension. The previous
    # str.replace('.html', '') also deleted ".html" occurring inside the
    # institution name itself.
    name = re.sub(r'\.html$', '', name)
    # Browser "save page" suffix, optionally glued on with an underscore.
    name = re.sub(r'_?People _ LinkedIn$', '', name)
    # Duplicate-download counter the browser prepends, e.g. "(10) ".
    name = re.sub(r'^\(\d+\)\s*', '', name)
    name = re.sub(r'^,\s*', '', name)
    # Collapse whitespace runs before trimming stray underscores.
    name = re.sub(r'\s+', ' ', name).strip()
    name = name.strip('_')
    return name.strip()
def generate_slug_from_name(name: str) -> str:
    """Turn an institution name into a lowercase, hyphen-separated slug.

    Characters other than ASCII lowercase letters, digits, whitespace and
    hyphens are dropped after lowercasing; runs of whitespace/hyphens
    collapse to a single hyphen, and edge hyphens are trimmed.
    """
    cleaned = re.sub(r'[^a-z0-9\s-]', '', name.lower())
    hyphenated = re.sub(r'[\s-]+', '-', cleaned)
    return hyphenated.strip('-')
def process_single_file(html_file: Path, index: int, total: int) -> dict:
    """Process a single LinkedIn HTML export and return a result summary.

    Runs parse_linkedin_html.py in a subprocess on *html_file*, writing a
    staff JSON into OUTPUT_DIR, then re-reads that JSON to extract the
    statistics reported back to the caller.

    Args:
        html_file: Path to one "<Institution> People _ LinkedIn.html" file.
        index: 1-based position of this file in the batch (currently unused;
            kept for progress-reporting callers).
        total: Total number of files in the batch (currently unused).

    Returns:
        A dict whose 'status' is one of 'success', 'skipped', 'error',
        'parsed_error', 'timeout', or 'exception'. Success results carry a
        'stats' sub-dict; failures carry a 'reason' string.
    """
    # Single source of truth for the subprocess timeout, so the timeout
    # error message below can never drift out of sync with the real limit.
    parse_timeout_s = 30
    institution_name = extract_institution_name_from_filename(html_file.name)
    if not institution_name or len(institution_name) < 3:
        # Names shorter than 3 chars are almost certainly extraction noise.
        return {
            'status': 'skipped',
            'file': html_file.name,
            'reason': f'Invalid name extracted: "{institution_name}"'
        }
    slug = generate_slug_from_name(institution_name)
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = OUTPUT_DIR / f"{slug}_staff_{timestamp}.json"
    try:
        result = subprocess.run(
            [
                sys.executable,
                "/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
                str(html_file),
                "--custodian-name", institution_name,
                "--custodian-slug", slug,
                "--output", str(output_file)
            ],
            capture_output=True,
            text=True,
            timeout=parse_timeout_s
        )
        if result.returncode != 0:
            return {
                'status': 'error',
                'file': html_file.name,
                'institution_name': institution_name,
                # Cap stderr so the summary JSON stays readable.
                'reason': result.stderr[:200] if result.stderr else 'Unknown error',
                'returncode': result.returncode
            }
        # Load the parser's output to extract statistics
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            staff_analysis = data.get('staff_analysis', {})
            custodian_metadata = data.get('custodian_metadata', {})
            source_metadata = data.get('source_metadata', {})
            return {
                'status': 'success',
                'file': html_file.name,
                'institution_name': institution_name,
                'slug': slug,
                'output_file': output_file.name,
                'stats': {
                    'total_staff': staff_analysis.get('total_staff_extracted', 0),
                    'with_linkedin_url': staff_analysis.get('with_linkedin_url', 0),
                    'with_alternate_profiles': staff_analysis.get('with_alternate_profiles', 0),
                    'anonymous_members': staff_analysis.get('anonymous_members', 0),
                    'heritage_relevant': staff_analysis.get('heritage_relevant_count', 0),
                    'heritage_types': staff_analysis.get('staff_by_heritage_type', {}),
                    'associated_members': custodian_metadata.get('associated_members', 0),
                    'pymk_filtered': source_metadata.get('pymk_cards_filtered', 0),
                    'duplicates_merged': source_metadata.get('duplicate_profiles_merged', 0),
                    'industry': custodian_metadata.get('industry', 'Unknown'),
                    'follower_count': custodian_metadata.get('follower_count', 'Unknown'),
                }
            }
        except Exception as e:
            # Broad on purpose: a missing/malformed output file (or a JSON
            # payload of an unexpected shape) is reported as a per-file
            # failure rather than aborting the whole batch.
            return {
                'status': 'parsed_error',
                'file': html_file.name,
                'institution_name': institution_name,
                'reason': f'Failed to load output JSON: {e}'
            }
    except subprocess.TimeoutExpired:
        return {
            'status': 'timeout',
            'file': html_file.name,
            'institution_name': institution_name,
            'reason': f'Processing timeout ({parse_timeout_s}s)'
        }
    except Exception as e:
        # Catch-all safety net so one bad file never kills the batch.
        return {
            'status': 'exception',
            'file': html_file.name,
            'institution_name': institution_name,
            'reason': str(e)
        }
def main():
    """Batch-process every HTML file in MANUAL_DIR and write a summary.

    Globs MANUAL_DIR for "*.html", runs process_single_file() on each file,
    aggregates counters, prints a console report, and saves the per-file
    results plus totals to SUMMARY_FILE.

    Returns:
        0 always; per-file failures are recorded in the summary rather than
        reflected in the exit code.
    """
    # Fix: mkdir was previously called twice back-to-back; once is enough.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    html_files = sorted(MANUAL_DIR.glob("*.html"))
    print("="*70)
    print("LINKEDIN MANUAL DIRECTORY - BATCH DATA COLLECTION")
    print("="*70)
    print(f"\nInput directory: {MANUAL_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Total HTML files to process: {len(html_files)}")
    print(f"\nStarting batch processing at: {datetime.now(timezone.utc).isoformat()}")
    print()
    # Track statistics
    all_results = []
    stats = {
        'total_files': len(html_files),
        'processed': 0,
        'errors': 0,
        'skipped': 0,
        'total_staff': 0,
        'with_linkedin_url': 0,
        'heritage_relevant': 0,
        'anonymous_members': 0,
        # defaultdict serializes as a plain JSON object below.
        'custodians_by_heritage_type': defaultdict(int),
        'errors_list': [],
    }
    # Process each file
    for i, html_file in enumerate(html_files, 1):
        result = process_single_file(html_file, i, len(html_files))
        all_results.append(result)
        if result['status'] == 'success':
            stats['processed'] += 1
            stats_result = result.get('stats', {})
            stats['total_staff'] += stats_result.get('total_staff', 0)
            stats['with_linkedin_url'] += stats_result.get('with_linkedin_url', 0)
            stats['heritage_relevant'] += stats_result.get('heritage_relevant', 0)
            stats['anonymous_members'] += stats_result.get('anonymous_members', 0)
            # Track heritage types
            heritage_types = stats_result.get('heritage_types', {})
            for htype, count in heritage_types.items():
                stats['custodians_by_heritage_type'][htype] += count
        elif result['status'] in ['error', 'timeout', 'exception', 'parsed_error']:
            stats['errors'] += 1
            stats['errors_list'].append({
                'file': result['file'],
                'status': result['status'],
                'reason': result.get('reason', '')
            })
        elif result['status'] == 'skipped':
            stats['skipped'] += 1
        # Progress reporting: one line every 100 files to keep output short.
        if i % 100 == 0:
            progress = (i / len(html_files)) * 100
            print(f"[{i:4d}/{len(html_files)}] {progress:5.1f}% - {result['status']:7s} - {result.get('institution_name', 'N/A')}")
    # Final report
    print()
    print("="*70)
    print("BATCH PROCESSING COMPLETE")
    print("="*70)
    print(f"\nTotal files: {stats['total_files']}")
    print(f"Successfully processed: {stats['processed']}")
    print(f"Skipped: {stats['skipped']}")
    print(f"Errors: {stats['errors']}")
    print()
    print(f"Total staff extracted: {stats['total_staff']}")
    print(f"Staff with LinkedIn URLs: {stats['with_linkedin_url']}")
    print(f"Heritage-relevant staff: {stats['heritage_relevant']}")
    print(f"Anonymous members: {stats['anonymous_members']}")
    print()
    print("Staff by heritage type:")
    for htype in sorted(stats['custodians_by_heritage_type'].keys()):
        count = stats['custodians_by_heritage_type'][htype]
        print(f"  {htype}: {count}")
    print()
    if stats['errors'] > 0:
        # Fix: dropped a needless f-prefix on this placeholder-free string.
        print("\nFirst 20 errors:")
        for err in stats['errors_list'][:20]:
            print(f"  [{err['status']:7s}] {err['file']} - {err['reason'][:80]}")
        if len(stats['errors_list']) > 20:
            print(f"  ... and {len(stats['errors_list']) - 20} more errors")
        print()
    # Save detailed summary
    summary = {
        'processing_timestamp': datetime.now(timezone.utc).isoformat(),
        'summary_stats': stats,
        'all_results': all_results,
    }
    with open(SUMMARY_FILE, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
    print(f"\nDetailed summary saved to: {SUMMARY_FILE}")
    print(f"Staff JSON files saved to: {OUTPUT_DIR}")
    print("="*70)
    return 0
# Script entry point: exit code is main()'s return value (always 0;
# per-file failures are recorded in the summary JSON, not the exit status).
if __name__ == '__main__':
    sys.exit(main())