glam/scripts/batch_parse_linkedin_orgs.py
2025-12-10 18:04:25 +01:00

179 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Batch process LinkedIn organization HTML files.
Parses organization staff/member pages and outputs JSON files with extracted profiles.
"""
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
def extract_org_name_from_filename(filename: str) -> tuple[str, str]:
    """Derive a display name and URL-safe slug from a LinkedIn export filename.

    Args:
        filename: e.g., "(8) Eye Filmmuseum_ People _ LinkedIn.html"

    Returns:
        tuple of (org_name, slug)
    """
    # Drop the browser's duplicate-download "(8) " prefix, then the
    # "_ People _ LinkedIn.html" suffix.
    stripped = re.sub(r'^\(\d+\)\s*', '', filename)
    stripped = re.sub(r'_\s*People\s*_\s*LinkedIn\.html$', '', stripped)
    # Underscores stand in for colons in saved filenames; restore them,
    # then collapse accidental double colons (e.g. "ACP: ICA- Archival...").
    display = stripped.replace('_', ':').strip()
    display = re.sub(r':\s*:', ':', display).strip(':').strip()
    # Slug: lowercase, every run of non-alphanumerics becomes one hyphen.
    candidate = re.sub(r'[^a-z0-9]+', '-', display.lower())
    candidate = re.sub(r'-+', '-', candidate).strip('-')
    # Cap slug length so generated output filenames stay manageable.
    if len(candidate) > 50:
        candidate = candidate[:50].rstrip('-')
    return display, candidate
def process_file(html_path: Path, output_dir: Path) -> dict:
    """Run the LinkedIn HTML parser on one file and summarize the result.

    Args:
        html_path: Path to the saved LinkedIn "People" page HTML file.
        output_dir: Directory where the parser writes its JSON output.

    Returns:
        dict with org name/slug, input/output filenames, extracted vs.
        expected staff counts, variance, success flag, and stderr text
        when the parser failed.
    """
    org_name, slug = extract_org_name_from_filename(html_path.name)
    # UTC timestamp keeps output filenames unique and sortable.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = output_dir / f"{slug}_staff_{timestamp}.json"

    # Run the parser. sys.executable (not a bare 'python') guarantees the
    # child runs under the same interpreter/virtualenv as this script.
    cmd = [
        sys.executable, 'scripts/parse_linkedin_html.py',
        str(html_path),
        '--custodian-name', org_name,
        '--custodian-slug', slug,
        '--output', str(output_file)
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Parse results, tolerating a malformed/unreadable output file so one
    # bad parser run does not abort the whole batch.
    extracted_count = 0
    expected_count = None
    if result.returncode == 0 and output_file.exists():
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Count staff from the staff array directly
            extracted_count = len(data.get('staff', []))
            expected_count = data.get('custodian_metadata', {}).get('associated_members')
        except (json.JSONDecodeError, OSError):
            # Counts keep their defaults; 'success' still reflects returncode.
            pass

    return {
        'org_name': org_name,
        'slug': slug,
        'html_file': html_path.name,
        'output_file': output_file.name if output_file.exists() else None,
        'extracted_count': extracted_count,
        'expected_count': expected_count,
        # `is not None` (not truthiness): an expected count of 0 must still
        # produce a numeric variance rather than None.
        'variance': (extracted_count - expected_count) if expected_count is not None else None,
        'success': result.returncode == 0,
        'stderr': result.stderr if result.returncode != 0 else None
    }
def main():
    """Process all HTML files in the manual directory and print a summary."""
    input_dir = Path('data/custodian/person/affiliated/manual')
    output_dir = Path('data/custodian/person/affiliated/parsed')
    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)
    # Find all HTML files (sorted for a deterministic processing order)
    html_files = sorted(input_dir.glob('*.html'))
    print(f"Found {len(html_files)} HTML files to process\n")
    print("=" * 100)
    results = []
    for i, html_file in enumerate(html_files, 1):
        print(f"\n[{i}/{len(html_files)}] Processing: {html_file.name[:60]}...")
        result = process_file(html_file, output_dir)
        results.append(result)
        if result['success']:
            variance_str = ""
            if result['variance'] is not None:
                variance_str = f" (variance: {result['variance']:+d})"
            print(f" OK: {result['extracted_count']} extracted, {result['expected_count']} expected{variance_str}")
        else:
            print(f" FAILED: {result['stderr'][:100] if result['stderr'] else 'Unknown error'}")
    # Summary
    print("\n" + "=" * 100)
    print("\nSUMMARY")
    print("=" * 100)
    # Table header
    print(f"\n{'Organization':<45} {'Expected':>10} {'Extracted':>10} {'Variance':>10} {'Status':>10}")
    print("-" * 90)
    total_expected = 0
    total_extracted = 0
    success_count = 0
    for r in results:
        org_display = r['org_name'][:43] + '..' if len(r['org_name']) > 45 else r['org_name']
        exp = str(r['expected_count']) if r['expected_count'] is not None else 'N/A'
        ext = str(r['extracted_count'])
        var = f"{r['variance']:+d}" if r['variance'] is not None else 'N/A'
        status = "OK" if r['success'] else "FAILED"
        print(f"{org_display:<45} {exp:>10} {ext:>10} {var:>10} {status:>10}")
        # `is not None` (not truthiness): an org whose expected count is 0
        # must still contribute its extracted count to the totals,
        # matching the per-row display logic above.
        if r['expected_count'] is not None:
            total_expected += r['expected_count']
            total_extracted += r['extracted_count']
        if r['success']:
            success_count += 1
    print("-" * 90)
    print(f"{'TOTAL':<45} {total_expected:>10} {total_extracted:>10} {total_extracted - total_expected:>+10}")
    print(f"\nProcessed: {success_count}/{len(results)} files successfully")
    # Save a machine-readable record of this batch run next to the outputs
    results_file = output_dir / f"batch_results_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            'processed_at': datetime.now(timezone.utc).isoformat(),
            'total_files': len(results),
            'success_count': success_count,
            'total_expected': total_expected,
            'total_extracted': total_extracted,
            'results': results
        }, f, indent=2)
    print(f"\nResults saved to: {results_file}")
if __name__ == '__main__':
    main()