#!/usr/bin/env python3
"""
Batch process LinkedIn organization HTML files.

Parses organization staff/member pages and outputs JSON files with extracted profiles.
"""

import re
import json
import subprocess
import sys
from pathlib import Path
from datetime import datetime, timezone
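
# Note: the paths used in this script (scripts/parse_linkedin_html.py and the
# data/custodian/... directories) are relative, so it is assumed to be run
# from the repository root.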


def extract_org_name_from_filename(filename: str) -> tuple[str, str]:
    """Extract organization name and generate slug from filename.

    Args:
        filename: e.g., "(8) Eye Filmmuseum_ People _ LinkedIn.html"

    Returns:
        tuple of (org_name, slug)
    """
    # Remove the browser's tab-count prefix, e.g. "(8) ", if present
    name = re.sub(r'^\(\d+\)\s*', '', filename)

    # Remove the "_ People _ LinkedIn.html" suffix
    name = re.sub(r'_\s*People\s*_\s*LinkedIn\.html$', '', name)

    # Restore colons that were replaced with underscores when the page was saved
    name = name.replace('_', ':').strip()

    # Collapse doubled colons left over from names like "ACP: ICA- Archival..."
    name = re.sub(r':\s*:', ':', name)
    name = name.strip(':').strip()

    # Generate a URL-safe slug: lowercase alphanumerics joined by single hyphens
    slug = name.lower()
    slug = re.sub(r'[^a-z0-9]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')

    # Truncate the slug to keep output filenames manageable
    if len(slug) > 50:
        slug = slug[:50].rstrip('-')

    return name, slug
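
# For example, the filename from the docstring above yields:
#   extract_org_name_from_filename("(8) Eye Filmmuseum_ People _ LinkedIn.html")
#   -> ("Eye Filmmuseum", "eye-filmmuseum")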


def process_file(html_path: Path, output_dir: Path) -> dict:
    """Process a single HTML file.

    Args:
        html_path: Path to HTML file
        output_dir: Directory for output JSON

    Returns:
        dict with processing results
    """
    org_name, slug = extract_org_name_from_filename(html_path.name)

    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_file = output_dir / f"{slug}_staff_{timestamp}.json"

    # Run the parser with the same interpreter running this script
    # (more robust than a bare 'python' on PATH)
    cmd = [
        sys.executable, 'scripts/parse_linkedin_html.py',
        str(html_path),
        '--custodian-name', org_name,
        '--custodian-slug', slug,
        '--output', str(output_file)
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    # Compare extracted profiles against LinkedIn's reported member count
    extracted_count = 0
    expected_count = None

    if result.returncode == 0 and output_file.exists():
        with open(output_file, 'r') as f:
            data = json.load(f)
        # Count staff from the staff array directly
        extracted_count = len(data.get('staff', []))
        expected_count = data.get('custodian_metadata', {}).get('associated_members')

    return {
        'org_name': org_name,
        'slug': slug,
        'html_file': html_path.name,
        'output_file': output_file.name if output_file.exists() else None,
        'extracted_count': extracted_count,
        'expected_count': expected_count,
        # Explicit None check so an expected count of 0 still yields a variance
        'variance': (extracted_count - expected_count) if expected_count is not None else None,
        'success': result.returncode == 0,
        'stderr': result.stderr if result.returncode != 0 else None
    }
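
# For reference, process_file() assumes the parser's JSON output is shaped
# roughly like the sketch below. Only "staff" and
# "custodian_metadata.associated_members" are actually read; the field values
# shown are illustrative:
#   {
#     "custodian_metadata": {"associated_members": 120},
#     "staff": [{"name": "..."}, ...]
#   }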


def main():
    """Process all HTML files in the manual directory."""
    input_dir = Path('data/custodian/person/affiliated/manual')
    output_dir = Path('data/custodian/person/affiliated/parsed')

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Find all HTML files
    html_files = sorted(input_dir.glob('*.html'))

    print(f"Found {len(html_files)} HTML files to process\n")
    print("=" * 100)

    results = []

    for i, html_file in enumerate(html_files, 1):
        print(f"\n[{i}/{len(html_files)}] Processing: {html_file.name[:60]}...")

        result = process_file(html_file, output_dir)
        results.append(result)

        if result['success']:
            variance_str = ""
            if result['variance'] is not None:
                variance_str = f" (variance: {result['variance']:+d})"
            print(f"  OK: {result['extracted_count']} extracted, {result['expected_count']} expected{variance_str}")
        else:
            print(f"  FAILED: {result['stderr'][:100] if result['stderr'] else 'Unknown error'}")

    # Summary
    print("\n" + "=" * 100)
    print("\nSUMMARY")
    print("=" * 100)

    # Table header
    print(f"\n{'Organization':<45} {'Expected':>10} {'Extracted':>10} {'Variance':>10} {'Status':>10}")
    print("-" * 90)

    total_expected = 0
    total_extracted = 0
    success_count = 0

    for r in results:
        org_display = (r['org_name'][:43] + '..') if len(r['org_name']) > 45 else r['org_name']
        exp = str(r['expected_count']) if r['expected_count'] is not None else 'N/A'
        ext = str(r['extracted_count'])
        var = f"{r['variance']:+d}" if r['variance'] is not None else 'N/A'
        status = "OK" if r['success'] else "FAILED"

        print(f"{org_display:<45} {exp:>10} {ext:>10} {var:>10} {status:>10}")

        # Only include files with a known expected count so the totals stay comparable
        if r['expected_count'] is not None:
            total_expected += r['expected_count']
            total_extracted += r['extracted_count']
        if r['success']:
            success_count += 1

    print("-" * 90)
    print(f"{'TOTAL':<45} {total_expected:>10} {total_extracted:>10} {total_extracted - total_expected:>+10}")
    print(f"\nProcessed: {success_count}/{len(results)} files successfully")

    # Save results
    results_file = output_dir / f"batch_results_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json"
    with open(results_file, 'w') as f:
        json.dump({
            'processed_at': datetime.now(timezone.utc).isoformat(),
            'total_files': len(results),
            'success_count': success_count,
            'total_expected': total_expected,
            'total_extracted': total_extracted,
            'results': results
        }, f, indent=2)

    print(f"\nResults saved to: {results_file}")


if __name__ == '__main__':
    main()
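
# Example invocation (assuming this file is saved as, e.g.,
# scripts/batch_parse_linkedin.py -- the filename is hypothetical, as it does
# not appear in the source):
#
#   python scripts/batch_parse_linkedin.py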