#!/usr/bin/env python3
"""
Batch processor for LinkedIn HTML files - DATA COLLECTION PHASE

This script:
1. Processes all HTML files in manual directory
2. Runs parse_linkedin_html.py for each
3. Creates staff JSON files in bu/ directory
4. Generates summary report

NOTE: Name extraction refinement (using full "about" description instead
of abbreviated names) is a POST-PROCESSING step.
"""

import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

# Directory paths
|
|
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
|
|
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/bu")
|
|
SUMMARY_FILE = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/batch_processing_summary.json")
|
|
|
|
|
|
def extract_institution_name_from_filename(filename: str) -> str:
|
|
"""Extract institution name from LinkedIn People HTML filename.
|
|
|
|
Removes:
|
|
- Leading number in parentheses: (10), (15), (16)
|
|
- Trailing " People _ LinkedIn"
|
|
- Trailing ".html"
|
|
- Leading commas
|
|
- Leading underscores
|
|
|
|
Returns clean institution name.
|
|
"""
|
|
name = Path(filename).name
|
|
name = name.replace('.html', '')
|
|
name = re.sub(r'_?People _ LinkedIn$', '', name)
|
|
name = re.sub(r'^\(\d+\)\s*', '', name)
|
|
name = re.sub(r'^,\s*', '', name)
|
|
name = re.sub(r'\s+', ' ', name).strip()
|
|
name = name.rstrip('_')
|
|
name = name.lstrip('_')
|
|
return name.strip()
|
|
|
|
|
|
def generate_slug_from_name(name: str) -> str:
|
|
"""Generate URL-friendly slug from institution name."""
|
|
slug = name.lower()
|
|
slug = re.sub(r'[^a-z0-9\s-]', '', slug)
|
|
slug = re.sub(r'[\s-]+', '-', slug)
|
|
slug = slug.strip('-')
|
|
return slug
|
|
|
|
|
|
def process_single_file(html_file: Path, index: int, total: int) -> dict:
|
|
"""Process a single HTML file and return result summary."""
|
|
|
|
institution_name = extract_institution_name_from_filename(html_file.name)
|
|
if not institution_name or len(institution_name) < 3:
|
|
return {
|
|
'status': 'skipped',
|
|
'file': html_file.name,
|
|
'reason': f'Invalid name extracted: "{institution_name}"'
|
|
}
|
|
|
|
slug = generate_slug_from_name(institution_name)
|
|
timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
|
|
output_file = OUTPUT_DIR / f"{slug}_staff_{timestamp}.json"
|
|
|
|
try:
|
|
result = subprocess.run(
|
|
[
|
|
sys.executable,
|
|
"/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
|
|
str(html_file),
|
|
"--custodian-name", institution_name,
|
|
"--custodian-slug", slug,
|
|
"--output", str(output_file)
|
|
],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=30
|
|
)
|
|
|
|
if result.returncode != 0:
|
|
return {
|
|
'status': 'error',
|
|
'file': html_file.name,
|
|
'institution_name': institution_name,
|
|
'reason': result.stderr[:200] if result.stderr else 'Unknown error',
|
|
'returncode': result.returncode
|
|
}
|
|
|
|
# Load result to extract statistics
|
|
try:
|
|
with open(output_file, 'r', encoding='utf-8') as f:
|
|
data = json.load(f)
|
|
|
|
staff_analysis = data.get('staff_analysis', {})
|
|
custodian_metadata = data.get('custodian_metadata', {})
|
|
source_metadata = data.get('source_metadata', {})
|
|
|
|
return {
|
|
'status': 'success',
|
|
'file': html_file.name,
|
|
'institution_name': institution_name,
|
|
'slug': slug,
|
|
'output_file': output_file.name,
|
|
'stats': {
|
|
'total_staff': staff_analysis.get('total_staff_extracted', 0),
|
|
'with_linkedin_url': staff_analysis.get('with_linkedin_url', 0),
|
|
'with_alternate_profiles': staff_analysis.get('with_alternate_profiles', 0),
|
|
'anonymous_members': staff_analysis.get('anonymous_members', 0),
|
|
'heritage_relevant': staff_analysis.get('heritage_relevant_count', 0),
|
|
'heritage_types': staff_analysis.get('staff_by_heritage_type', {}),
|
|
'associated_members': custodian_metadata.get('associated_members', 0),
|
|
'pymk_filtered': source_metadata.get('pymk_cards_filtered', 0),
|
|
'duplicates_merged': source_metadata.get('duplicate_profiles_merged', 0),
|
|
'industry': custodian_metadata.get('industry', 'Unknown'),
|
|
'follower_count': custodian_metadata.get('follower_count', 'Unknown'),
|
|
}
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
'status': 'parsed_error',
|
|
'file': html_file.name,
|
|
'institution_name': institution_name,
|
|
'reason': f'Failed to load output JSON: {e}'
|
|
}
|
|
|
|
except subprocess.TimeoutExpired:
|
|
return {
|
|
'status': 'timeout',
|
|
'file': html_file.name,
|
|
'institution_name': institution_name,
|
|
'reason': 'Processing timeout (30s)'
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
'status': 'exception',
|
|
'file': html_file.name,
|
|
'institution_name': institution_name,
|
|
'reason': str(e)
|
|
}
|
|
|
|
|
|
def main():
|
|
"""Main batch processing function."""
|
|
|
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
html_files = sorted(MANUAL_DIR.glob("*.html"))
|
|
|
|
print("="*70)
|
|
print("LINKEDIN MANUAL DIRECTORY - BATCH DATA COLLECTION")
|
|
print("="*70)
|
|
print(f"\nInput directory: {MANUAL_DIR}")
|
|
print(f"Output directory: {OUTPUT_DIR}")
|
|
print(f"Total HTML files to process: {len(html_files)}")
|
|
print(f"\nStarting batch processing at: {datetime.now(timezone.utc).isoformat()}")
|
|
print()
|
|
|
|
# Track statistics
|
|
all_results = []
|
|
stats = {
|
|
'total_files': len(html_files),
|
|
'processed': 0,
|
|
'errors': 0,
|
|
'skipped': 0,
|
|
'total_staff': 0,
|
|
'with_linkedin_url': 0,
|
|
'heritage_relevant': 0,
|
|
'anonymous_members': 0,
|
|
'custodians_by_heritage_type': defaultdict(int),
|
|
'errors_list': [],
|
|
}
|
|
|
|
# Process each file
|
|
for i, html_file in enumerate(html_files, 1):
|
|
result = process_single_file(html_file, i, len(html_files))
|
|
all_results.append(result)
|
|
|
|
if result['status'] == 'success':
|
|
stats['processed'] += 1
|
|
stats_result = result.get('stats', {})
|
|
stats['total_staff'] += stats_result.get('total_staff', 0)
|
|
stats['with_linkedin_url'] += stats_result.get('with_linkedin_url', 0)
|
|
stats['heritage_relevant'] += stats_result.get('heritage_relevant', 0)
|
|
stats['anonymous_members'] += stats_result.get('anonymous_members', 0)
|
|
|
|
# Track heritage types
|
|
heritage_types = stats_result.get('heritage_types', {})
|
|
for htype, count in heritage_types.items():
|
|
stats['custodians_by_heritage_type'][htype] += count
|
|
|
|
elif result['status'] in ['error', 'timeout', 'exception', 'parsed_error']:
|
|
stats['errors'] += 1
|
|
stats['errors_list'].append({
|
|
'file': result['file'],
|
|
'status': result['status'],
|
|
'reason': result.get('reason', '')
|
|
})
|
|
elif result['status'] == 'skipped':
|
|
stats['skipped'] += 1
|
|
|
|
# Progress reporting
|
|
if i % 100 == 0:
|
|
progress = (i / len(html_files)) * 100
|
|
print(f"[{i:4d}/{len(html_files)}] {progress:5.1f}% - {result['status']:7s} - {result.get('institution_name', 'N/A')}")
|
|
|
|
# Final report
|
|
print()
|
|
print("="*70)
|
|
print("BATCH PROCESSING COMPLETE")
|
|
print("="*70)
|
|
print(f"\nTotal files: {stats['total_files']}")
|
|
print(f"Successfully processed: {stats['processed']}")
|
|
print(f"Skipped: {stats['skipped']}")
|
|
print(f"Errors: {stats['errors']}")
|
|
print()
|
|
print(f"Total staff extracted: {stats['total_staff']}")
|
|
print(f"Staff with LinkedIn URLs: {stats['with_linkedin_url']}")
|
|
print(f"Heritage-relevant staff: {stats['heritage_relevant']}")
|
|
print(f"Anonymous members: {stats['anonymous_members']}")
|
|
print()
|
|
print("Staff by heritage type:")
|
|
for htype in sorted(stats['custodians_by_heritage_type'].keys()):
|
|
count = stats['custodians_by_heritage_type'][htype]
|
|
print(f" {htype}: {count}")
|
|
print()
|
|
|
|
if stats['errors'] > 0:
|
|
print(f"\nFirst 20 errors:")
|
|
for err in stats['errors_list'][:20]:
|
|
print(f" [{err['status']:7s}] {err['file']} - {err['reason'][:80]}")
|
|
if len(stats['errors_list']) > 20:
|
|
print(f" ... and {len(stats['errors_list']) - 20} more errors")
|
|
print()
|
|
|
|
# Save detailed summary
|
|
summary = {
|
|
'processing_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'summary_stats': stats,
|
|
'all_results': all_results,
|
|
}
|
|
|
|
with open(SUMMARY_FILE, 'w', encoding='utf-8') as f:
|
|
json.dump(summary, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"\nDetailed summary saved to: {SUMMARY_FILE}")
|
|
print(f"Staff JSON files saved to: {OUTPUT_DIR}")
|
|
print("="*70)
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main())
|