glam/scripts/linkedin_batch_comprehensive.py
2025-12-30 03:43:31 +01:00

494 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive LinkedIn Batch Processing - Fix All Issues
This script fixes all identified issues with the previous batch processing:
1. Properly cleans filenames (removes macOS resource forks, periods, spaces, parentheses)
2. Extracts full institution name from HTML H1 tag (not from filename)
3. Re-processes all HTML files to extract correct staff data
4. Creates person entity files from staff JSON
5. Creates/updates custodian YAML files
Usage:
python scripts/linkedin_batch_comprehensive.py \
--input-dir /path/to/html/files \
--output-dir data/custodian/person/bu_fixed \
--entity-dir data/custodian/person/entity \
--custodian-dir data/custodian/
"""
import argparse
import json
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
try:
from bs4 import BeautifulSoup
except ImportError:
print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
sys.exit(1)
try:
import yaml
except ImportError:
print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
sys.exit(1)
# Import existing parser (we'll enhance it)
sys.path.insert(0, str(Path(__file__).parent))
from parse_linkedin_html import parse_html_file, generate_staff_id
def clean_filename_to_name(filename: str) -> str:
    """
    Clean HTML filename to extract institution name.

    Handles:
    - macOS resource fork prefixes (._)
    - Periods before numbers (._(15))
    - Numbers in parentheses (15), (7)
    - Leading bare numeric prefixes (15-)
    - Underscores/hyphens used as word separators
    - Extra spaces
    - " People _ LinkedIn.html" suffix

    Examples:
        "._(15) Gemeente Enkhuizen_ People _ LinkedIn.html"
            -> "Gemeente Enkhuizen"
        "(7) ADVN _ archief voor nationale bewegingen_ People _ LinkedIn.html"
            -> "ADVN archief voor nationale bewegingen"
        "15-arabian-oud_ People _ LinkedIn.html"
            -> "arabian oud"
    """
    # Strip the LinkedIn suffix first, while its literal underscores are
    # still intact, then any remaining .html extension.
    name = filename.replace(' People _ LinkedIn.html', '')
    name = name.replace('.html', '')
    # Remove macOS resource fork prefix (._)
    if name.startswith('._'):
        name = name[2:]
    # Remove a leading "(15) " style counter, optionally preceded by any
    # mix of '.'/'_' characters: "._(15)", ".(15)", "_(15)", "(15)".
    # (Previously two overlapping regexes handled subsets of this.)
    name = re.sub(r'^[._]*\(\d+\)\s*', '', name)
    # Remove a leading bare numeric prefix glued on with a dash: "15-foo".
    name = re.sub(r'^\d+-', '', name)
    # Underscores and hyphens act as word separators in these filenames;
    # normalizing them fixes names like "ADVN _ archief" and "arabian-oud".
    name = re.sub(r'[-_]+', ' ', name)
    # Collapse runs of whitespace into single spaces
    name = re.sub(r'\s+', ' ', name)
    return name.strip()
def extract_institution_name_from_html(html_content: str) -> Optional[str]:
    """
    Pull the full institution name out of the page's first H1 tag.

    LinkedIn renders the H1 as "Organization Name | LinkedIn"; the part
    before the pipe separator is the name we want.

    Returns None when no (non-empty) H1 is present.
    """
    heading = BeautifulSoup(html_content, 'html.parser').find('h1')
    if not heading:
        return None
    text = heading.get_text().strip()
    # Keep only what precedes the " | LinkedIn" separator, when present.
    name = text.split(' | ')[0].strip() if ' | ' in text else text
    # Fold any leftover pipes and runs of whitespace into single spaces.
    name = re.sub(r'\s*\|\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    return name or None
def process_html_file(html_path: Path, output_dir: Path) -> dict[str, Any]:
    """
    Parse one LinkedIn "People" page and return its extracted staff data.

    The institution name is taken from the page's H1 tag when available,
    falling back to the cleaned filename; the slug is always derived from
    the cleaned filename.
    """
    with open(html_path, 'r', encoding='utf-8', errors='replace') as fh:
        page = fh.read()

    # Prefer the name embedded in the page itself over the filename.
    institution = extract_institution_name_from_html(page)
    if not institution:
        institution = clean_filename_to_name(html_path.name)
        print(f"Warning: H1 not found in {html_path.name}, using filename: {institution}", file=sys.stderr)

    # Build a URL-safe slug from the cleaned filename.
    slug_base = clean_filename_to_name(html_path.name)
    slug = re.sub(r'[^a-z0-9]+', '-', slug_base.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')

    # Delegate the actual staff extraction to the existing parser.
    result = parse_html_file(html_path, institution, slug)

    # Overwrite whatever name the parser derived with the HTML-sourced one,
    # and record which file this came from.
    result['custodian_metadata']['custodian_name'] = institution
    result['custodian_metadata']['name'] = institution
    result['source_metadata']['source_file'] = html_path.name
    return result
def create_person_entity(staff_entry: dict, custodian_name: str, html_filename: str, entity_dir: Path) -> Optional[Path]:
    """
    Create a person entity JSON file from a staff entry.

    Follows Rule 20: Person Entity Profiles - Individual File Storage

    Args:
        staff_entry: Parsed staff record; needs a real 'name' and a
            'linkedin_slug' for an entity file to be produced.
        custodian_name: Institution the person is affiliated with.
        html_filename: Source HTML filename, recorded as provenance.
        entity_dir: Directory the entity JSON file is written into.

    Returns:
        Path to the new (or previously created) entity file, or None when
        the entry is anonymous or has no LinkedIn slug.
    """
    name = staff_entry.get('name', '')
    if not name or name.startswith('LinkedIn Member'):
        # Skip anonymous profiles - they don't have entity profiles
        return None
    linkedin_slug = staff_entry.get('linkedin_slug', '')
    if not linkedin_slug:
        # Without a slug every file would be named "_<timestamp>.json",
        # colliding between different people; skip instead.
        return None
    entity_dir.mkdir(parents=True, exist_ok=True)
    # Filenames embed a per-run timestamp ({linkedin-slug}_{ISO-timestamp}.json),
    # so an exact-path existence check never matches on re-runs and duplicate
    # files accumulate.  De-duplicate by slug instead: reuse any prior file.
    existing = sorted(entity_dir.glob(f"{linkedin_slug}_*.json"))
    if existing:
        return existing[0]
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    entity_path = entity_dir / f"{linkedin_slug}_{timestamp}.json"
    # Create person entity structure
    person_entity = {
        'extraction_agent': 'claude-opus-4.5',
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'profile_data': {
            'person_id': staff_entry.get('staff_id'),
            'full_name': name,
            'linkedin_slug': linkedin_slug,
            'linkedin_profile_url': staff_entry.get('linkedin_profile_url'),
            'headline': staff_entry.get('headline', ''),
            'degree': staff_entry.get('degree', 'unknown'),
            'mutual_connections': staff_entry.get('mutual_connections', ''),
        },
        'affiliations': [{
            'organization_name': custodian_name,
            'organization_slug': None,  # Will be filled during custodian matching
            'role_title': staff_entry.get('headline', ''),
            'affiliation_type': 'staff',
            'affiliation_provenance': {
                'source_type': 'linkedin_company_people_page_html',
                'source_file': html_filename,
                'registered_timestamp': timestamp,
                'registration_method': 'html_parsing',
            }
        }],
        'web_claims': [],  # Could be enhanced by scraping profile pages
        'extraction_metadata': {
            'heritage_relevant': staff_entry.get('heritage_relevant', False),
            'heritage_type': staff_entry.get('heritage_type'),
            'name_type': staff_entry.get('name_type', 'unknown'),
        }
    }
    # Add name correction if present
    if 'name_correction' in staff_entry:
        person_entity['extraction_metadata']['name_correction'] = staff_entry['name_correction']
    # Write entity file
    with open(entity_path, 'w', encoding='utf-8') as f:
        json.dump(person_entity, f, indent=2, ensure_ascii=False)
    return entity_path
def _stored_custodian_name(data: dict) -> str:
    """Return a custodian YAML's name as a string.

    Tolerates legacy files where 'custodian_name' was accidentally written
    as a mapping (a duplicate-key bug in an earlier version of this
    script); in that case the mapping's 'emic_name' is used.
    """
    value = data.get('custodian_name', '')
    if isinstance(value, dict):
        value = value.get('emic_name') or ''
    return value if isinstance(value, str) else ''


def _build_staff_section(staff_data: dict) -> dict:
    """Build the 'staff' mapping written into custodian YAML files."""
    return {
        'provenance': {
            'source_type': 'linkedin_company_people_page_html',
            'registered_timestamp': staff_data['source_metadata']['registered_timestamp'],
            'registration_method': 'html_parsing',
            'total_staff_extracted': len(staff_data['staff']),
        },
        'staff_list': [
            {
                'staff_id': s.get('staff_id'),
                'person_name': s.get('name'),
                'person_profile_path': f"data/custodian/person/entity/{s.get('linkedin_slug', '')}_*.json",
                'role_title': s.get('headline', ''),
                'heritage_relevant': s.get('heritage_relevant', False),
                'heritage_type': s.get('heritage_type'),
            }
            for s in staff_data['staff']
            if s.get('linkedin_slug')  # Only include staff with profiles
        ]
    }


def find_or_create_custodian(custodian_name: str, custodian_dir: Path, staff_data: dict) -> tuple[Path, bool]:
    """
    Find existing custodian YAML file or create new one.

    Matching is case-insensitive on the stored 'custodian_name'.
    Returns (file_path, is_new)
    """
    # Try to find existing custodian by name (case-insensitive)
    existing_file = None
    for custodian_file in custodian_dir.glob('*.yaml'):
        try:
            with open(custodian_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception:  # unreadable/invalid YAML: skip it, keep scanning
            continue
        if data and _stored_custodian_name(data).lower() == custodian_name.lower():
            existing_file = custodian_file
            break

    if existing_file is not None:
        # Update existing file: refresh staff section and canonical name.
        with open(existing_file, 'r', encoding='utf-8') as f:
            custodian_data = yaml.safe_load(f) or {}
        custodian_data['staff'] = _build_staff_section(staff_data)
        custodian_data['custodian_name'] = custodian_name
        with open(existing_file, 'w', encoding='utf-8') as f:
            yaml.dump(custodian_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        return (existing_file, False)

    # Create new custodian file.
    # Generate placeholder GHCID (requires geographic research).
    slug = re.sub(r'[^a-z0-9]+', '-', custodian_name.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')
    slug = slug[:30]  # Limit length
    placeholder_ghcid = f"NL-XX-XXX-PENDING-{slug.upper()}"
    custodian_data = {
        'ghcid_current': placeholder_ghcid,
        # BUG FIX: a second 'custodian_name' key used to follow below with a
        # mapping value, silently overwriting this string.  That made the
        # case-insensitive lookup above raise on re-runs (swallowed by the
        # old bare except), so duplicate custodian files were created.
        'custodian_name': custodian_name,
        'institution_type': 'MUSEUM',  # Default, will be refined based on staff
        'custodian_name_detail': {
            'emic_name': custodian_name,
            'english_name': None,
            'name_verified': True,
            'name_source': 'linkedin_html_h1',
        },
        'staff': _build_staff_section(staff_data),
        'provenance': {
            'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Comprehensive batch processing with HTML H1 name extraction',
            'confidence_score': 0.85,
            'notes': f'Staff extracted from LinkedIn company People page. Location research needed for GHCID. Total staff: {len(staff_data["staff"])}',
        }
    }
    # Determine institution type based on staff heritage analysis
    heritage_types = staff_data['staff_analysis'].get('staff_by_heritage_type', {})
    if heritage_types:
        # Find most common heritage type (Counter treats the dict as
        # type-code -> count, so most_common picks the dominant code).
        most_common = Counter(heritage_types).most_common(1)
        if most_common:
            type_code = most_common[0][0]
            type_map = {
                'M': 'MUSEUM',
                'L': 'LIBRARY',
                'A': 'ARCHIVE',
                'G': 'GALLERY',
                'R': 'RESEARCH_CENTER',
                'E': 'EDUCATION_PROVIDER',
                'S': 'COLLECTING_SOCIETY',
                'D': 'DIGITAL_PLATFORM',
            }
            if type_code in type_map:
                custodian_data['institution_type'] = type_map[type_code]
    # Create new file
    custodian_file = custodian_dir / f"{placeholder_ghcid}.yaml"
    with open(custodian_file, 'w', encoding='utf-8') as f:
        yaml.dump(custodian_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return (custodian_file, True)
def main() -> int:
    """Command-line entry point for the comprehensive batch run.

    Parses arguments, processes every *.html file in the input directory,
    writes staff JSON / person entity / custodian YAML outputs, and prints
    a summary.  Always returns 0; per-file failures are only counted.
    """
    parser = argparse.ArgumentParser(
        description='Comprehensive LinkedIn batch processing - fixes name extraction and creates full dataset'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                        help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                        help='Output directory for staff JSON files')
    parser.add_argument('--entity-dir', type=Path, required=True,
                        help='Output directory for person entity files')
    parser.add_argument('--custodian-dir', type=Path, required=True,
                        help='Directory containing custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit processing to first N files (0 = all)')
    args = parser.parse_args()
    if not args.input_dir.exists():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)
    # Create output directories
    args.output_dir.mkdir(parents=True, exist_ok=True)
    args.entity_dir.mkdir(parents=True, exist_ok=True)
    args.custodian_dir.mkdir(parents=True, exist_ok=True)
    # Get all HTML files (sorted for deterministic processing order)
    html_files = sorted(args.input_dir.glob('*.html'))
    if args.limit > 0:
        html_files = html_files[:args.limit]
    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Staff output directory: {args.output_dir}")
    print(f"Entity output directory: {args.entity_dir}")
    print(f"Custodian directory: {args.custodian_dir}")
    # Statistics accumulated across all files
    stats = {
        'total_html': len(html_files),
        'processed': 0,
        'errors': 0,
        'with_staff': 0,
        'total_staff': 0,
        'entities_created': 0,
        'custodians_updated': 0,
        'custodians_created': 0,
        'name_fixes': 0,  # Files where H1 name differs from filename
    }
    for i, html_path in enumerate(html_files, 1):
        # Any failure in one file is logged and counted; the batch continues.
        try:
            print(f"[{i}/{len(html_files)}] Processing: {html_path.name}")
            # Step 1: Parse HTML and extract staff
            result = process_html_file(html_path, args.output_dir)
            # Generate staff JSON filename ({slug}_staff_{timestamp}.json)
            slug = result['custodian_metadata']['custodian_slug']
            timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
            staff_filename = args.output_dir / f"{slug}_staff_{timestamp}.json"
            # Save staff JSON
            with open(staff_filename, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            stats['processed'] += 1
            # Step 2: Create person entity files (only for entries that
            # actually have a LinkedIn profile slug)
            staff_list = result.get('staff', [])
            staff_with_profiles = [s for s in staff_list if s.get('linkedin_slug')]
            if staff_with_profiles:
                custodian_name = result['custodian_metadata'].get('custodian_name')
                for staff_entry in staff_with_profiles:
                    entity_path = create_person_entity(
                        staff_entry,
                        custodian_name,
                        html_path.name,
                        args.entity_dir
                    )
                    if entity_path:
                        stats['entities_created'] += 1
            # Step 3: Create or update custodian YAML
            if staff_with_profiles:
                custodian_file, is_new = find_or_create_custodian(
                    result['custodian_metadata'].get('custodian_name'),
                    args.custodian_dir,
                    result
                )
                if is_new:
                    stats['custodians_created'] += 1
                else:
                    stats['custodians_updated'] += 1
                stats['with_staff'] += 1
                stats['total_staff'] += len(staff_with_profiles)
            # Check if name was fixed (H1 different from filename)
            filename_name = clean_filename_to_name(html_path.name)
            html_name = result['custodian_metadata'].get('custodian_name')
            if html_name and filename_name and html_name != filename_name:
                stats['name_fixes'] += 1
                print(f" Name fixed: '{filename_name}' -> '{html_name}'")
        except Exception as e:
            print(f"Error processing {html_path.name}: {e}", file=sys.stderr)
            stats['errors'] += 1
    # Print summary
    print("\n" + "="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"\nStatistics:")
    print(f" Total HTML files: {stats['total_html']}")
    print(f" Successfully processed: {stats['processed']}")
    print(f" Errors: {stats['errors']}")
    print(f" Institutions with staff: {stats['with_staff']}")
    print(f" Total staff extracted: {stats['total_staff']}")
    print(f" Person entities created: {stats['entities_created']}")
    print(f" Custodians updated: {stats['custodians_updated']}")
    print(f" Custodians created: {stats['custodians_created']}")
    print(f" Name fixes applied: {stats['name_fixes']}")
    print(f"\nOutput directories:")
    print(f" Staff JSON files: {args.output_dir}")
    print(f" Person entity files: {args.entity_dir}")
    print(f" Custodian YAML files: {args.custodian_dir}")
    return 0
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())