#!/usr/bin/env python3
"""
Comprehensive LinkedIn Batch Processing - Fix All Issues

This script fixes all identified issues with the previous batch processing:
1. Properly cleans filenames (removes macOS resource forks, periods, spaces, parentheses)
2. Extracts full institution name from HTML H1 tag (not from filename)
3. Re-processes all HTML files to extract correct staff data
4. Creates person entity files from staff JSON
5. Creates/updates custodian YAML files

Usage:
    python scripts/linkedin_batch_comprehensive.py \
        --input-dir /path/to/html/files \
        --output-dir data/custodian/person/bu_fixed \
        --entity-dir data/custodian/person/entity \
        --custodian-dir data/custodian/
"""

import argparse
import json
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

try:
    from bs4 import BeautifulSoup
except ImportError:
    print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
    sys.exit(1)

try:
    import yaml
except ImportError:
    print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
    sys.exit(1)

# Import existing parser (we'll enhance it)
sys.path.insert(0, str(Path(__file__).parent))
from parse_linkedin_html import parse_html_file, generate_staff_id


def _slugify(text: str) -> str:
    """Convert arbitrary text to a lowercase, hyphen-separated, URL-safe slug."""
    slug = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return re.sub(r'-+', '-', slug).strip('-')


def clean_filename_to_name(filename: str) -> str:
    """
    Clean an HTML filename down to the institution name.

    Handles:
    - macOS resource fork prefixes (._)
    - Counter prefixes: (15), ._(15), .(15), _(15), and plain "15-"
    - Underscores used as word separators
    - Extra spaces
    - The " People _ LinkedIn.html" suffix

    Examples:
        "._(15) Gemeente Enkhuizen_ People _ LinkedIn.html"
            -> "Gemeente Enkhuizen"
        "(7) ADVN _ archief voor nationale bewegingen_ People _ LinkedIn.html"
            -> "ADVN archief voor nationale bewegingen"
        "15-arabian-oud_ People _ LinkedIn.html"
            -> "arabian-oud"
    """
    # Remove " People _ LinkedIn.html" suffix
    name = filename.replace(' People _ LinkedIn.html', '')
    # Remove .html extension
    name = name.replace('.html', '')
    # Remove macOS resource fork prefix (._)
    if name.startswith('._'):
        name = name[2:]
    # Remove counter-in-parentheses prefixes: (15), ._(15), .(15), _(15)
    name = re.sub(r'^[._]*\(\d+\)\s*', '', name)
    # BUG FIX: bare numeric prefixes like "15-" were documented but never
    # stripped.  NOTE(review): this also strips a leading digit from a name
    # that legitimately begins "<digits>-"; assumed acceptable for this corpus.
    name = re.sub(r'^\d+[-_]\s*', '', name)
    # Remove leading/trailing underscores and spaces
    name = name.strip('_ ')
    # BUG FIX: interior underscores are separators in these exports; the old
    # code left them in, so the documented "ADVN archief ..." example failed.
    name = name.replace('_', ' ')
    # Collapse runs of whitespace
    name = re.sub(r'\s+', ' ', name)
    return name.strip()


def extract_institution_name_from_html(html_content: str) -> Optional[str]:
    """
    Extract the full institution name from the page's first H1 tag.

    LinkedIn H1 format: "Organization Name | LinkedIn" -- we keep the part
    before the pipe.  Returns None if no H1 is found or it is empty.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    h1 = soup.find('h1')
    if h1 is None:
        return None

    h1_text = h1.get_text().strip()
    # Remove " | LinkedIn" suffix (keep only the part before the first pipe)
    if ' | ' in h1_text:
        name = h1_text.split(' | ')[0].strip()
    else:
        name = h1_text
    # Clean up any remaining pipes/separators and collapse whitespace
    name = re.sub(r'\s*\|\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    return name if name else None


def process_html_file(html_path: Path, output_dir: Path) -> dict[str, Any]:
    """
    Process a single LinkedIn "People" HTML file and extract staff data.

    The institution name is taken from the HTML H1 tag (falling back to the
    cleaned filename), while the slug is always derived from the filename.

    :param html_path: path to the LinkedIn company People page HTML file
    :param output_dir: kept for interface compatibility; not used here
    :returns: the parser result dict with custodian/source metadata updated
    """
    with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
        html_content = f.read()

    # Prefer the name embedded in the page itself
    html_name = extract_institution_name_from_html(html_content)
    if not html_name:
        # Fallback: extract from filename
        html_name = clean_filename_to_name(html_path.name)
        print(f"Warning: H1 not found in {html_path.name}, using filename: {html_name}",
              file=sys.stderr)

    # Slug always comes from the (cleaned) filename, not the H1 text
    slug = _slugify(clean_filename_to_name(html_path.name))

    # Parse HTML using existing parser
    result = parse_html_file(html_path, html_name, slug)

    # Overwrite the custodian name with the authoritative one from the HTML
    result['custodian_metadata']['custodian_name'] = html_name
    result['custodian_metadata']['name'] = html_name
    result['source_metadata']['source_file'] = html_path.name

    return result


def create_person_entity(staff_entry: dict, custodian_name: str,
                         html_filename: str, entity_dir: Path) -> Optional[Path]:
    """
    Create a person entity JSON file from a staff entry.

    Follows Rule 20: Person Entity Profiles - Individual File Storage.

    :returns: path to the (new or pre-existing) entity file, or None for
        anonymous profiles / entries without a LinkedIn slug.
    """
    name = staff_entry.get('name', '')
    if not name or name.startswith('LinkedIn Member'):
        # Skip anonymous profiles - they don't have entity profiles
        return None

    linkedin_slug = staff_entry.get('linkedin_slug', '')
    if not linkedin_slug:
        # Without a slug we cannot build a stable, collision-free filename
        return None

    entity_dir.mkdir(parents=True, exist_ok=True)

    # BUG FIX: the old code built a fresh per-call timestamped filename and
    # then checked exists() on it, which essentially never matched -- every
    # run duplicated entity files.  Dedup on the slug prefix instead (the
    # custodian YAML references entities as "{slug}_*.json").
    existing = sorted(entity_dir.glob(f"{linkedin_slug}_*.json"))
    if existing:
        return existing[0]

    # Format: {linkedin-slug}_{ISO-timestamp}.json
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    entity_path = entity_dir / f"{linkedin_slug}_{timestamp}.json"

    person_entity = {
        'extraction_agent': 'claude-opus-4.5',
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'profile_data': {
            'person_id': staff_entry.get('staff_id'),
            'full_name': name,
            'linkedin_slug': linkedin_slug,
            'linkedin_profile_url': staff_entry.get('linkedin_profile_url'),
            'headline': staff_entry.get('headline', ''),
            'degree': staff_entry.get('degree', 'unknown'),
            'mutual_connections': staff_entry.get('mutual_connections', ''),
        },
        'affiliations': [{
            'organization_name': custodian_name,
            'organization_slug': None,  # Will be filled during custodian matching
            'role_title': staff_entry.get('headline', ''),
            'affiliation_type': 'staff',
            'affiliation_provenance': {
                'source_type': 'linkedin_company_people_page_html',
                'source_file': html_filename,
                'registered_timestamp': timestamp,
                'registration_method': 'html_parsing',
            }
        }],
        'web_claims': [],  # Could be enhanced by scraping profile pages
        'extraction_metadata': {
            'heritage_relevant': staff_entry.get('heritage_relevant', False),
            'heritage_type': staff_entry.get('heritage_type'),
            'name_type': staff_entry.get('name_type', 'unknown'),
        }
    }

    # Add name correction if present
    if 'name_correction' in staff_entry:
        person_entity['extraction_metadata']['name_correction'] = staff_entry['name_correction']

    with open(entity_path, 'w', encoding='utf-8') as f:
        json.dump(person_entity, f, indent=2, ensure_ascii=False)

    return entity_path


def _build_staff_section(staff_data: dict) -> dict:
    """Build the 'staff' YAML section shared by the create and update paths."""
    return {
        'provenance': {
            'source_type': 'linkedin_company_people_page_html',
            'registered_timestamp': staff_data['source_metadata']['registered_timestamp'],
            'registration_method': 'html_parsing',
            'total_staff_extracted': len(staff_data['staff']),
        },
        'staff_list': [
            {
                'staff_id': s.get('staff_id'),
                'person_name': s.get('name'),
                'person_profile_path': f"data/custodian/person/entity/{s.get('linkedin_slug', '')}_*.json",
                'role_title': s.get('headline', ''),
                'heritage_relevant': s.get('heritage_relevant', False),
                'heritage_type': s.get('heritage_type'),
            }
            for s in staff_data['staff']
            if s.get('linkedin_slug')  # Only include staff with profiles
        ],
    }


def find_or_create_custodian(custodian_name: str, custodian_dir: Path,
                             staff_data: dict) -> tuple[Path, bool]:
    """
    Find an existing custodian YAML file by name or create a new one.

    :returns: (file_path, is_new)
    """
    # Try to find an existing custodian by name (case-insensitive)
    existing_file = None
    for candidate in custodian_dir.glob('*.yaml'):
        try:
            with open(candidate, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, yaml.YAMLError):
            # Unreadable / malformed YAML: skip, don't abort the whole scan
            continue
        existing_name = (data or {}).get('custodian_name')
        # isinstance guard: legacy files written by the buggy version stored a
        # dict under 'custodian_name', which would crash .lower()
        if isinstance(existing_name, str) and existing_name.lower() == custodian_name.lower():
            existing_file = candidate
            break

    if existing_file is not None:
        # Update the existing file in place
        with open(existing_file, 'r', encoding='utf-8') as f:
            custodian_data = yaml.safe_load(f) or {}
        custodian_data['staff'] = _build_staff_section(staff_data)
        custodian_data['custodian_name'] = custodian_name
        with open(existing_file, 'w', encoding='utf-8') as f:
            yaml.dump(custodian_data, f, allow_unicode=True,
                      default_flow_style=False, sort_keys=False)
        return (existing_file, False)

    # Create a new custodian file with a placeholder GHCID
    # (geographic research is still required to assign the real one)
    slug = _slugify(custodian_name)[:30]  # Limit length
    placeholder_ghcid = f"NL-XX-XXX-PENDING-{slug.upper()}"

    custodian_data = {
        'ghcid_current': placeholder_ghcid,
        'custodian_name': custodian_name,
        'institution_type': 'MUSEUM',  # Default, refined from staff below
        # BUG FIX: this block previously reused the key 'custodian_name',
        # silently overwriting the plain string above (a duplicate dict key)
        # and breaking the case-insensitive lookup on later runs.
        'custodian_name_details': {
            'emic_name': custodian_name,
            'english_name': None,
            'name_verified': True,
            'name_source': 'linkedin_html_h1',
        },
        'staff': _build_staff_section(staff_data),
        'provenance': {
            'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Comprehensive batch processing with HTML H1 name extraction',
            'confidence_score': 0.85,
            'notes': f'Staff extracted from LinkedIn company People page. Location research needed for GHCID. Total staff: {len(staff_data["staff"])}',
        },
    }

    # Refine institution type from the dominant heritage type among staff
    heritage_types = staff_data['staff_analysis'].get('staff_by_heritage_type', {})
    if heritage_types:
        most_common = Counter(heritage_types).most_common(1)
        if most_common:
            type_code = most_common[0][0]
            type_map = {
                'M': 'MUSEUM',
                'L': 'LIBRARY',
                'A': 'ARCHIVE',
                'G': 'GALLERY',
                'R': 'RESEARCH_CENTER',
                'E': 'EDUCATION_PROVIDER',
                'S': 'COLLECTING_SOCIETY',
                'D': 'DIGITAL_PLATFORM',
            }
            if type_code in type_map:
                custodian_data['institution_type'] = type_map[type_code]

    custodian_file = custodian_dir / f"{placeholder_ghcid}.yaml"
    with open(custodian_file, 'w', encoding='utf-8') as f:
        yaml.dump(custodian_data, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)
    return (custodian_file, True)


def main() -> int:
    """CLI entry point: batch-process every HTML file in --input-dir."""
    parser = argparse.ArgumentParser(
        description='Comprehensive LinkedIn batch processing - fixes name extraction and creates full dataset'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                        help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                        help='Output directory for staff JSON files')
    parser.add_argument('--entity-dir', type=Path, required=True,
                        help='Output directory for person entity files')
    parser.add_argument('--custodian-dir', type=Path, required=True,
                        help='Directory containing custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit processing to first N files (0 = all)')
    args = parser.parse_args()

    if not args.input_dir.exists():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    # Create output directories
    args.output_dir.mkdir(parents=True, exist_ok=True)
    args.entity_dir.mkdir(parents=True, exist_ok=True)
    args.custodian_dir.mkdir(parents=True, exist_ok=True)

    # Get all HTML files (sorted for deterministic processing order)
    html_files = sorted(args.input_dir.glob('*.html'))
    if args.limit > 0:
        html_files = html_files[:args.limit]

    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Staff output directory: {args.output_dir}")
    print(f"Entity output directory: {args.entity_dir}")
    print(f"Custodian directory: {args.custodian_dir}")

    # Statistics
    stats = {
        'total_html': len(html_files),
        'processed': 0,
        'errors': 0,
        'with_staff': 0,
        'total_staff': 0,
        'entities_created': 0,
        'custodians_updated': 0,
        'custodians_created': 0,
        'name_fixes': 0,  # Files where H1 name differs from filename
    }

    for i, html_path in enumerate(html_files, 1):
        try:
            print(f"[{i}/{len(html_files)}] Processing: {html_path.name}")

            # Step 1: Parse HTML and extract staff
            result = process_html_file(html_path, args.output_dir)

            # Save staff JSON as {slug}_staff_{timestamp}.json
            slug = result['custodian_metadata']['custodian_slug']
            timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
            staff_filename = args.output_dir / f"{slug}_staff_{timestamp}.json"
            with open(staff_filename, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            stats['processed'] += 1

            # Step 2: Create person entity files (profiles with a slug only)
            staff_list = result.get('staff', [])
            staff_with_profiles = [s for s in staff_list if s.get('linkedin_slug')]
            if staff_with_profiles:
                custodian_name = result['custodian_metadata'].get('custodian_name')
                for staff_entry in staff_with_profiles:
                    entity_path = create_person_entity(
                        staff_entry, custodian_name, html_path.name, args.entity_dir
                    )
                    if entity_path:
                        stats['entities_created'] += 1

            # Step 3: Create or update the custodian YAML
            if staff_with_profiles:
                custodian_file, is_new = find_or_create_custodian(
                    result['custodian_metadata'].get('custodian_name'),
                    args.custodian_dir,
                    result
                )
                if is_new:
                    stats['custodians_created'] += 1
                else:
                    stats['custodians_updated'] += 1
                stats['with_staff'] += 1
                stats['total_staff'] += len(staff_with_profiles)

            # Check if name was fixed (H1 different from filename)
            filename_name = clean_filename_to_name(html_path.name)
            html_name = result['custodian_metadata'].get('custodian_name')
            if html_name and filename_name and html_name != filename_name:
                stats['name_fixes'] += 1
                print(f"  Name fixed: '{filename_name}' -> '{html_name}'")

        except Exception as e:
            # Best-effort batch: record the failure and keep going
            print(f"Error processing {html_path.name}: {e}", file=sys.stderr)
            stats['errors'] += 1

    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print(f"\nStatistics:")
    print(f"  Total HTML files: {stats['total_html']}")
    print(f"  Successfully processed: {stats['processed']}")
    print(f"  Errors: {stats['errors']}")
    print(f"  Institutions with staff: {stats['with_staff']}")
    print(f"  Total staff extracted: {stats['total_staff']}")
    print(f"  Person entities created: {stats['entities_created']}")
    print(f"  Custodians updated: {stats['custodians_updated']}")
    print(f"  Custodians created: {stats['custodians_created']}")
    print(f"  Name fixes applied: {stats['name_fixes']}")
    print(f"\nOutput directories:")
    print(f"  Staff JSON files: {args.output_dir}")
    print(f"  Person entity files: {args.entity_dir}")
    print(f"  Custodian YAML files: {args.custodian_dir}")

    return 0


if __name__ == '__main__':
    sys.exit(main())