#!/usr/bin/env python3
"""
Fast LinkedIn H1 Name Extraction

This is a FAST version that:
1. Extracts H1 institution names from HTML files
2. Cleans filenames properly (removes macOS resource forks, periods, parentheses)
3. Creates custodian YAML files with basic metadata
4. Does NOT extract detailed staff (too slow for 3335 files)

This solves the critical issues:
- Name extraction from H1 tags (not filenames)
- Proper filename cleaning

Usage:
    python scripts/linkedin_h1_fast.py \
        --input-dir /path/to/html/files \
        --output-dir data/custodian/
"""

import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

try:
    from bs4 import BeautifulSoup
except ImportError:
    print("Error: beautifulsoup4 not installed. Run: pip install beautifulsoup4", file=sys.stderr)
    sys.exit(1)

try:
    import yaml
except ImportError:
    print("Error: yaml not installed. Run: pip install pyyaml", file=sys.stderr)
    sys.exit(1)


def _clean_filename_base(filename: str) -> str:
    """
    Strip LinkedIn export decorations from an HTML filename.

    Removes, in order:
    - the " People _ LinkedIn.html" suffix (and a bare ".html" suffix)
    - the macOS resource-fork prefix ("._")
    - leading duplicate-download markers such as "(15) ", "._(15) ", ".(15) "
    - stray leading/trailing underscores and spaces
    - runs of whitespace (collapsed to a single space)

    Shared by slug generation and by the filename-fallback name extraction
    so both paths clean names identically.
    """
    name = filename.replace(' People _ LinkedIn.html', '')
    name = name.replace('.html', '')
    # Remove macOS resource fork prefix (._)
    if name.startswith('._'):
        name = name[2:]
    # Remove leading period/underscore followed by "(N)": ._(15), .(15), _(15), (15)
    name = re.sub(r'^\.?\_?\(\d+\)\s*', '', name)
    name = re.sub(r'^\._*\(\d+\)\s*', '', name)
    # Remove stray leading/trailing underscores and spaces
    name = name.strip('_ ')
    # Collapse internal whitespace
    name = re.sub(r'\s+', ' ', name)
    return name


def clean_filename_to_slug(filename: str) -> str:
    """
    Clean HTML filename to generate a URL-safe slug.

    Examples:
        "._(15) Gemeente Enkhuizen_ People _ LinkedIn.html"
            -> "gemeente-enkhuizen"
        "(7) ADVN _ archief voor nationale bewegingen_ People _ LinkedIn.html"
            -> "advn-archief-voor-nationale-bewegingen"
    """
    name = _clean_filename_base(filename)
    # Convert to URL-safe slug: lowercase, non-alphanumerics -> single dashes
    slug = re.sub(r'[^a-z0-9]+', '-', name.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')
    return slug


def extract_h1_name(html_content: str) -> Optional[str]:
    """
    Extract the institution name from the page's first H1 tag.

    LinkedIn H1 format is typically "Organization Name | LinkedIn";
    the part before the pipe is returned. Any remaining pipe
    separators are collapsed to spaces.

    Returns None if no H1 is found or the H1 text is empty.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    h1 = soup.find('h1')
    if h1 is None:
        return None
    h1_text = h1.get_text().strip()
    # Remove " | LinkedIn" suffix (keep only the part before the first pipe)
    if ' | ' in h1_text:
        name = h1_text.split(' | ')[0].strip()
    else:
        name = h1_text
    # Clean up any extra pipes or separators left over
    name = re.sub(r'\s*\|\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    return name if name else None


def process_single_file(html_path: Path, output_dir: Path) -> dict:
    """
    Process a single LinkedIn "People" HTML export.

    Extracts the institution name (H1 first, filename as fallback),
    scrapes basic metadata (follower count, associated members, profile
    card count) and builds the custodian record.

    Args:
        html_path: Path to the HTML file to process.
        output_dir: Output directory (kept for interface compatibility;
            the caller performs the actual YAML write).

    Returns:
        dict with keys: status, slug, filename, custodian_name,
        custodian_data.
    """
    # Read HTML (errors='replace' tolerates mixed/broken encodings in exports)
    with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
        html_content = f.read()

    # Extract name from H1; fall back to the cleaned filename.
    # BUG FIX: name_source was previously hard-coded to 'linkedin_html_h1'
    # even when the fallback was used, which broke main()'s source statistics.
    h1_name = extract_h1_name(html_content)
    if h1_name:
        name_source = 'linkedin_html_h1'
    else:
        h1_name = _clean_filename_base(html_path.name)
        name_source = 'filename_fallback'

    # Generate slug from the original filename
    slug = clean_filename_to_slug(html_path.name)

    # Try to extract basic metadata
    follower_count = ''
    associated_members = 0

    # Look for follower count (e.g., "86K followers").
    # NOTE(review): "\d+K?" does not capture "86,450" or "1.2M" style counts;
    # widen the pattern if such values appear in the corpus.
    follower_match = re.search(r'(\d+K?)\s+followers?', html_content, re.IGNORECASE)
    if follower_match:
        follower_count = follower_match.group(1)

    # Look for associated members
    member_match = re.search(r'(\d+)\s+associated\s+members?', html_content, re.IGNORECASE)
    if member_match:
        associated_members = int(member_match.group(1))

    # Count staff mentions (rough count of LinkedIn profile cards)
    profile_count = len(re.findall(r'org-people-profile-card', html_content))

    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # BUG FIX: the original dict listed 'custodian_name' twice (a plain
    # string, then a dict) — the string entry was silently discarded by
    # Python's dict literal semantics. Only the structured form is kept.
    custodian_data = {
        'ghcid_current': f"NL-XX-XXX-PENDING-{slug.upper()}",
        'institution_type': 'MUSEUM',  # Default, can be refined later
        'custodian_name': {
            'emic_name': h1_name,
            'english_name': None,
            # Only H1-extracted names are considered verified
            'name_verified': name_source == 'linkedin_html_h1',
            'name_source': name_source,
        },
        'linkedin_enrichment': {
            'source_file': html_path.name,
            'extraction_date': timestamp,
            'follower_count': follower_count,
            'associated_members': associated_members,
            'profile_cards_detected': profile_count,
            'source_type': 'linkedin_company_people_page_html',
            'extraction_method': 'h1_name_extraction_only',
        },
        'provenance': {
            'data_source': 'LINKEDIN_HTML_PEOPLE_PAGE',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': timestamp,
            'extraction_method': 'Fast H1 name extraction',
            'confidence_score': 0.90,
            'notes': f'H1 institution name extracted from HTML. Profile cards detected: {profile_count}. Detailed staff extraction not performed due to performance constraints.',
        },
    }

    return {
        'status': 'success',
        'slug': slug,
        'filename': html_path.name,
        'custodian_name': h1_name,
        'custodian_data': custodian_data,
    }


def main():
    """CLI entry point: process every HTML file in --input-dir, write one
    custodian YAML per file into --output-dir, and save a JSON report."""
    parser = argparse.ArgumentParser(
        description='Fast LinkedIn H1 name extraction - solves name extraction issues'
    )
    parser.add_argument('--input-dir', type=Path, required=True,
                        help='Directory containing LinkedIn HTML files')
    parser.add_argument('--output-dir', type=Path, required=True,
                        help='Output directory for custodian YAML files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit processing to first N files (0 = all)')

    args = parser.parse_args()

    if not args.input_dir.exists():
        print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
        sys.exit(1)

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Get all HTML files (sorted for deterministic processing order)
    html_files = sorted(args.input_dir.glob('*.html'))
    if args.limit > 0:
        html_files = html_files[:args.limit]

    print(f"Processing {len(html_files)} HTML files...")
    print(f"Input directory: {args.input_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"This will extract H1 names and create custodian YAMLs")
    print(f"Estimated time: ~{len(html_files)} seconds (~{len(html_files)//60} minutes)")

    # Statistics
    stats = {
        'total': len(html_files),
        'success': 0,
        'errors': 0,
        'name_from_h1': 0,
        'name_from_filename': 0,
        'with_profiles': 0,
        'total_profiles_detected': 0,
    }

    # Process files
    for i, html_path in enumerate(html_files, 1):
        try:
            if i % 100 == 0:
                print(f"Progress: [{i}/{len(html_files)}]", end='\r')

            result = process_single_file(html_path, args.output_dir)

            if result['status'] == 'success':
                stats['success'] += 1
                stats['total_profiles_detected'] += (
                    result['custodian_data']
                    .get('linkedin_enrichment', {})
                    .get('profile_cards_detected', 0)
                )

                # Save custodian YAML
                custodian_file = args.output_dir / f"{result['slug']}.yaml"
                with open(custodian_file, 'w', encoding='utf-8') as f:
                    yaml.dump(result['custodian_data'], f,
                              allow_unicode=True, default_flow_style=False,
                              sort_keys=False)

                # Track name source (now meaningful: process_single_file
                # reports 'filename_fallback' when the H1 was missing)
                name_source = (result['custodian_data']
                               .get('custodian_name', {})
                               .get('name_source', ''))
                if 'linkedin_html_h1' in name_source:
                    stats['name_from_h1'] += 1
                else:
                    stats['name_from_filename'] += 1
            elif result['status'] == 'error':
                stats['errors'] += 1
                print(f"Error: {result['filename']}: {result.get('error')}", file=sys.stderr)

        except Exception as e:
            stats['errors'] += 1
            print(f"Error: {html_path.name}: {e}", file=sys.stderr)

    print(f"\nProcessing complete!")

    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print(f"\nStatistics:")
    print(f"  Total HTML files: {stats['total']}")
    print(f"  Successfully processed: {stats['success']}")
    print(f"  Errors: {stats['errors']}")
    print(f"  Names from H1: {stats['name_from_h1']}")
    print(f"  Names from filename: {stats['name_from_filename']}")
    print(f"  Total profiles detected: {stats['total_profiles_detected']}")
    print(f"\nOutput directory: {args.output_dir}")

    # Save processing report
    report = {
        'processing_date': datetime.now(timezone.utc).isoformat(),
        'input_directory': str(args.input_dir),
        'output_directory': str(args.output_dir),
        'statistics': stats,
    }

    report_file = Path('reports/linkedin_h1_fast_report.json')
    report_file.parent.mkdir(parents=True, exist_ok=True)
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {report_file}")

    return 0


if __name__ == '__main__':
    sys.exit(main())