#!/usr/bin/env python3
"""
Batch process all LinkedIn company People HTML files from manual directory.

This script:
1. Scans manual directory for all HTML files
2. Extracts institution names from filenames
3. Runs parse_linkedin_html.py for each file
4. Creates person entity files for staff members that are heritage-relevant
   or have a LinkedIn profile URL
5. Creates or updates custodian YAML files

Usage:
    python scripts/batch_parse_linkedin_manual.py [--limit N]

Options:
    --limit N    Only process first N files (for testing); 0 means no limit
"""

import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

try:
    import yaml
except ImportError:
    yaml = None

# Directory paths
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
PERSON_ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

# Parser invoked once per HTML file by parse_html_file().
PARSER_SCRIPT = Path("/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py")

# Repository-relative directory recorded in custodian files for person entities.
PERSON_ENTITY_RELATIVE_DIR = "data/custodian/person/entity"

# Timestamp layout shared by person entity filenames and the paths that
# custodian files record for them; both must use the same value to agree.
TIMESTAMP_FORMAT = '%Y%m%dT%H%M%SZ'

# Map single-letter heritage codes to GLAMORCUBESFIXPHDNT institution types.
INSTITUTION_TYPE_BY_CODE = {
    'G': 'GALLERY',
    'L': 'LIBRARY',
    'A': 'ARCHIVE',
    'M': 'MUSEUM',
    'O': 'OFFICIAL_INSTITUTION',
    'R': 'RESEARCH_CENTER',
    'C': 'CORPORATION',
    'E': 'EDUCATION_PROVIDER',
    'S': 'COLLECTING_SOCIETY',
    'D': 'DIGITAL_PLATFORM',
    'I': 'INTANGIBLE_HERITAGE_GROUP',
    'T': 'TASTE_SMELL',
    'B': 'BOTANICAL_ZOO',
    'H': 'HOLY_SITES',
    'F': 'FEATURES',
    'N': 'NGO',
    'X': 'MIXED',
    'P': 'PERSONAL_COLLECTION',
    'U': 'UNKNOWN',
}


def _utc_timestamp() -> str:
    """Return the current UTC time formatted with TIMESTAMP_FORMAT."""
    return datetime.now(timezone.utc).strftime(TIMESTAMP_FORMAT)


def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _is_person(staff_member: dict) -> bool:
    """Return True unless the entry's name_type marks it as an organization."""
    return staff_member.get('name_type') != 'organization'


def _gets_person_entity(staff_member: dict) -> bool:
    """Return True when main() will write a person entity file for this entry.

    An entry qualifies when it is a person, carries a ``staff_id`` (required by
    create_person_entity) and is heritage-relevant or has a LinkedIn URL.
    """
    if not _is_person(staff_member) or not staff_member.get('staff_id'):
        return False
    return bool(
        staff_member.get('heritage_relevant')
        or staff_member.get('linkedin_profile_url')
    )


def _person_entity_filename(person_id: Any, timestamp: str) -> str:
    """Return the person entity filename for *person_id* at *timestamp*."""
    return f"{person_id}_{timestamp}.json"


def extract_institution_name_from_filename(filename: str) -> Optional[str]:
    """Extract institution name from LinkedIn People HTML filename.

    Strips the ``.html`` extension, the trailing ``People _ LinkedIn`` marker,
    a leading ``(N)`` counter, redundant whitespace and trailing underscores.

    Returns:
        The cleaned name, or None when nothing is left after cleaning.
    """
    name = Path(filename).name
    name = name.replace('.html', '')
    name = re.sub(r'_?People _ LinkedIn$', '', name)
    name = re.sub(r'^\(\d+\)\s*', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    # Strip underscores AND the spaces between/after them; stripping only '_'
    # left a trailing space for names such as "Foo _ People _ LinkedIn".
    name = name.rstrip('_ ')
    return name if name else None


def generate_slug_from_name(name: str) -> str:
    """Generate URL-friendly slug from institution name.

    Accented letters are folded to their base letter (``é`` -> ``e``) before
    every character other than ``a-z``, ``0-9``, whitespace and ``-`` is
    dropped. Runs of whitespace and hyphens collapse into a single hyphen.
    """
    decomposed = unicodedata.normalize('NFKD', name)
    # Drop combining marks so the base letter survives the filter below.
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    slug = without_marks.lower()
    slug = re.sub(r'[^a-z0-9\s-]', '', slug)
    slug = re.sub(r'[\s-]+', '-', slug)
    return slug.strip('-')


def parse_html_file(html_path: Path, institution_name: str, slug: str) -> Optional[dict]:
    """Parse a single HTML file using parse_linkedin_html.py script.

    The parser runs as a subprocess with a 60 second timeout and writes its
    JSON result into a private temporary directory that is removed before this
    function returns.

    Returns:
        The decoded JSON document, or None when the parser fails, times out,
        or its output cannot be read. Failures are reported on stderr.
    """
    try:
        with tempfile.TemporaryDirectory(prefix='linkedin_staff_') as tmp_dir:
            output_path = Path(tmp_dir) / f"{slug}_staff_{_utc_timestamp()}.json"
            result = subprocess.run(
                [
                    sys.executable,
                    str(PARSER_SCRIPT),
                    str(html_path),
                    "--custodian-name", institution_name,
                    "--custodian-slug", slug,
                    "--output", str(output_path),
                ],
                capture_output=True,
                text=True,
                timeout=60,
            )

            if result.returncode != 0:
                print(f"ERROR parsing {html_path.name}: {result.stderr}", file=sys.stderr)
                return None

            with open(output_path, 'r', encoding='utf-8') as f:
                return json.load(f)
    except subprocess.TimeoutExpired:
        print(f"TIMEOUT parsing {html_path.name}", file=sys.stderr)
        return None
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"ERROR parsing {html_path.name}: {e}", file=sys.stderr)
        return None


def find_existing_custodian(institution_name: str) -> Optional[Path]:
    """Find existing custodian YAML file by institution name.

    Scans ``CUSTODIAN_DIR/*.yaml`` and compares *institution_name*
    case-insensitively with the ``name`` and ``alternative_names`` of each
    file's record (the first element when the file holds a list). Files that
    cannot be read or parsed are skipped.

    Returns:
        The first matching file, or None when nothing matches or PyYAML is
        not installed.
    """
    if not yaml:
        return None

    wanted = institution_name.lower()
    for yaml_file in CUSTODIAN_DIR.glob("*.yaml"):
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, ValueError, yaml.YAMLError):
            # Best-effort scan: an unreadable file must not abort the lookup.
            continue

        if isinstance(data, list) and len(data) > 0:
            data = data[0]
        if not isinstance(data, dict) or 'name' not in data:
            continue

        name = data.get('name')
        if isinstance(name, str) and name.lower() == wanted:
            return yaml_file

        alt_names = data.get('alternative_names', [])
        if isinstance(alt_names, list):
            if any(str(alt).lower() == wanted for alt in alt_names):
                return yaml_file

    return None


def create_person_entity(staff_member: dict, custodian_slug: str, custodian_name: str,
                         institution_type: str,
                         timestamp: Optional[str] = None) -> Optional[str]:
    """
    Create a person entity file following Rule 20 (Person Entity Profiles).

    Args:
        staff_member: Parsed staff record; must contain ``staff_id``.
        custodian_slug: Slug of the institution the person is affiliated with.
        custodian_name: Display name of that institution.
        institution_type: Institution type stored under ``heritage_relevance``.
        timestamp: Filename timestamp in TIMESTAMP_FORMAT. Pass the value also
            given to create_or_update_custodian so the path recorded there
            matches this file. Defaults to the current UTC time.

    Returns path to created file or None on error.
    """
    person_id = staff_member.get('staff_id')
    if not person_id:
        return None

    if timestamp is None:
        timestamp = _utc_timestamp()
    output_path = PERSON_ENTITY_DIR / _person_entity_filename(person_id, timestamp)

    person_entity = {
        'person_id': person_id,
        'extraction_metadata': {
            'extraction_agent': 'claude-opus-4.5',
            'extraction_date': _utc_now_iso(),
            'extraction_source': f'LinkedIn company page: {custodian_name}',
            'source_url': staff_member.get('linkedin_profile_url'),
        },
        'profile_data': {
            'full_name': staff_member.get('name'),
            'name_type': staff_member.get('name_type'),
            'headline': staff_member.get('headline', ''),
            'linkedin_slug': staff_member.get('linkedin_slug'),
            'linkedin_profile_url': staff_member.get('linkedin_profile_url'),
            'connection_degree': staff_member.get('degree'),
            'mutual_connections': staff_member.get('mutual_connections', ''),
        },
        'heritage_relevance': {
            'is_heritage_relevant': staff_member.get('heritage_relevant', False),
            'heritage_type': staff_member.get('heritage_type'),
            'custodian_name': custodian_name,
            'institution_type': institution_type,
        },
        'affiliations': [
            {
                'custodian_name': custodian_name,
                'custodian_slug': custodian_slug,
                'role_title': staff_member.get('headline', ''),
                'affiliation_type': 'employment',
                'affiliation_source': 'LinkedIn company page',
                'affiliation_source_url': staff_member.get('linkedin_profile_url', ''),
            }
        ],
    }

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(person_entity, f, indent=2, ensure_ascii=False)
        return str(output_path)
    except (OSError, TypeError, ValueError) as e:
        print(f"ERROR creating person entity: {e}", file=sys.stderr)
        return None


def _infer_institution_type(parse_result: dict) -> str:
    """Return the institution type implied by the most common heritage code.

    Reads ``staff_analysis.staff_by_heritage_type`` and maps the code with the
    largest positive count through INSTITUTION_TYPE_BY_CODE. Ties keep the
    code listed first. Returns 'UNKNOWN' when no code has a positive count.
    """
    staff_analysis = parse_result.get('staff_analysis') or {}
    heritage_types = staff_analysis.get('staff_by_heritage_type') or {}

    best_code = None
    best_count = 0
    for code, count in heritage_types.items():
        if isinstance(count, (int, float)) and count > best_count:
            best_code, best_count = code, count

    if best_code is None:
        return 'UNKNOWN'
    return INSTITUTION_TYPE_BY_CODE.get(best_code, 'UNKNOWN')


def _build_staff_section(staff_list: List[dict], html_file: str,
                         source_metadata: dict, timestamp: str) -> Dict[str, Any]:
    """Build the ``staff`` section stored in a custodian record.

    Organization entries are excluded. ``linkedin_profile_path`` is set only
    for entries that get a person entity file, so the record never points at
    a file that is not written; it is None otherwise.
    """
    people = [s for s in staff_list if _is_person(s)]
    retrieved_on = _utc_now_iso()

    def profile_path(staff_member: dict) -> Optional[str]:
        if not _gets_person_entity(staff_member):
            return None
        filename = _person_entity_filename(staff_member.get('staff_id'), timestamp)
        return f"{PERSON_ENTITY_RELATIVE_DIR}/{filename}"

    return {
        'staff_count': len(people),
        'staff_source': {
            'source_type': 'linkedin_company_people_page_html',
            'source_file': html_file,
            'registered_timestamp': source_metadata.get('registered_timestamp'),
            'registration_method': 'html_parsing',
        },
        'staff': [
            {
                'person_id': s.get('staff_id'),
                'person_name': s.get('name'),
                'role_title': s.get('headline', ''),
                'linkedin_profile_path': profile_path(s),
                'affiliation_provenance': {
                    'source': 'LinkedIn company page',
                    'source_url': s.get('linkedin_profile_url', ''),
                    'retrieved_on': retrieved_on,
                },
            }
            for s in people
        ],
    }


def _load_records(path: Path) -> Any:
    """Load a custodian file as YAML, or as JSON when PyYAML is unavailable."""
    with open(path, 'r', encoding='utf-8') as f:
        if yaml:
            return yaml.safe_load(f)
        return json.load(f)


def _write_records(path: Path, records: Any) -> None:
    """Write *records* to *path* as YAML (JSON without PyYAML), atomically.

    The data is serialized to a sibling temporary file that then replaces
    *path*, so a failed dump cannot leave a truncated custodian file behind.
    """
    tmp_path = path.with_name(path.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            if yaml:
                yaml.dump(records, f, allow_unicode=True, sort_keys=False,
                          default_flow_style=False)
            else:
                json.dump(records, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, path)
    finally:
        # Only still present when serialization or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()


def create_or_update_custodian(institution_name: str, slug: str, parse_result: dict,
                               html_file: str,
                               timestamp: Optional[str] = None) -> Optional[Path]:
    """
    Create new custodian YAML file or update existing one.

    When a custodian with this name already exists, its ``staff`` section is
    replaced and ``provenance.last_updated`` refreshed; the file keeps its
    original shape (single record or list) and any further list entries.
    Otherwise a new file with a placeholder GHCID is written.

    Args:
        institution_name: Name used to look up or label the custodian.
        slug: Slug used in the filename of a newly created custodian.
        parse_result: Document returned by parse_html_file.
        html_file: Name of the source HTML file, recorded as provenance.
        timestamp: Timestamp in TIMESTAMP_FORMAT used for the recorded person
            entity paths. Pass the value also given to create_person_entity.
            Defaults to the current UTC time.

    Returns path to custodian file.

    Raises:
        ValueError: If the existing custodian file does not hold a mapping.
    """
    existing_file = find_existing_custodian(institution_name)

    custodian_metadata = parse_result.get('custodian_metadata') or {}
    staff_list = parse_result.get('staff') or []
    source_metadata = parse_result.get('source_metadata') or {}

    if timestamp is None:
        timestamp = _utc_timestamp()
    staff_section = _build_staff_section(staff_list, html_file, source_metadata, timestamp)
    staff_count = staff_section['staff_count']

    if existing_file:
        print(f" UPDATING: {existing_file.name}")

        loaded = _load_records(existing_file)
        record = loaded[0] if isinstance(loaded, list) and loaded else loaded
        if not isinstance(record, dict):
            raise ValueError(
                f"Custodian file {existing_file.name} does not contain a mapping"
            )

        # Add or update staff section
        record['staff'] = staff_section
        provenance = record.get('provenance')
        if not isinstance(provenance, dict):
            # Covers a missing key as well as an explicit null in the file.
            provenance = {}
        provenance['last_updated'] = _utc_now_iso()
        record['provenance'] = provenance

        # ``record`` is loaded[0] when the file holds a list, so writing
        # ``loaded`` back keeps every other entry of that list.
        _write_records(existing_file, loaded if isinstance(loaded, list) else record)
        return existing_file

    # Create new custodian file
    institution_type = _infer_institution_type(parse_result)
    output_filename = f"NL-XX-UNKNOWN-{institution_type[0:3]}-{slug}.yaml"
    output_path = CUSTODIAN_DIR / output_filename
    print(f" CREATING: {output_filename} (placeholder GHCID)")

    custodian_data = {
        'name': institution_name,
        'institution_type': institution_type,
        'description': f"Institution profile extracted from LinkedIn company page. Industry: {custodian_metadata.get('industry', 'Unknown')}",
        'ghcid': {
            'ghcid_current': 'NL-XX-XXX-PENDING',  # Placeholder - needs research
            'location_resolution': {
                'method': 'PENDING',
                'notes': 'GHCID not assigned - requires geographic research',
            },
        },
        'staff': staff_section,
        'provenance': {
            'data_source': 'LINKEDIN_COMPANY_PAGE',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': _utc_now_iso(),
            'extraction_method': 'Batch HTML parsing from manual directory',
            'confidence_score': 0.70,
            'source_metadata': {
                'linkedin_page_type': 'company_people_page',
                'source_file': html_file,
                'staff_extracted': staff_count,
            },
        },
    }

    _write_records(output_path, [custodian_data])
    print(f" → Created: {output_filename}")
    return output_path


def _parse_limit(argv: List[str]) -> Optional[int]:
    """Return the value of ``--limit`` from *argv*, or None when absent.

    Unknown arguments are ignored. A non-integer, missing or negative value
    ends the program with a usage error.
    """
    parser = argparse.ArgumentParser(
        description="Batch process LinkedIn company People HTML files."
    )
    parser.add_argument(
        '--limit', type=int, default=None, metavar='N',
        help="Only process first N files (for testing); 0 means no limit",
    )
    args, _unknown = parser.parse_known_args(argv)
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must not be negative")
    return args.limit


def main():
    """Main batch processing function.

    Processes every ``*.html`` file in MANUAL_DIR (optionally only the first
    N), writes custodian and person entity files, and prints running and
    final statistics. A failure on one file is recorded and does not stop
    the batch.

    Returns:
        0, used as the process exit status.
    """
    # Parse command line arguments
    limit = _parse_limit(sys.argv[1:])

    # Ensure output directories exist
    PERSON_ENTITY_DIR.mkdir(parents=True, exist_ok=True)
    CUSTODIAN_DIR.mkdir(parents=True, exist_ok=True)

    # Get all HTML files
    all_html_files = sorted(MANUAL_DIR.glob("*.html"))
    html_files = all_html_files

    if limit:
        html_files = all_html_files[:limit]
        print(f"LIMIT MODE: Processing first {limit} files (of {len(all_html_files)} total)")

    print(f"\nFound {len(html_files)} HTML files to process")
    print(f"Input directory: {MANUAL_DIR}")
    print("Output directories:")
    print(f" - Person entities: {PERSON_ENTITY_DIR}")
    print(f" - Custodian files: {CUSTODIAN_DIR}")
    print("\nStarting batch processing...")
    print()

    # Track statistics
    stats = {
        'total_files': len(html_files),
        'processed': 0,
        'errors': 0,
        'new_custodians': 0,
        'existing_custodians': 0,
        'total_staff': 0,
        'person_entities_created': 0,
        'anonymous_members': 0,
        'heritage_relevant_staff': 0,
        'custodians_by_type': defaultdict(int),
        'errors_list': [],
    }

    # Process each HTML file
    for i, html_file in enumerate(html_files, 1):
        try:
            print(f"[{i}/{len(html_files)}] Processing: {html_file.name}")

            # Extract institution name from filename
            institution_name = extract_institution_name_from_filename(html_file.name)
            if not institution_name:
                print(" SKIP: Could not extract name from filename")
                stats['errors'] += 1
                stats['errors_list'].append(html_file.name)
                continue

            # Generate slug
            slug = generate_slug_from_name(institution_name)

            # Parse HTML file
            parse_result = parse_html_file(html_file, institution_name, slug)
            if not parse_result:
                stats['errors'] += 1
                stats['errors_list'].append(html_file.name)
                continue

            stats['processed'] += 1

            staff_list = parse_result.get('staff') or []
            staff_analysis = parse_result.get('staff_analysis') or {}

            stats['total_staff'] += staff_analysis.get('total_staff_extracted', 0)
            stats['anonymous_members'] += staff_analysis.get('anonymous_members', 0)
            stats['heritage_relevant_staff'] += staff_analysis.get('heritage_relevant_count', 0)

            # One timestamp per file, shared by the custodian record and the
            # person entity files, so recorded paths match the written files.
            run_timestamp = _utc_timestamp()

            # Must be checked BEFORE writing: afterwards a freshly created
            # custodian would be found and counted as existing.
            already_existed = find_existing_custodian(institution_name) is not None

            # Create or update custodian
            custodian_file = create_or_update_custodian(
                institution_name, slug, parse_result, html_file.name,
                timestamp=run_timestamp,
            )
            if custodian_file:
                if already_existed:
                    stats['existing_custodians'] += 1
                else:
                    stats['new_custodians'] += 1

                # Track institution type, once per custodian
                stats['custodians_by_type'][_infer_institution_type(parse_result)] += 1

            # Create person entity files for each staff member
            custodian_metadata = parse_result.get('custodian_metadata') or {}
            person_institution_type = custodian_metadata.get('institution_type', 'UNKNOWN')
            for staff_member in staff_list:
                # Only create person entity if heritage-relevant or has LinkedIn URL
                if not _gets_person_entity(staff_member):
                    continue
                person_file = create_person_entity(
                    staff_member,
                    slug,
                    institution_name,
                    person_institution_type,
                    timestamp=run_timestamp,
                )
                if person_file:
                    stats['person_entities_created'] += 1

            if i % 50 == 0 or i == len(html_files):
                print()
                print(f"Progress: {i}/{len(html_files)} files processed")
                print(f" New custodians: {stats['new_custodians']}")
                print(f" Existing custodians: {stats['existing_custodians']}")
                print(f" Total staff extracted: {stats['total_staff']}")
                print(f" Person entities created: {stats['person_entities_created']}")
                print()

        except Exception as e:
            # Batch boundary: record the failure and carry on with the next file.
            print(f"ERROR processing {html_file.name}: {e}", file=sys.stderr)
            stats['errors'] += 1
            stats['errors_list'].append(f"{html_file.name}: {e}")
            continue

    # Print final statistics
    print("\n" + "="*60)
    print("BATCH PROCESSING COMPLETE")
    print("="*60)
    print(f"Total files: {stats['total_files']}")
    print(f"Successfully processed: {stats['processed']}")
    print(f"Errors: {stats['errors']}")
    if stats['errors'] > 0 and stats['errors_list']:
        print("\nError details:")
        for err in stats['errors_list'][:10]:
            print(f" - {err}")
        if len(stats['errors_list']) > 10:
            print(f" ... and {len(stats['errors_list']) - 10} more errors")
    print()
    print(f"New custodians: {stats['new_custodians']}")
    print(f"Existing custodians: {stats['existing_custodians']}")
    print()
    print(f"Total staff extracted: {stats['total_staff']}")
    print(f"Heritage-relevant staff: {stats['heritage_relevant_staff']}")
    print(f"Anonymous members: {stats['anonymous_members']}")
    print(f"Person entity files created: {stats['person_entities_created']}")
    print()
    print("Custodians by type:")
    for ctype, count in sorted(stats['custodians_by_type'].items()):
        print(f" {ctype}: {count}")
    print("="*60)

    return 0


if __name__ == '__main__':
    sys.exit(main())