# glam/scripts/batch_parse_linkedin_manual.py
# Export metadata: 2025-12-30 03:43:31 +01:00 — 501 lines, 19 KiB, Python.
#!/usr/bin/env python3
"""
Batch process all LinkedIn company People HTML files from manual directory.
This script:
1. Scans manual directory for all HTML files
2. Extracts institution names from filenames
3. Runs parse_linkedin_html.py for each file
4. Creates person entity files for each staff member
5. Creates or updates custodian YAML files
Usage:
python scripts/batch_parse_linkedin_manual.py [--limit N]
Options:
--limit N Only process first N files (for testing)
"""
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
try:
import yaml
except ImportError:
yaml = None
# Directory paths
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
PERSON_ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
def extract_institution_name_from_filename(filename: str) -> Optional[str]:
"""Extract institution name from LinkedIn People HTML filename."""
name = Path(filename).name
name = name.replace('.html', '')
name = re.sub(r'_?People _ LinkedIn$', '', name)
name = re.sub(r'^\(\d+\)\s*', '', name)
name = re.sub(r'\s+', ' ', name).strip()
name = name.rstrip('_')
return name if name else None
def generate_slug_from_name(name: str) -> str:
"""Generate URL-friendly slug from institution name."""
slug = name.lower()
slug = re.sub(r'[^a-z0-9\s-]', '', slug)
slug = re.sub(r'[\s-]+', '-', slug)
slug = slug.strip('-')
return slug
def parse_html_file(html_path: Path, institution_name: str, slug: str) -> Optional[dict]:
"""Parse a single HTML file using parse_linkedin_html.py script."""
output_path = Path(f"/tmp/{slug}_staff_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json")
try:
result = subprocess.run(
[
sys.executable,
"/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
str(html_path),
"--custodian-name", institution_name,
"--custodian-slug", slug,
"--output", str(output_path)
],
capture_output=True,
text=True,
timeout=60
)
if result.returncode != 0:
print(f"ERROR parsing {html_path.name}: {result.stderr}", file=sys.stderr)
return None
with open(output_path, 'r', encoding='utf-8') as f:
return json.load(f)
except subprocess.TimeoutExpired:
print(f"TIMEOUT parsing {html_path.name}", file=sys.stderr)
return None
except Exception as e:
print(f"ERROR parsing {html_path.name}: {e}", file=sys.stderr)
return None
def find_existing_custodian(institution_name: str) -> Optional[Path]:
"""Find existing custodian YAML file by institution name."""
if not yaml:
return None
for yaml_file in CUSTODIAN_DIR.glob("*.yaml"):
try:
with open(yaml_file, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if isinstance(data, list) and len(data) > 0:
data = data[0]
if data and isinstance(data, dict) and 'name' in data:
name = data.get('name')
if name and name.lower() == institution_name.lower():
return yaml_file
alt_names = data.get('alternative_names', [])
if isinstance(alt_names, list):
for alt in alt_names:
alt_str = str(alt) if not isinstance(alt, str) else alt
if alt_str.lower() == institution_name.lower():
return yaml_file
except Exception:
continue
return None
def create_person_entity(staff_member: dict, custodian_slug: str, custodian_name: str, institution_type: str) -> Optional[str]:
"""
Create a person entity file following Rule 20 (Person Entity Profiles).
Returns path to created file or None on error.
"""
person_id = staff_member.get('staff_id')
if not person_id:
return None
timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
filename = f"{person_id}_{timestamp}.json"
output_path = PERSON_ENTITY_DIR / filename
person_entity = {
'person_id': person_id,
'extraction_metadata': {
'extraction_agent': 'claude-opus-4.5',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_source': f'LinkedIn company page: {custodian_name}',
'source_url': staff_member.get('linkedin_profile_url'),
},
'profile_data': {
'full_name': staff_member.get('name'),
'name_type': staff_member.get('name_type'),
'headline': staff_member.get('headline', ''),
'linkedin_slug': staff_member.get('linkedin_slug'),
'linkedin_profile_url': staff_member.get('linkedin_profile_url'),
'connection_degree': staff_member.get('degree'),
'mutual_connections': staff_member.get('mutual_connections', ''),
},
'heritage_relevance': {
'is_heritage_relevant': staff_member.get('heritage_relevant', False),
'heritage_type': staff_member.get('heritage_type'),
'custodian_name': custodian_name,
'institution_type': institution_type,
},
'affiliations': [
{
'custodian_name': custodian_name,
'custodian_slug': custodian_slug,
'role_title': staff_member.get('headline', ''),
'affiliation_type': 'employment',
'affiliation_source': 'LinkedIn company page',
'affiliation_source_url': staff_member.get('linkedin_profile_url', ''),
}
]
}
try:
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(person_entity, f, indent=2, ensure_ascii=False)
return str(output_path)
except Exception as e:
print(f"ERROR creating person entity: {e}", file=sys.stderr)
return None
def create_or_update_custodian(institution_name: str, slug: str, parse_result: dict, html_file: str) -> Optional[Path]:
"""
Create new custodian YAML file or update existing one.
Returns path to custodian file.
"""
existing_file = find_existing_custodian(institution_name)
custodian_metadata = parse_result.get('custodian_metadata', {})
staff_list = parse_result.get('staff', [])
source_metadata = parse_result.get('source_metadata', {})
staff_count = len([s for s in staff_list if s.get('name_type') != 'organization'])
institution_type = 'UNKNOWN'
staff_analysis = parse_result.get('staff_analysis', {})
heritage_types = staff_analysis.get('staff_by_heritage_type', {})
if heritage_types:
# Map to GLAMORCUBESFIXPHDNT type
type_mapping = {
'G': 'GALLERY',
'L': 'LIBRARY',
'A': 'ARCHIVE',
'M': 'MUSEUM',
'O': 'OFFICIAL_INSTITUTION',
'R': 'RESEARCH_CENTER',
'C': 'CORPORATION',
'E': 'EDUCATION_PROVIDER',
'S': 'COLLECTING_SOCIETY',
'D': 'DIGITAL_PLATFORM',
'I': 'INTANGIBLE_HERITAGE_GROUP',
'T': 'TASTE_SMELL',
'B': 'BOTANICAL_ZOO',
'H': 'HOLY_SITES',
'F': 'FEATURES',
'N': 'NGO',
'X': 'MIXED',
'P': 'PERSONAL_COLLECTION',
'U': 'UNKNOWN'
}
for htype in heritage_types.keys():
if heritage_types[htype] > 0:
institution_type = type_mapping.get(htype, 'UNKNOWN')
break
if existing_file:
print(f" UPDATING: {existing_file.name}")
with open(existing_file, 'r', encoding='utf-8') as f:
if yaml:
custodian_data = yaml.safe_load(f)
else:
custodian_data = json.load(f)
if isinstance(custodian_data, list) and len(custodian_data) > 0:
custodian_data = custodian_data[0]
# Add or update staff section
staff_section = {
'staff_count': staff_count,
'staff_source': {
'source_type': 'linkedin_company_people_page_html',
'source_file': html_file,
'registered_timestamp': source_metadata.get('registered_timestamp'),
'registration_method': 'html_parsing',
},
'staff': [
{
'person_id': s.get('staff_id'),
'person_name': s.get('name'),
'role_title': s.get('headline', ''),
'linkedin_profile_path': f"data/custodian/person/entity/{s.get('staff_id')}_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json",
'affiliation_provenance': {
'source': 'LinkedIn company page',
'source_url': s.get('linkedin_profile_url', ''),
'retrieved_on': datetime.now(timezone.utc).isoformat(),
}
}
for s in staff_list if s.get('name_type') != 'organization'
]
}
custodian_data['staff'] = staff_section
custodian_data['provenance'] = custodian_data.get('provenance', {})
custodian_data['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()
with open(existing_file, 'w', encoding='utf-8') as f:
if yaml:
yaml.dump([custodian_data], f, allow_unicode=True, sort_keys=False, default_flow_style=False)
else:
json.dump([custodian_data], f, indent=2, ensure_ascii=False)
return existing_file
else:
# Create new custodian file
print(f" CREATING: NL-XX-XXX-?-{slug}.yaml (placeholder GHCID)")
custodian_data = {
'name': institution_name,
'institution_type': institution_type,
'description': f"Institution profile extracted from LinkedIn company page. Industry: {custodian_metadata.get('industry', 'Unknown')}",
'ghcid': {
'ghcid_current': 'NL-XX-XXX-PENDING', # Placeholder - needs research
'location_resolution': {
'method': 'PENDING',
'notes': 'GHCID not assigned - requires geographic research'
}
},
'staff': {
'staff_count': staff_count,
'staff_source': {
'source_type': 'linkedin_company_people_page_html',
'source_file': html_file,
'registered_timestamp': source_metadata.get('registered_timestamp'),
'registration_method': 'html_parsing',
},
'staff': [
{
'person_id': s.get('staff_id'),
'person_name': s.get('name'),
'role_title': s.get('headline', ''),
'linkedin_profile_path': f"data/custodian/person/entity/{s.get('staff_id')}_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json",
'affiliation_provenance': {
'source': 'LinkedIn company page',
'source_url': s.get('linkedin_profile_url', ''),
'retrieved_on': datetime.now(timezone.utc).isoformat(),
}
}
for s in staff_list if s.get('name_type') != 'organization'
]
},
'provenance': {
'data_source': 'LINKEDIN_COMPANY_PAGE',
'data_tier': 'TIER_3_CROWD_SOURCED',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_method': 'Batch HTML parsing from manual directory',
'confidence_score': 0.70,
'source_metadata': {
'linkedin_page_type': 'company_people_page',
'source_file': html_file,
'staff_extracted': staff_count,
}
}
}
# Generate output filename
output_filename = f"NL-XX-UNKNOWN-{institution_type[0:3]}-{slug}.yaml"
output_path = CUSTODIAN_DIR / output_filename
with open(output_path, 'w', encoding='utf-8') as f:
if yaml:
yaml.dump([custodian_data], f, allow_unicode=True, sort_keys=False, default_flow_style=False)
else:
json.dump([custodian_data], f, indent=2, ensure_ascii=False)
print(f" → Created: {output_filename}")
return output_path
def main():
"""Main batch processing function."""
# Parse command line arguments
limit = None
if '--limit' in sys.argv:
idx = sys.argv.index('--limit')
if idx + 1 < len(sys.argv):
limit = int(sys.argv[idx + 1])
# Ensure output directories exist
PERSON_ENTITY_DIR.mkdir(parents=True, exist_ok=True)
CUSTODIAN_DIR.mkdir(parents=True, exist_ok=True)
# Get all HTML files
html_files = sorted(MANUAL_DIR.glob("*.html"))
if limit:
html_files = html_files[:limit]
print(f"LIMIT MODE: Processing first {limit} files (of {len(sorted(MANUAL_DIR.glob('*.html')))} total)")
print(f"\nFound {len(html_files)} HTML files to process")
print(f"Input directory: {MANUAL_DIR}")
print(f"Output directories:")
print(f" - Person entities: {PERSON_ENTITY_DIR}")
print(f" - Custodian files: {CUSTODIAN_DIR}")
print(f"\nStarting batch processing...")
print()
# Track statistics
stats = {
'total_files': len(html_files),
'processed': 0,
'errors': 0,
'new_custodians': 0,
'existing_custodians': 0,
'total_staff': 0,
'person_entities_created': 0,
'anonymous_members': 0,
'heritage_relevant_staff': 0,
'custodians_by_type': defaultdict(int),
'errors_list': [],
}
# Process each HTML file
for i, html_file in enumerate(html_files, 1):
try:
print(f"[{i}/{len(html_files)}] Processing: {html_file.name}")
# Extract institution name from filename
institution_name = extract_institution_name_from_filename(html_file.name)
if not institution_name:
print(f" SKIP: Could not extract name from filename")
stats['errors'] += 1
stats['errors_list'].append(html_file.name)
continue
# Generate slug
slug = generate_slug_from_name(institution_name)
# Parse HTML file
parse_result = parse_html_file(html_file, institution_name, slug)
if not parse_result:
stats['errors'] += 1
stats['errors_list'].append(html_file.name)
continue
stats['processed'] += 1
staff_list = parse_result.get('staff', [])
staff_analysis = parse_result.get('staff_analysis', {})
stats['total_staff'] += staff_analysis.get('total_staff_extracted', 0)
stats['anonymous_members'] += staff_analysis.get('anonymous_members', 0)
stats['heritage_relevant_staff'] += staff_analysis.get('heritage_relevant_count', 0)
# Create or update custodian
custodian_file = create_or_update_custodian(institution_name, slug, parse_result, html_file.name)
if custodian_file:
# Check if new or existing
existing = find_existing_custodian(institution_name)
if existing:
stats['existing_custodians'] += 1
else:
stats['new_custodians'] += 1
# Track institution type
staff_by_type = staff_analysis.get('staff_by_heritage_type', {})
if staff_by_type:
for htype in staff_by_type.keys():
if staff_by_type[htype] > 0:
# Map to GH type
type_map = {'M': 'MUSEUM', 'L': 'LIBRARY', 'A': 'ARCHIVE'}
stats['custodians_by_type'][type_map.get(htype, 'UNKNOWN')] += 1
# Create person entity files for each staff member
staff_count = 0
for staff_member in staff_list:
if staff_member.get('name_type') != 'organization':
staff_count += 1
# Only create person entity if heritage-relevant or has LinkedIn URL
if staff_member.get('heritage_relevant') or staff_member.get('linkedin_profile_url'):
person_file = create_person_entity(
staff_member, slug, institution_name,
parse_result.get('custodian_metadata', {}).get('institution_type', 'UNKNOWN')
)
if person_file:
stats['person_entities_created'] += 1
if i % 50 == 0 or i == len(html_files):
print()
print(f"Progress: {i}/{len(html_files)} files processed")
print(f" New custodians: {stats['new_custodians']}")
print(f" Existing custodians: {stats['existing_custodians']}")
print(f" Total staff extracted: {stats['total_staff']}")
print(f" Person entities created: {stats['person_entities_created']}")
print()
except Exception as e:
print(f"ERROR processing {html_file.name}: {e}", file=sys.stderr)
stats['errors'] += 1
stats['errors_list'].append(f"{html_file.name}: {e}")
continue
# Print final statistics
print("\n" + "="*60)
print("BATCH PROCESSING COMPLETE")
print("="*60)
print(f"Total files: {stats['total_files']}")
print(f"Successfully processed: {stats['processed']}")
print(f"Errors: {stats['errors']}")
if stats['errors'] > 0 and stats['errors_list']:
print(f"\nError details:")
for err in stats['errors_list'][:10]:
print(f" - {err}")
if len(stats['errors_list']) > 10:
print(f" ... and {len(stats['errors_list']) - 10} more errors")
print()
print(f"New custodians: {stats['new_custodians']}")
print(f"Existing custodians: {stats['existing_custodians']}")
print()
print(f"Total staff extracted: {stats['total_staff']}")
print(f"Heritage-relevant staff: {stats['heritage_relevant_staff']}")
print(f"Anonymous members: {stats['anonymous_members']}")
print(f"Person entity files created: {stats['person_entities_created']}")
print()
print("Custodians by type:")
for ctype, count in sorted(stats['custodians_by_type'].items()):
print(f" {ctype}: {count}")
print("="*60)
return 0
if __name__ == '__main__':
sys.exit(main())