501 lines
19 KiB
Python
501 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Batch process all LinkedIn company People HTML files from manual directory.
|
|
|
|
This script:
|
|
1. Scans manual directory for all HTML files
|
|
2. Extracts institution names from filenames
|
|
3. Runs parse_linkedin_html.py for each file
|
|
4. Creates person entity files for each staff member
|
|
5. Creates or updates custodian YAML files
|
|
|
|
Usage:
|
|
python scripts/batch_parse_linkedin_manual.py [--limit N]
|
|
|
|
Options:
|
|
--limit N Only process first N files (for testing)
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
yaml = None
|
|
|
|
|
|
# Directory paths
# Input: manually saved LinkedIn "People" page HTML files (external volume).
MANUAL_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# Output: custodian YAML records (one file per institution).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Output: per-person entity JSON files created from extracted staff.
PERSON_ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
|
|
|
|
|
|
def extract_institution_name_from_filename(filename: str) -> Optional[str]:
    """Derive the institution name from a saved LinkedIn People page filename.

    Strips the ``.html`` extension, the trailing "People _ LinkedIn" page
    title the browser appends, any leading "(N) " duplicate-download counter,
    and normalises whitespace. Returns None when nothing meaningful remains.
    """
    stem = Path(filename).name.replace('.html', '')
    # Drop the browser-generated page-title suffix (with optional leading '_').
    stem = re.sub(r'_?People _ LinkedIn$', '', stem)
    # Drop a "(2) " style duplicate-download prefix.
    stem = re.sub(r'^\(\d+\)\s*', '', stem)
    # Collapse whitespace runs, then trim edges and stray trailing underscores.
    cleaned = re.sub(r'\s+', ' ', stem).strip().rstrip('_')
    return cleaned or None
|
|
|
|
|
|
def generate_slug_from_name(name: str) -> str:
    """Build a lowercase, hyphen-separated slug from an institution name."""
    lowered = name.lower()
    # Keep only ascii lowercase letters, digits, whitespace and hyphens.
    kept = re.sub(r'[^a-z0-9\s-]', '', lowered)
    # Any run of whitespace/hyphens becomes one hyphen; trim hyphens at ends.
    return re.sub(r'[\s-]+', '-', kept).strip('-')
|
|
|
|
|
|
def parse_html_file(html_path: Path, institution_name: str, slug: str) -> Optional[dict]:
    """Run parse_linkedin_html.py on one HTML file and return its JSON result.

    The child script is asked to write its output to a temp JSON file in
    /tmp, which is then loaded and returned.

    Args:
        html_path: Saved LinkedIn "People" page to parse.
        institution_name: Human-readable custodian name passed to the parser.
        slug: URL-safe slug passed to the parser; also used in the temp filename.

    Returns:
        The parsed result dict, or None on parser failure, timeout, or any
        error reading the result file.
    """
    output_path = Path(
        f"/tmp/{slug}_staff_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json"
    )

    try:
        result = subprocess.run(
            [
                sys.executable,
                "/Users/kempersc/apps/glam/scripts/parse_linkedin_html.py",
                str(html_path),
                "--custodian-name", institution_name,
                "--custodian-slug", slug,
                "--output", str(output_path),
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )

        if result.returncode != 0:
            print(f"ERROR parsing {html_path.name}: {result.stderr}", file=sys.stderr)
            return None

        with open(output_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    except subprocess.TimeoutExpired:
        print(f"TIMEOUT parsing {html_path.name}", file=sys.stderr)
        return None
    except Exception as e:
        # Also covers a missing/corrupt output file from the child script.
        print(f"ERROR parsing {html_path.name}: {e}", file=sys.stderr)
        return None
    finally:
        # Bug fix: previously the intermediate JSON was leaked in /tmp on
        # every call; always remove it once we're done (or on failure).
        output_path.unlink(missing_ok=True)
|
|
|
|
|
|
def find_existing_custodian(institution_name: str) -> Optional[Path]:
    """Locate the custodian YAML whose name (or an alternative name) matches.

    Matching is case-insensitive against the record's 'name' field and each
    entry of 'alternative_names'. Returns None when PyYAML is unavailable or
    no file matches; files that fail to parse are skipped silently.
    """
    if yaml is None:
        return None

    wanted = institution_name.lower()

    for candidate in CUSTODIAN_DIR.glob("*.yaml"):
        try:
            with open(candidate, 'r', encoding='utf-8') as fh:
                loaded = yaml.safe_load(fh)

            # Custodian files may wrap the record in a single-element list.
            record = loaded[0] if isinstance(loaded, list) and loaded else loaded

            if not (record and isinstance(record, dict) and 'name' in record):
                continue

            primary = record.get('name')
            if primary and primary.lower() == wanted:
                return candidate

            alternatives = record.get('alternative_names', [])
            if isinstance(alternatives, list) and any(
                str(alt).lower() == wanted for alt in alternatives
            ):
                return candidate
        except Exception:
            # Best-effort scan: ignore unreadable or malformed files.
            continue

    return None
|
|
|
|
|
|
def create_person_entity(staff_member: dict, custodian_slug: str, custodian_name: str, institution_type: str) -> Optional[str]:
    """
    Write a person entity JSON file following Rule 20 (Person Entity Profiles).

    The file is named ``<staff_id>_<UTC timestamp>.json`` under
    PERSON_ENTITY_DIR.

    Returns the path of the file written, or None when the staff record has
    no 'staff_id' or the write fails.
    """
    person_id = staff_member.get('staff_id')
    if not person_id:
        return None

    stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    output_path = PERSON_ENTITY_DIR / f"{person_id}_{stamp}.json"

    # Reused fields (note: source_url falls back to None, while the
    # affiliation URL falls back to '' — preserved as-is).
    profile_url = staff_member.get('linkedin_profile_url')
    headline = staff_member.get('headline', '')

    extraction_metadata = {
        'extraction_agent': 'claude-opus-4.5',
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'extraction_source': f'LinkedIn company page: {custodian_name}',
        'source_url': profile_url,
    }
    profile_data = {
        'full_name': staff_member.get('name'),
        'name_type': staff_member.get('name_type'),
        'headline': headline,
        'linkedin_slug': staff_member.get('linkedin_slug'),
        'linkedin_profile_url': profile_url,
        'connection_degree': staff_member.get('degree'),
        'mutual_connections': staff_member.get('mutual_connections', ''),
    }
    heritage_relevance = {
        'is_heritage_relevant': staff_member.get('heritage_relevant', False),
        'heritage_type': staff_member.get('heritage_type'),
        'custodian_name': custodian_name,
        'institution_type': institution_type,
    }
    affiliation = {
        'custodian_name': custodian_name,
        'custodian_slug': custodian_slug,
        'role_title': headline,
        'affiliation_type': 'employment',
        'affiliation_source': 'LinkedIn company page',
        'affiliation_source_url': staff_member.get('linkedin_profile_url', ''),
    }

    person_entity = {
        'person_id': person_id,
        'extraction_metadata': extraction_metadata,
        'profile_data': profile_data,
        'heritage_relevance': heritage_relevance,
        'affiliations': [affiliation],
    }

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(person_entity, f, indent=2, ensure_ascii=False)
        return str(output_path)
    except Exception as e:
        print(f"ERROR creating person entity: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
# Maps single-letter heritage codes (GLAMORCUBESFIXPHDNT scheme) to the
# institution-type labels used in custodian records.
_HERITAGE_TYPE_MAPPING = {
    'G': 'GALLERY',
    'L': 'LIBRARY',
    'A': 'ARCHIVE',
    'M': 'MUSEUM',
    'O': 'OFFICIAL_INSTITUTION',
    'R': 'RESEARCH_CENTER',
    'C': 'CORPORATION',
    'E': 'EDUCATION_PROVIDER',
    'S': 'COLLECTING_SOCIETY',
    'D': 'DIGITAL_PLATFORM',
    'I': 'INTANGIBLE_HERITAGE_GROUP',
    'T': 'TASTE_SMELL',
    'B': 'BOTANICAL_ZOO',
    'H': 'HOLY_SITES',
    'F': 'FEATURES',
    'N': 'NGO',
    'X': 'MIXED',
    'P': 'PERSONAL_COLLECTION',
    'U': 'UNKNOWN',
}


def _infer_institution_type(heritage_types: dict) -> str:
    """Return the type label for the first heritage code with a positive
    count (dict iteration order), or 'UNKNOWN' when none qualifies."""
    for code, count in heritage_types.items():
        if count > 0:
            return _HERITAGE_TYPE_MAPPING.get(code, 'UNKNOWN')
    return 'UNKNOWN'


def _build_staff_section(staff_list: list, staff_count: int, source_metadata: dict, html_file: str) -> dict:
    """Build the 'staff' section of a custodian record.

    Shared by the create and update branches of create_or_update_custodian
    (previously duplicated verbatim in both).
    """
    return {
        'staff_count': staff_count,
        'staff_source': {
            'source_type': 'linkedin_company_people_page_html',
            'source_file': html_file,
            'registered_timestamp': source_metadata.get('registered_timestamp'),
            'registration_method': 'html_parsing',
        },
        'staff': [
            {
                'person_id': s.get('staff_id'),
                'person_name': s.get('name'),
                'role_title': s.get('headline', ''),
                # NOTE(review): this timestamp is generated here, so it will
                # not exactly match the person-entity filename written later
                # by create_person_entity — pre-existing mismatch, kept as-is.
                'linkedin_profile_path': f"data/custodian/person/entity/{s.get('staff_id')}_{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}.json",
                'affiliation_provenance': {
                    'source': 'LinkedIn company page',
                    'source_url': s.get('linkedin_profile_url', ''),
                    'retrieved_on': datetime.now(timezone.utc).isoformat(),
                },
            }
            for s in staff_list if s.get('name_type') != 'organization'
        ],
    }


def create_or_update_custodian(institution_name: str, slug: str, parse_result: dict, html_file: str) -> Optional[Path]:
    """
    Create a new custodian YAML file or update an existing one with staff data.

    Args:
        institution_name: Human-readable institution name.
        slug: URL-safe slug used in a newly created filename.
        parse_result: Output dict from parse_html_file.
        html_file: Name of the source HTML file (recorded as provenance).

    Returns:
        Path to the custodian file that was written.
    """
    existing_file = find_existing_custodian(institution_name)
    custodian_metadata = parse_result.get('custodian_metadata', {})
    staff_list = parse_result.get('staff', [])
    source_metadata = parse_result.get('source_metadata', {})

    # Organizations listed among "people" are not counted as staff.
    staff_count = len([s for s in staff_list if s.get('name_type') != 'organization'])

    staff_analysis = parse_result.get('staff_analysis', {})
    institution_type = _infer_institution_type(
        staff_analysis.get('staff_by_heritage_type', {})
    )

    staff_section = _build_staff_section(staff_list, staff_count, source_metadata, html_file)

    if existing_file:
        print(f" UPDATING: {existing_file.name}")

        with open(existing_file, 'r', encoding='utf-8') as f:
            if yaml:
                custodian_data = yaml.safe_load(f)
            else:
                # Fallback: without PyYAML we can only read JSON-formatted files.
                custodian_data = json.load(f)

        # Unwrap single-element list wrapping used by custodian files.
        if isinstance(custodian_data, list) and len(custodian_data) > 0:
            custodian_data = custodian_data[0]

        # Replace the staff section wholesale and stamp the update time.
        custodian_data['staff'] = staff_section
        custodian_data['provenance'] = custodian_data.get('provenance', {})
        custodian_data['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()

        with open(existing_file, 'w', encoding='utf-8') as f:
            if yaml:
                yaml.dump([custodian_data], f, allow_unicode=True, sort_keys=False, default_flow_style=False)
            else:
                json.dump([custodian_data], f, indent=2, ensure_ascii=False)

        return existing_file

    # Create new custodian file.
    output_filename = f"NL-XX-UNKNOWN-{institution_type[0:3]}-{slug}.yaml"
    # Bug fix: log the filename that is actually written (the old message
    # printed a different placeholder pattern than the real output name).
    print(f" CREATING: {output_filename} (placeholder GHCID)")

    custodian_data = {
        'name': institution_name,
        'institution_type': institution_type,
        'description': f"Institution profile extracted from LinkedIn company page. Industry: {custodian_metadata.get('industry', 'Unknown')}",
        'ghcid': {
            'ghcid_current': 'NL-XX-XXX-PENDING',  # Placeholder - needs research
            'location_resolution': {
                'method': 'PENDING',
                'notes': 'GHCID not assigned - requires geographic research'
            }
        },
        'staff': staff_section,
        'provenance': {
            'data_source': 'LINKEDIN_COMPANY_PAGE',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Batch HTML parsing from manual directory',
            'confidence_score': 0.70,
            'source_metadata': {
                'linkedin_page_type': 'company_people_page',
                'source_file': html_file,
                'staff_extracted': staff_count,
            }
        }
    }

    output_path = CUSTODIAN_DIR / output_filename

    with open(output_path, 'w', encoding='utf-8') as f:
        if yaml:
            yaml.dump([custodian_data], f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        else:
            json.dump([custodian_data], f, indent=2, ensure_ascii=False)

    print(f" → Created: {output_filename}")
    return output_path
|
|
|
|
|
def _print_summary(stats: dict) -> None:
    """Print the final batch-processing statistics report."""
    print("\n" + "=" * 60)
    print("BATCH PROCESSING COMPLETE")
    print("=" * 60)
    print(f"Total files: {stats['total_files']}")
    print(f"Successfully processed: {stats['processed']}")
    print(f"Errors: {stats['errors']}")
    if stats['errors'] > 0 and stats['errors_list']:
        print(f"\nError details:")
        for err in stats['errors_list'][:10]:
            print(f" - {err}")
        if len(stats['errors_list']) > 10:
            print(f" ... and {len(stats['errors_list']) - 10} more errors")

    print()
    print(f"New custodians: {stats['new_custodians']}")
    print(f"Existing custodians: {stats['existing_custodians']}")
    print()
    print(f"Total staff extracted: {stats['total_staff']}")
    print(f"Heritage-relevant staff: {stats['heritage_relevant_staff']}")
    print(f"Anonymous members: {stats['anonymous_members']}")
    print(f"Person entity files created: {stats['person_entities_created']}")
    print()
    print("Custodians by type:")
    for ctype, count in sorted(stats['custodians_by_type'].items()):
        print(f" {ctype}: {count}")
    print("=" * 60)


def main():
    """Batch-process every LinkedIn People HTML file in MANUAL_DIR.

    Returns a process exit code: 0 on completion, 1 on invalid arguments.
    """
    # Parse command line arguments.
    limit = None
    if '--limit' in sys.argv:
        idx = sys.argv.index('--limit')
        if idx + 1 < len(sys.argv):
            try:
                limit = int(sys.argv[idx + 1])
            except ValueError:
                # Bug fix: a non-numeric --limit previously crashed with a
                # raw traceback; fail with a clear message instead.
                print(f"ERROR: --limit expects an integer, got {sys.argv[idx + 1]!r}", file=sys.stderr)
                return 1

    # Ensure output directories exist.
    PERSON_ENTITY_DIR.mkdir(parents=True, exist_ok=True)
    CUSTODIAN_DIR.mkdir(parents=True, exist_ok=True)

    # Get all HTML files; remember the total before any limit is applied
    # (bug fix: the old code re-globbed and re-sorted the directory just to
    # count the files for the LIMIT MODE message).
    html_files = sorted(MANUAL_DIR.glob("*.html"))
    total_available = len(html_files)

    if limit:
        html_files = html_files[:limit]
        print(f"LIMIT MODE: Processing first {limit} files (of {total_available} total)")

    print(f"\nFound {len(html_files)} HTML files to process")
    print(f"Input directory: {MANUAL_DIR}")
    print(f"Output directories:")
    print(f" - Person entities: {PERSON_ENTITY_DIR}")
    print(f" - Custodian files: {CUSTODIAN_DIR}")
    print(f"\nStarting batch processing...")
    print()

    # Track statistics.
    stats = {
        'total_files': len(html_files),
        'processed': 0,
        'errors': 0,
        'new_custodians': 0,
        'existing_custodians': 0,
        'total_staff': 0,
        'person_entities_created': 0,
        'anonymous_members': 0,
        'heritage_relevant_staff': 0,
        'custodians_by_type': defaultdict(int),
        'errors_list': [],
    }

    # Process each HTML file.
    for i, html_file in enumerate(html_files, 1):
        try:
            print(f"[{i}/{len(html_files)}] Processing: {html_file.name}")

            # Extract institution name from filename.
            institution_name = extract_institution_name_from_filename(html_file.name)
            if not institution_name:
                print(f" SKIP: Could not extract name from filename")
                stats['errors'] += 1
                stats['errors_list'].append(html_file.name)
                continue

            slug = generate_slug_from_name(institution_name)

            # Parse HTML file via the external parser script.
            parse_result = parse_html_file(html_file, institution_name, slug)
            if not parse_result:
                stats['errors'] += 1
                stats['errors_list'].append(html_file.name)
                continue

            stats['processed'] += 1

            staff_list = parse_result.get('staff', [])
            staff_analysis = parse_result.get('staff_analysis', {})

            stats['total_staff'] += staff_analysis.get('total_staff_extracted', 0)
            stats['anonymous_members'] += staff_analysis.get('anonymous_members', 0)
            stats['heritage_relevant_staff'] += staff_analysis.get('heritage_relevant_count', 0)

            # Bug fix: determine new-vs-existing BEFORE the custodian file is
            # written. The old code checked afterwards, always found the
            # just-created file, and counted every custodian as "existing"
            # (new_custodians was permanently 0).
            was_existing = find_existing_custodian(institution_name) is not None

            # Create or update the custodian record.
            custodian_file = create_or_update_custodian(institution_name, slug, parse_result, html_file.name)

            if custodian_file:
                if was_existing:
                    stats['existing_custodians'] += 1
                else:
                    stats['new_custodians'] += 1

                # Track institution type from the first positive heritage code.
                staff_by_type = staff_analysis.get('staff_by_heritage_type', {})
                for htype, count in staff_by_type.items():
                    if count > 0:
                        # Only M/L/A are mapped here; everything else is UNKNOWN.
                        type_map = {'M': 'MUSEUM', 'L': 'LIBRARY', 'A': 'ARCHIVE'}
                        stats['custodians_by_type'][type_map.get(htype, 'UNKNOWN')] += 1

            # Create person entity files for each (human) staff member that is
            # heritage-relevant or has a LinkedIn profile URL.
            for staff_member in staff_list:
                if staff_member.get('name_type') == 'organization':
                    continue
                if staff_member.get('heritage_relevant') or staff_member.get('linkedin_profile_url'):
                    person_file = create_person_entity(
                        staff_member, slug, institution_name,
                        parse_result.get('custodian_metadata', {}).get('institution_type', 'UNKNOWN')
                    )
                    if person_file:
                        stats['person_entities_created'] += 1

            # Periodic progress report.
            if i % 50 == 0 or i == len(html_files):
                print()
                print(f"Progress: {i}/{len(html_files)} files processed")
                print(f" New custodians: {stats['new_custodians']}")
                print(f" Existing custodians: {stats['existing_custodians']}")
                print(f" Total staff extracted: {stats['total_staff']}")
                print(f" Person entities created: {stats['person_entities_created']}")
                print()

        except Exception as e:
            # One bad file must not abort the whole batch.
            print(f"ERROR processing {html_file.name}: {e}", file=sys.stderr)
            stats['errors'] += 1
            stats['errors_list'].append(f"{html_file.name}: {e}")
            continue

    _print_summary(stats)
    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|