glam/scripts/generate_linkedin_custodian_yaml.py
2025-12-16 20:27:39 +01:00

471 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Generate custodian YAML files from LinkedIn parsed data.
Merges data from:
- Parsed staff JSON files (data/custodian/person/affiliated/parsed/*.json)
- About page data (website URLs, industry, employee count)
- URL verification results (alive/dead/redirect status)
- URL corrections (resolved dead links)
Output: data/custodian/linkedin/{slug}.yaml
Usage:
python scripts/generate_linkedin_custodian_yaml.py
python scripts/generate_linkedin_custodian_yaml.py --dry-run
python scripts/generate_linkedin_custodian_yaml.py --limit 10
"""
import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
from typing import Any
import argparse
# Use ruamel.yaml for better YAML output with preserved order;
# fall back to PyYAML when ruamel is not installed. USE_RUAMEL tells
# write_yaml()/main() which dumper is available.
try:
    from ruamel.yaml import YAML
    yaml = YAML()
    yaml.default_flow_style = False  # block style, not inline mappings
    yaml.allow_unicode = True        # keep non-ASCII characters readable
    yaml.width = 120                 # wrap long scalars at 120 columns
    USE_RUAMEL = True
except ImportError:
    import yaml as pyyaml
    USE_RUAMEL = False
# Repository root (this script lives one level below it, in scripts/).
BASE_DIR = Path(__file__).parent.parent
# Input locations: parsed staff JSON plus timestamped snapshot files for
# about-page data, URL verification results, and manual URL corrections.
DATA_DIR = BASE_DIR / "data/custodian/person/affiliated"
PARSED_DIR = DATA_DIR / "parsed"
ABOUT_DATA_FILE = DATA_DIR / "about_data/about_data_20251216T152238Z.json"
VERIFICATION_FILE = DATA_DIR / "verified_links/verification_results_20251216T152538Z.json"
CORRECTIONS_FILE = DATA_DIR / "verified_links/corrected_urls_20251216T160000Z.json"
# Output: one YAML file per custodian slug.
OUTPUT_DIR = BASE_DIR / "data/custodian/linkedin"
# Industry to institution type mapping: LinkedIn industry label ->
# single-letter institution type code(s). "M" is museum (used as the
# default in infer_institution_type); the remaining codes are presumably
# L=library, A=archive, etc. — TODO confirm against the type schema.
INDUSTRY_TO_TYPE = {
    "Museums, Historical Sites, and Zoos": ["M"],
    "Museums and Institutions": ["M"],
    "Libraries": ["L"],
    "Archives": ["A"],
    "Research Services": ["R"],
    "Higher Education": ["E"],
    "Non-profit Organizations": ["F"],
    "Government Administration": ["O"],
    "Performing Arts": ["C"],
    "Fine Art": ["G"],
    "Civic and Social Organizations": ["S"],
    "Environmental Services": ["B"],
    "Religious Institutions": ["H"],
}
def load_parsed_staff() -> dict[str, dict]:
    """Load the most recent parsed staff JSON per custodian slug.

    Scans PARSED_DIR for files named ``{slug}_staff_{timestamp}.json``
    and, for each slug, loads only the file with the latest timestamp.

    Returns:
        Dict keyed by custodian slug with the parsed JSON payload of the
        most recent staff file. Unreadable or missing files are skipped
        with a warning on stderr.
    """
    parsed_files: dict[str, list[tuple[str, Path]]] = defaultdict(list)
    # Extract slug from filename: {slug}_staff_{timestamp}.json
    # fullmatch (rather than the original match) rejects filenames with
    # trailing characters after the timestamped .json suffix.
    pattern = re.compile(r"(.+)_staff_(\d{8}T\d{6}Z)\.json")
    for json_file in PARSED_DIR.glob("*.json"):
        match = pattern.fullmatch(json_file.name)
        if match:
            slug, timestamp = match.group(1), match.group(2)
            parsed_files[slug].append((timestamp, json_file))
    result = {}
    for slug, files in parsed_files.items():
        # Timestamps are zero-padded ISO-basic strings, so lexicographic
        # max is chronological max — no need to sort the whole list.
        _, most_recent = max(files)
        try:
            with open(most_recent, 'r', encoding='utf-8') as f:
                result[slug] = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError) as e:
            print(f"WARNING: Failed to load {most_recent}: {e}", file=sys.stderr)
    return result
def load_about_data() -> dict[str, dict]:
    """Load about-page data and return it indexed by custodian_slug.

    Handles three payload shapes: a bare list of entries, a dict with
    the entries nested under a 'data' key, or a dict that is already
    keyed by slug (returned unchanged). A missing snapshot file yields
    an empty dict after a stderr warning.
    """
    if not ABOUT_DATA_FILE.exists():
        print(f"WARNING: About data file not found: {ABOUT_DATA_FILE}", file=sys.stderr)
        return {}
    with open(ABOUT_DATA_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    if isinstance(payload, dict):
        if 'data' not in payload:
            # Assumed to be keyed by slug already — pass through as-is.
            return payload
        records = payload['data']
    elif isinstance(payload, list):
        records = payload
    else:
        records = []
    # Index by slug, dropping entries without one.
    return {
        record['custodian_slug']: record
        for record in records
        if record.get('custodian_slug')
    }
def load_verification_results() -> dict[str, dict]:
    """Load URL verification results, indexed by custodian_name.

    Returns an empty dict (with a stderr warning) when the snapshot
    file is absent.
    """
    if not VERIFICATION_FILE.exists():
        print(f"WARNING: Verification file not found: {VERIFICATION_FILE}", file=sys.stderr)
        return {}
    with open(VERIFICATION_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    indexed = {}
    for record in payload.get('results', []):
        indexed[record['custodian_name']] = record
    return indexed
def load_corrections() -> tuple[dict[str, dict], set[str], set[str]]:
    """Load URL corrections and the special-case custodian lists.

    Returns:
        A 3-tuple of:
        - corrections: custodian_name -> correction record
        - permanently_closed: names of custodians that no longer operate
        - no_website: names with no usable website, including those whose
          only listing is a defunct Google Business page
    """
    if not CORRECTIONS_FILE.exists():
        print(f"WARNING: Corrections file not found: {CORRECTIONS_FILE}", file=sys.stderr)
        return {}, set(), set()
    with open(CORRECTIONS_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    corrections = {item['custodian_name']: item for item in payload.get('corrections', [])}
    permanently_closed = {item['custodian_name'] for item in payload.get('permanently_closed', [])}
    no_website = {item['custodian_name'] for item in payload.get('no_website', [])}
    # Defunct Google Business listings count as "no usable website" too.
    no_website.update(
        item['custodian_name'] for item in payload.get('google_business_defunct', [])
    )
    return corrections, permanently_closed, no_website
def infer_institution_type(staff_data: dict, about_data: dict) -> list[str]:
    """Infer institution type code(s) from industry and staff heritage types.

    Combines two signals:
    - the LinkedIn industry label (about page first, then staff
      metadata), mapped through INDUSTRY_TO_TYPE;
    - the single most common heritage type among the custodian's staff.

    Args:
        staff_data: Parsed staff JSON (may carry 'custodian_metadata'
            and 'staff_analysis' keys).
        about_data: About-page entry (may carry an 'industry' key).

    Returns:
        Sorted list of single-letter type codes; ["M"] (museum) when no
        signal is available.
    """
    types_found: set[str] = set()
    # Industry label: about page takes precedence over staff metadata.
    industry = (about_data.get('industry') or
                staff_data.get('custodian_metadata', {}).get('industry'))
    if industry and industry in INDUSTRY_TO_TYPE:
        types_found.update(INDUSTRY_TO_TYPE[industry])
    # Staff signal: take the most common heritage type. max() replaces
    # the original sort-and-take-first; ties resolve to the
    # first-inserted key either way.
    heritage_types = staff_data.get('staff_analysis', {}).get('staff_by_heritage_type', {})
    if heritage_types:
        top_type, _ = max(heritage_types.items(), key=lambda item: item[1])
        types_found.add(top_type)
    # Default to M (Museum) if nothing found.
    if not types_found:
        types_found.add("M")
    return sorted(types_found)
def determine_website_status(
custodian_name: str,
about_data: dict,
verification: dict,
corrections: dict,
no_website: set
) -> tuple[str | None, str, str | None]:
"""Determine final website URL and status.
Returns:
- website_url: Final URL to use (may be corrected)
- status: 'verified', 'corrected', 'unverified', 'dead', 'none'
- original_url: Original URL if corrected, else None
"""
original_url = about_data.get('website_url')
# Check if custodian has no website
if custodian_name in no_website:
return None, 'none', original_url
# Check corrections first
if custodian_name in corrections:
correction = corrections[custodian_name]
return correction['corrected_url'], 'corrected', correction.get('original_url', original_url)
# No original URL from about page
if not original_url:
return None, 'none', None
# Check verification status
if custodian_name in verification:
v = verification[custodian_name]
if v.get('is_alive'):
final_url = v.get('final_url') or original_url
return final_url, 'verified', None if final_url == original_url else original_url
else:
# Dead link not in corrections
return original_url, 'dead', None
# Not verified
return original_url, 'unverified', None
def build_heritage_staff_list(staff_data: dict) -> list[dict]:
    """Collect heritage-relevant staff members as serializable dicts.

    Each entry carries name and headline, plus a LinkedIn URL (from the
    explicit profile URL, or rebuilt from the slug) and the heritage
    type when available. Staff not flagged heritage_relevant are
    skipped.
    """
    entries = []
    for member in staff_data.get('staff', []):
        if not member.get('heritage_relevant'):
            continue
        record = {
            'name': member.get('name'),
            'headline': member.get('headline'),
        }
        profile_url = member.get('linkedin_profile_url')
        if profile_url:
            record['linkedin_url'] = profile_url
        elif member.get('linkedin_slug'):
            # Reconstruct the profile URL from the slug.
            record['linkedin_url'] = f"https://www.linkedin.com/in/{member['linkedin_slug']}"
        heritage_type = member.get('heritage_type')
        if heritage_type:
            record['heritage_type'] = heritage_type
        entries.append(record)
    return entries
def generate_custodian_yaml(
    slug: str,
    staff_data: dict,
    about_data: dict,
    verification: dict,
    corrections: dict,
    no_website: set
) -> dict[str, Any]:
    """Generate YAML structure for a single custodian.

    Merges parsed staff data, about-page data, URL verification results
    and manual corrections into one dict; insertion order determines the
    key order of the emitted YAML.

    Args:
        slug: LinkedIn company slug for the custodian.
        staff_data: Parsed staff JSON for this custodian.
        about_data: About-page entry (may be empty).
        verification: Verification results keyed by custodian name.
        corrections: Manual URL corrections keyed by custodian name.
        no_website: Names of custodians known to have no website.

    Returns:
        Dict ready to be dumped as YAML.
    """
    metadata = staff_data.get('custodian_metadata', {})
    source_meta = staff_data.get('source_metadata', {})
    staff_analysis = staff_data.get('staff_analysis', {})
    # Fall back from custodian_name to name to the slug itself.
    custodian_name = metadata.get('custodian_name') or metadata.get('name', slug)
    # Determine website (may be corrected, dead, or absent)
    website_url, website_status, original_url = determine_website_status(
        custodian_name, about_data, verification, corrections, no_website
    )
    # Build YAML structure
    result = {
        # Temporary identifier — presumably pending assignment of a
        # permanent GHCID; TODO confirm the "_temp" convention.
        'ghcid_temp': f"ghcid:linkedin:{slug}",
        'name': custodian_name,
        'linkedin_slug': slug,
    }
    # Website info (omitted entirely when no usable URL was resolved)
    if website_url:
        result['website'] = website_url
        result['website_status'] = website_status
        if original_url:
            # Pre-correction / pre-redirect URL, kept for traceability.
            result['website_original'] = original_url
    # LinkedIn URL
    result['linkedin_url'] = f"https://www.linkedin.com/company/{slug}"
    # Industry: about page takes precedence over staff metadata
    industry = about_data.get('industry') or metadata.get('industry')
    if industry:
        result['industry'] = industry
    # Location (only when present as a structured dict)
    location = metadata.get('location')
    if location and isinstance(location, dict):
        result['location'] = location
    # Institution type
    result['institution_type'] = infer_institution_type(staff_data, about_data)
    # Staff counts; fall back to counting the raw staff list when the
    # analysis block lacks a total.
    if metadata.get('follower_count'):
        result['follower_count'] = metadata['follower_count']
    result['staff_count'] = staff_analysis.get('total_staff_extracted', len(staff_data.get('staff', [])))
    result['heritage_staff_count'] = staff_analysis.get('heritage_relevant_count', 0)
    # Heritage staff list
    heritage_staff = build_heritage_staff_list(staff_data)
    if heritage_staff:
        result['heritage_staff'] = heritage_staff
    # Provenance: record which source each claim came from
    now = datetime.now(timezone.utc).isoformat()
    result['provenance'] = {
        'schema_version': '1.0.0',
        'generated_at': now,
        'sources': {
            'linkedin_people_page': [{
                'source_type': source_meta.get('source_type', 'linkedin_company_people_page_html'),
                'source_file': source_meta.get('source_file'),
                'extraction_timestamp': source_meta.get('registered_timestamp'),
                'claims_extracted': ['staff', 'industry', 'location', 'follower_count'],
            }]
        }
    }
    # Add about page source if available
    if about_data and about_data.get('source_metadata'):
        about_source = about_data['source_metadata']
        result['provenance']['sources']['linkedin_about_page'] = [{
            'source_type': about_source.get('source_type', 'linkedin_company_about_page_html'),
            'source_file': about_source.get('source_file'),
            'extraction_timestamp': about_source.get('extraction_timestamp'),
            'claims_extracted': ['website_url', 'employee_count'],
        }]
    # Add verification source if website was verified or corrected.
    # The timestamp is the generation time, not the original check time.
    if website_status in ('verified', 'corrected'):
        result['provenance']['sources']['url_verification'] = [{
            'source_type': 'http_verification' if website_status == 'verified' else 'manual_correction',
            'verification_timestamp': now,
            'claims_extracted': ['website_status', 'final_url'],
        }]
    return result
def write_yaml(data: dict, path: Path):
    """Serialize *data* to *path* as YAML.

    Dispatches to ruamel.yaml when it was importable at module load,
    otherwise falls back to PyYAML with key sorting disabled.
    """
    with open(path, 'w', encoding='utf-8') as f:
        if USE_RUAMEL:
            yaml.dump(data, f)
        else:
            pyyaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def main():
    """CLI entry point: merge all data sources and emit per-custodian YAML.

    Supports --dry-run (print instead of write), --limit N, and --slug
    to restrict processing to one custodian. Prints a summary of
    processed/skipped counts at the end.
    """
    parser = argparse.ArgumentParser(description='Generate custodian YAML from LinkedIn data')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without writing files')
    parser.add_argument('--limit', type=int, help='Limit number of custodians to process')
    parser.add_argument('--slug', type=str, help='Process only a specific custodian slug')
    args = parser.parse_args()
    print("Loading data sources...")
    # Load all data sources up front; each loader degrades to empty data
    # (with a stderr warning) when its snapshot file is missing.
    staff_by_slug = load_parsed_staff()
    print(f" Loaded {len(staff_by_slug)} parsed staff files")
    about_by_slug = load_about_data()
    print(f" Loaded {len(about_by_slug)} about page entries")
    verification_by_name = load_verification_results()
    print(f" Loaded {len(verification_by_name)} verification results")
    corrections, permanently_closed, no_website = load_corrections()
    print(f" Loaded {len(corrections)} URL corrections")
    print(f" Found {len(permanently_closed)} permanently closed")
    print(f" Found {len(no_website)} with no website")
    # Create output directory (skipped on --dry-run)
    if not args.dry_run:
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Process custodians
    processed = 0
    skipped_closed = 0
    skipped_errors = 0
    slugs_to_process = [args.slug] if args.slug else sorted(staff_by_slug.keys())
    for slug in slugs_to_process:
        # NOTE: --limit 0 behaves as "no limit" (falsy check).
        if args.limit and processed >= args.limit:
            break
        if slug not in staff_by_slug:
            print(f"WARNING: Slug not found: {slug}", file=sys.stderr)
            continue
        staff_data = staff_by_slug[slug]
        metadata = staff_data.get('custodian_metadata', {})
        name = metadata.get('custodian_name') or metadata.get('name', slug)
        # Skip permanently closed institutions entirely
        if name in permanently_closed:
            print(f"SKIPPED (permanently closed): {name}")
            skipped_closed += 1
            continue
        try:
            # Get matching about data (may be absent for this slug)
            about_data = about_by_slug.get(slug, {})
            # Generate YAML
            yaml_data = generate_custodian_yaml(
                slug,
                staff_data,
                about_data,
                verification_by_name,
                corrections,
                no_website
            )
            if args.dry_run:
                print(f"Would generate: {slug}.yaml ({name})")
                if processed < 3:  # Show first 3 examples
                    if USE_RUAMEL:
                        # ruamel's dump() needs a stream, so buffer the
                        # output in memory to truncate it for display.
                        from io import StringIO
                        stream = StringIO()
                        yaml.dump(yaml_data, stream)
                        print(stream.getvalue()[:500])
                    else:
                        print(pyyaml.dump(yaml_data, default_flow_style=False)[:500])
                    print("---")
            else:
                output_path = OUTPUT_DIR / f"{slug}.yaml"
                write_yaml(yaml_data, output_path)
                print(f"Generated: {output_path.name}")
            processed += 1
        except Exception as e:
            # Best-effort batch run: log the failure and keep going.
            print(f"ERROR processing {slug}: {e}", file=sys.stderr)
            skipped_errors += 1
            continue
    print("\n" + "="*60)
    print(f"SUMMARY:")
    print(f" Processed: {processed}")
    print(f" Skipped (closed): {skipped_closed}")
    print(f" Skipped (errors): {skipped_errors}")
    if not args.dry_run:
        print(f" Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()