#!/usr/bin/env python3
"""
Generate custodian YAML files from LinkedIn parsed data.

Merges data from:
- Parsed staff JSON files (data/custodian/person/affiliated/parsed/*.json)
- About page data (website URLs, industry, employee count)
- URL verification results (alive/dead/redirect status)
- URL corrections (resolved dead links)

Output: data/custodian/linkedin/{slug}.yaml

Usage:
    python scripts/generate_linkedin_custodian_yaml.py
    python scripts/generate_linkedin_custodian_yaml.py --dry-run
    python scripts/generate_linkedin_custodian_yaml.py --limit 10
"""

import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
from typing import Any
import argparse

# Use ruamel.yaml for better YAML output with preserved order
try:
    from ruamel.yaml import YAML

    # ruamel's dumper keeps key insertion order and block style, which keeps
    # the generated YAML stable and diffable across runs.
    yaml = YAML()
    yaml.default_flow_style = False
    yaml.allow_unicode = True
    yaml.width = 120
    USE_RUAMEL = True
except ImportError:
    # Fall back to PyYAML; write_yaml() dispatches on USE_RUAMEL.
    import yaml as pyyaml

    USE_RUAMEL = False

# Repository root: per the usage examples, this script lives in scripts/,
# so the root is one level up from this file's directory.
BASE_DIR = Path(__file__).parent.parent
DATA_DIR = BASE_DIR / "data/custodian/person/affiliated"
# Parsed staff JSON files, named {slug}_staff_{timestamp}.json
PARSED_DIR = DATA_DIR / "parsed"
# Timestamped snapshots of the supporting data sets.
ABOUT_DATA_FILE = DATA_DIR / "about_data/about_data_20251216T152238Z.json"
VERIFICATION_FILE = DATA_DIR / "verified_links/verification_results_20251216T152538Z.json"
CORRECTIONS_FILE = DATA_DIR / "verified_links/corrected_urls_20251216T160000Z.json"
# Destination directory for the generated per-custodian YAML files.
OUTPUT_DIR = BASE_DIR / "data/custodian/linkedin"

# Industry to institution type mapping
# Maps a LinkedIn industry label to one or more single-letter institution
# type codes.  NOTE(review): the letter semantics (M=museum, L=library, ...)
# are inferred from the labels here -- confirm against the project's
# institution-type schema.
INDUSTRY_TO_TYPE = {
    "Museums, Historical Sites, and Zoos": ["M"],
    "Museums and Institutions": ["M"],
    "Libraries": ["L"],
    "Archives": ["A"],
    "Research Services": ["R"],
    "Higher Education": ["E"],
    "Non-profit Organizations": ["F"],
    "Government Administration": ["O"],
    "Performing Arts": ["C"],
    "Fine Art": ["G"],
    "Civic and Social Organizations": ["S"],
    "Environmental Services": ["B"],
    "Religious Institutions": ["H"],
}


def load_parsed_staff() -> dict[str, dict]:
    """Load most recent parsed JSON per custodian slug.

    Scans PARSED_DIR for files named ``{slug}_staff_{timestamp}.json`` and,
    for each slug, loads only the file with the newest timestamp.

    Returns:
        Dict keyed by custodian slug with the most recent staff data.
        Files that fail to load are skipped with a warning on stderr.
    """
    parsed_files: dict[str, list[tuple[str, Path]]] = defaultdict(list)

    for json_file in PARSED_DIR.glob("*.json"):
        # Extract slug from filename: {slug}_staff_{timestamp}.json
        match = re.match(r"(.+)_staff_(\d{8}T\d{6}Z)\.json", json_file.name)
        if match:
            parsed_files[match.group(1)].append((match.group(2), json_file))

    # Keep only the most recent file per slug.  Every defaultdict entry has at
    # least one element (created only on append), so no emptiness check is
    # needed.  Timestamps are fixed-width (YYYYMMDDTHHMMSSZ), so lexicographic
    # comparison is chronological; max() avoids sorting the whole list.
    result = {}
    for slug, files in parsed_files.items():
        most_recent = max(files, key=lambda entry: entry[0])[1]

        try:
            with open(most_recent, 'r', encoding='utf-8') as f:
                result[slug] = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError) as e:
            print(f"WARNING: Failed to load {most_recent}: {e}", file=sys.stderr)

    return result


def load_about_data() -> dict[str, dict]:
    """Load about page data indexed by custodian_slug."""
    if not ABOUT_DATA_FILE.exists():
        print(f"WARNING: About data file not found: {ABOUT_DATA_FILE}", file=sys.stderr)
        return {}

    with open(ABOUT_DATA_FILE, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    # The file is either a bare list of entries, a {'data': [...]} wrapper,
    # or a dict that is already keyed by slug (returned unchanged).
    if isinstance(raw, dict):
        if 'data' not in raw:
            return raw
        entries = raw['data']
    elif isinstance(raw, list):
        entries = raw
    else:
        entries = []

    return {
        entry['custodian_slug']: entry
        for entry in entries
        if entry.get('custodian_slug')
    }


def load_verification_results() -> dict[str, dict]:
    """Load URL verification results indexed by custodian_name."""
    if not VERIFICATION_FILE.exists():
        print(f"WARNING: Verification file not found: {VERIFICATION_FILE}", file=sys.stderr)
        return {}

    with open(VERIFICATION_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)

    # Re-key the flat result list by custodian name for O(1) lookups.
    indexed = {}
    for record in payload.get('results', []):
        indexed[record['custodian_name']] = record
    return indexed


def load_corrections() -> tuple[dict[str, dict], set[str], set[str]]:
    """Load URL corrections and special cases.

    Returns:
        - corrections: dict mapping custodian_name to correction info
        - permanently_closed: set of custodian names that are permanently closed
        - no_website: set of custodian names with no website
    """
    if not CORRECTIONS_FILE.exists():
        print(f"WARNING: Corrections file not found: {CORRECTIONS_FILE}", file=sys.stderr)
        return {}, set(), set()

    with open(CORRECTIONS_FILE, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    corrections = {}
    for entry in raw.get('corrections', []):
        corrections[entry['custodian_name']] = entry

    permanently_closed = {e['custodian_name'] for e in raw.get('permanently_closed', [])}

    # Defunct Google Business listings also count as having no usable website.
    no_website = {e['custodian_name'] for e in raw.get('no_website', [])}
    no_website |= {e['custodian_name'] for e in raw.get('google_business_defunct', [])}

    return corrections, permanently_closed, no_website


def infer_institution_type(staff_data: dict, about_data: dict) -> list[str]:
    """Infer institution type codes from industry and staff heritage types.

    Args:
        staff_data: Parsed staff JSON; may carry
            ``custodian_metadata.industry`` and
            ``staff_analysis.staff_by_heritage_type`` counts.
        about_data: About-page entry; may carry ``industry``.

    Returns:
        Sorted list of single-letter institution type codes; defaults to
        ``["M"]`` when nothing can be inferred.
    """
    types_found: set[str] = set()

    # From industry: the about-page value wins over the staff-file metadata.
    industry = (about_data.get('industry') or
                staff_data.get('custodian_metadata', {}).get('industry'))
    if industry and industry in INDUSTRY_TO_TYPE:
        types_found.update(INDUSTRY_TO_TYPE[industry])

    # From staff heritage types: take the most common one.  max() resolves
    # ties to the first-encountered key, matching the original stable sort.
    heritage_types = staff_data.get('staff_analysis', {}).get('staff_by_heritage_type', {})
    if heritage_types:
        types_found.add(max(heritage_types.items(), key=lambda item: item[1])[0])

    # Default to M (Museum) if nothing found
    if not types_found:
        types_found.add("M")

    return sorted(types_found)


def determine_website_status(
|
|
custodian_name: str,
|
|
about_data: dict,
|
|
verification: dict,
|
|
corrections: dict,
|
|
no_website: set
|
|
) -> tuple[str | None, str, str | None]:
|
|
"""Determine final website URL and status.
|
|
|
|
Returns:
|
|
- website_url: Final URL to use (may be corrected)
|
|
- status: 'verified', 'corrected', 'unverified', 'dead', 'none'
|
|
- original_url: Original URL if corrected, else None
|
|
"""
|
|
original_url = about_data.get('website_url')
|
|
|
|
# Check if custodian has no website
|
|
if custodian_name in no_website:
|
|
return None, 'none', original_url
|
|
|
|
# Check corrections first
|
|
if custodian_name in corrections:
|
|
correction = corrections[custodian_name]
|
|
return correction['corrected_url'], 'corrected', correction.get('original_url', original_url)
|
|
|
|
# No original URL from about page
|
|
if not original_url:
|
|
return None, 'none', None
|
|
|
|
# Check verification status
|
|
if custodian_name in verification:
|
|
v = verification[custodian_name]
|
|
if v.get('is_alive'):
|
|
final_url = v.get('final_url') or original_url
|
|
return final_url, 'verified', None if final_url == original_url else original_url
|
|
else:
|
|
# Dead link not in corrections
|
|
return original_url, 'dead', None
|
|
|
|
# Not verified
|
|
return original_url, 'unverified', None
|
|
|
|
|
|
def build_heritage_staff_list(staff_data: dict) -> list[dict]:
    """Build list of heritage-relevant staff members."""
    heritage_staff = []

    for person in staff_data.get('staff', []):
        if not person.get('heritage_relevant'):
            continue

        entry = {
            'name': person.get('name'),
            'headline': person.get('headline'),
        }

        # Prefer an explicit profile URL; otherwise derive one from the slug.
        profile_url = person.get('linkedin_profile_url')
        if profile_url:
            entry['linkedin_url'] = profile_url
        else:
            person_slug = person.get('linkedin_slug')
            if person_slug:
                entry['linkedin_url'] = f"https://www.linkedin.com/in/{person_slug}"

        heritage_type = person.get('heritage_type')
        if heritage_type:
            entry['heritage_type'] = heritage_type

        heritage_staff.append(entry)

    return heritage_staff


def generate_custodian_yaml(
    slug: str,
    staff_data: dict,
    about_data: dict,
    verification: dict,
    corrections: dict,
    no_website: set
) -> dict[str, Any]:
    """Generate YAML structure for a single custodian.

    Args:
        slug: LinkedIn company slug (also used for the output filename).
        staff_data: Parsed staff JSON for this custodian.
        about_data: About-page entry for this custodian (may be empty).
        verification: URL verification results keyed by custodian name.
        corrections: Manual URL corrections keyed by custodian name.
        no_website: Custodian names known to have no website.

    Returns:
        Dict built in output key order -- dict insertion order is what the
        YAML dumper emits, so the ordering of assignments below matters.
    """
    metadata = staff_data.get('custodian_metadata', {})
    source_meta = staff_data.get('source_metadata', {})
    staff_analysis = staff_data.get('staff_analysis', {})

    # Fall back to the slug when neither name field is present.
    custodian_name = metadata.get('custodian_name') or metadata.get('name', slug)

    # Determine website
    website_url, website_status, original_url = determine_website_status(
        custodian_name, about_data, verification, corrections, no_website
    )

    # Build YAML structure
    result = {
        # Temporary identifier; presumably replaced by a permanent ghcid
        # downstream -- confirm against the consuming pipeline.
        'ghcid_temp': f"ghcid:linkedin:{slug}",
        'name': custodian_name,
        'linkedin_slug': slug,
    }

    # Website info (omitted entirely when no usable URL was found)
    if website_url:
        result['website'] = website_url
        result['website_status'] = website_status
        if original_url:
            result['website_original'] = original_url

    # LinkedIn URL
    result['linkedin_url'] = f"https://www.linkedin.com/company/{slug}"

    # Industry: about-page value wins over the staff-file metadata.
    industry = about_data.get('industry') or metadata.get('industry')
    if industry:
        result['industry'] = industry

    # Location (only structured dict locations are carried over)
    location = metadata.get('location')
    if location and isinstance(location, dict):
        result['location'] = location

    # Institution type
    result['institution_type'] = infer_institution_type(staff_data, about_data)

    # Staff counts
    if metadata.get('follower_count'):
        result['follower_count'] = metadata['follower_count']

    # Prefer the analysis total; fall back to counting the raw staff list.
    result['staff_count'] = staff_analysis.get('total_staff_extracted', len(staff_data.get('staff', [])))
    result['heritage_staff_count'] = staff_analysis.get('heritage_relevant_count', 0)

    # Heritage staff list
    heritage_staff = build_heritage_staff_list(staff_data)
    if heritage_staff:
        result['heritage_staff'] = heritage_staff

    # Provenance
    now = datetime.now(timezone.utc).isoformat()
    result['provenance'] = {
        'schema_version': '1.0.0',
        'generated_at': now,
        'sources': {
            'linkedin_people_page': [{
                'source_type': source_meta.get('source_type', 'linkedin_company_people_page_html'),
                'source_file': source_meta.get('source_file'),
                'extraction_timestamp': source_meta.get('registered_timestamp'),
                'claims_extracted': ['staff', 'industry', 'location', 'follower_count'],
            }]
        }
    }

    # Add about page source if available
    if about_data and about_data.get('source_metadata'):
        about_source = about_data['source_metadata']
        result['provenance']['sources']['linkedin_about_page'] = [{
            'source_type': about_source.get('source_type', 'linkedin_company_about_page_html'),
            'source_file': about_source.get('source_file'),
            'extraction_timestamp': about_source.get('extraction_timestamp'),
            'claims_extracted': ['website_url', 'employee_count'],
        }]

    # Add verification source if website was verified
    if website_status in ('verified', 'corrected'):
        result['provenance']['sources']['url_verification'] = [{
            'source_type': 'http_verification' if website_status == 'verified' else 'manual_correction',
            'verification_timestamp': now,
            'claims_extracted': ['website_status', 'final_url'],
        }]

    return result


def write_yaml(data: dict, path: Path):
    """Serialize *data* to *path* using whichever YAML backend is available."""
    with open(path, 'w', encoding='utf-8') as f:
        if USE_RUAMEL:
            yaml.dump(data, f)
        else:
            pyyaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def main():
    """Entry point: merge all data sources and emit one YAML file per custodian."""
    parser = argparse.ArgumentParser(description='Generate custodian YAML from LinkedIn data')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without writing files')
    parser.add_argument('--limit', type=int, help='Limit number of custodians to process')
    parser.add_argument('--slug', type=str, help='Process only a specific custodian slug')
    args = parser.parse_args()

    print("Loading data sources...")

    # Load all data sources
    staff_by_slug = load_parsed_staff()
    print(f" Loaded {len(staff_by_slug)} parsed staff files")

    about_by_slug = load_about_data()
    print(f" Loaded {len(about_by_slug)} about page entries")

    verification_by_name = load_verification_results()
    print(f" Loaded {len(verification_by_name)} verification results")

    corrections, permanently_closed, no_website = load_corrections()
    print(f" Loaded {len(corrections)} URL corrections")
    print(f" Found {len(permanently_closed)} permanently closed")
    print(f" Found {len(no_website)} with no website")

    # Create output directory
    if not args.dry_run:
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Process custodians
    processed = 0
    skipped_closed = 0
    skipped_errors = 0

    # --slug narrows processing to a single custodian; otherwise process all,
    # sorted for deterministic output order.
    slugs_to_process = [args.slug] if args.slug else sorted(staff_by_slug.keys())

    for slug in slugs_to_process:
        # --limit counts successfully processed custodians, not attempts.
        if args.limit and processed >= args.limit:
            break

        if slug not in staff_by_slug:
            print(f"WARNING: Slug not found: {slug}", file=sys.stderr)
            continue

        staff_data = staff_by_slug[slug]
        metadata = staff_data.get('custodian_metadata', {})
        name = metadata.get('custodian_name') or metadata.get('name', slug)

        # Skip permanently closed
        if name in permanently_closed:
            print(f"SKIPPED (permanently closed): {name}")
            skipped_closed += 1
            continue

        try:
            # Get matching about data
            about_data = about_by_slug.get(slug, {})

            # Generate YAML
            yaml_data = generate_custodian_yaml(
                slug,
                staff_data,
                about_data,
                verification_by_name,
                corrections,
                no_website
            )

            if args.dry_run:
                print(f"Would generate: {slug}.yaml ({name})")
                if processed < 3:  # Show first 3 examples
                    if USE_RUAMEL:
                        from io import StringIO
                        stream = StringIO()
                        yaml.dump(yaml_data, stream)
                        print(stream.getvalue()[:500])
                    else:
                        print(pyyaml.dump(yaml_data, default_flow_style=False)[:500])
                    print("---")
            else:
                output_path = OUTPUT_DIR / f"{slug}.yaml"
                write_yaml(yaml_data, output_path)
                print(f"Generated: {output_path.name}")

            processed += 1

        except Exception as e:
            # NOTE(review): broad catch keeps one bad custodian from aborting
            # the whole run; the error is reported and counted below.
            print(f"ERROR processing {slug}: {e}", file=sys.stderr)
            skipped_errors += 1
            continue

    print("\n" + "="*60)
    print(f"SUMMARY:")
    print(f" Processed: {processed}")
    print(f" Skipped (closed): {skipped_closed}")
    print(f" Skipped (errors): {skipped_errors}")
    if not args.dry_run:
        print(f" Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()