#!/usr/bin/env python3 """ Generate custodian YAML files from LinkedIn parsed data. Merges data from: - Parsed staff JSON files (data/custodian/person/affiliated/parsed/*.json) - About page data (website URLs, industry, employee count) - URL verification results (alive/dead/redirect status) - URL corrections (resolved dead links) Output: data/custodian/linkedin/{slug}.yaml Usage: python scripts/generate_linkedin_custodian_yaml.py python scripts/generate_linkedin_custodian_yaml.py --dry-run python scripts/generate_linkedin_custodian_yaml.py --limit 10 """ import json import re import sys from pathlib import Path from datetime import datetime, timezone from collections import defaultdict from typing import Any import argparse # Use ruamel.yaml for better YAML output with preserved order try: from ruamel.yaml import YAML yaml = YAML() yaml.default_flow_style = False yaml.allow_unicode = True yaml.width = 120 USE_RUAMEL = True except ImportError: import yaml as pyyaml USE_RUAMEL = False BASE_DIR = Path(__file__).parent.parent DATA_DIR = BASE_DIR / "data/custodian/person/affiliated" PARSED_DIR = DATA_DIR / "parsed" ABOUT_DATA_FILE = DATA_DIR / "about_data/about_data_20251216T152238Z.json" VERIFICATION_FILE = DATA_DIR / "verified_links/verification_results_20251216T152538Z.json" CORRECTIONS_FILE = DATA_DIR / "verified_links/corrected_urls_20251216T160000Z.json" OUTPUT_DIR = BASE_DIR / "data/custodian/linkedin" # Industry to institution type mapping INDUSTRY_TO_TYPE = { "Museums, Historical Sites, and Zoos": ["M"], "Museums and Institutions": ["M"], "Libraries": ["L"], "Archives": ["A"], "Research Services": ["R"], "Higher Education": ["E"], "Non-profit Organizations": ["F"], "Government Administration": ["O"], "Performing Arts": ["C"], "Fine Art": ["G"], "Civic and Social Organizations": ["S"], "Environmental Services": ["B"], "Religious Institutions": ["H"], } def load_parsed_staff() -> dict[str, dict]: """Load most recent parsed JSON per custodian slug. Returns dict keyed by custodian_slug with most recent staff data. """ parsed_files: dict[str, list[tuple[str, Path]]] = defaultdict(list) for json_file in PARSED_DIR.glob("*.json"): # Extract slug from filename: {slug}_staff_{timestamp}.json match = re.match(r"(.+)_staff_(\d{8}T\d{6}Z)\.json", json_file.name) if match: slug = match.group(1) timestamp = match.group(2) parsed_files[slug].append((timestamp, json_file)) # Keep only most recent file per slug result = {} for slug, files in parsed_files.items(): if not files: continue # Sort by timestamp descending files.sort(key=lambda x: x[0], reverse=True) most_recent = files[0][1] try: with open(most_recent, 'r', encoding='utf-8') as f: data = json.load(f) result[slug] = data except (json.JSONDecodeError, FileNotFoundError) as e: print(f"WARNING: Failed to load {most_recent}: {e}", file=sys.stderr) return result def load_about_data() -> dict[str, dict]: """Load about page data indexed by custodian_slug.""" if not ABOUT_DATA_FILE.exists(): print(f"WARNING: About data file not found: {ABOUT_DATA_FILE}", file=sys.stderr) return {} with open(ABOUT_DATA_FILE, 'r', encoding='utf-8') as f: raw = json.load(f) # Handle nested structure - data is in 'data' key entries = [] if isinstance(raw, list): entries = raw elif isinstance(raw, dict) and 'data' in raw: entries = raw['data'] elif isinstance(raw, dict): # Might be keyed by slug already return raw result = {} for entry in entries: slug = entry.get('custodian_slug') if slug: result[slug] = entry return result def load_verification_results() -> dict[str, dict]: """Load URL verification results indexed by custodian_name.""" if not VERIFICATION_FILE.exists(): print(f"WARNING: Verification file not found: {VERIFICATION_FILE}", file=sys.stderr) return {} with open(VERIFICATION_FILE, 'r', encoding='utf-8') as f: raw = json.load(f) results = raw.get('results', []) return {r['custodian_name']: r for r in results} def load_corrections() -> tuple[dict[str, dict], set[str], set[str]]: """Load URL corrections and special cases. Returns: - corrections: dict mapping custodian_name to correction info - permanently_closed: set of custodian names that are permanently closed - no_website: set of custodian names with no website """ if not CORRECTIONS_FILE.exists(): print(f"WARNING: Corrections file not found: {CORRECTIONS_FILE}", file=sys.stderr) return {}, set(), set() with open(CORRECTIONS_FILE, 'r', encoding='utf-8') as f: raw = json.load(f) corrections = {c['custodian_name']: c for c in raw.get('corrections', [])} permanently_closed = {c['custodian_name'] for c in raw.get('permanently_closed', [])} no_website = {c['custodian_name'] for c in raw.get('no_website', [])} # Also add google_business_defunct as having no usable website for item in raw.get('google_business_defunct', []): no_website.add(item['custodian_name']) return corrections, permanently_closed, no_website def infer_institution_type(staff_data: dict, about_data: dict) -> list[str]: """Infer institution type from industry and staff heritage types.""" types_found = set() # From industry industry = (about_data.get('industry') or staff_data.get('custodian_metadata', {}).get('industry')) if industry and industry in INDUSTRY_TO_TYPE: types_found.update(INDUSTRY_TO_TYPE[industry]) # From staff heritage types staff_analysis = staff_data.get('staff_analysis', {}) heritage_types = staff_analysis.get('staff_by_heritage_type', {}) if heritage_types: # Take the most common heritage type sorted_types = sorted(heritage_types.items(), key=lambda x: x[1], reverse=True) if sorted_types: types_found.add(sorted_types[0][0]) # Default to M (Museum) if nothing found if not types_found: types_found.add("M") return sorted(list(types_found)) def determine_website_status( custodian_name: str, about_data: dict, verification: dict, corrections: dict, no_website: set ) -> tuple[str | None, str, str | None]: """Determine final website URL and status. Returns: - website_url: Final URL to use (may be corrected) - status: 'verified', 'corrected', 'unverified', 'dead', 'none' - original_url: Original URL if corrected, else None """ original_url = about_data.get('website_url') # Check if custodian has no website if custodian_name in no_website: return None, 'none', original_url # Check corrections first if custodian_name in corrections: correction = corrections[custodian_name] return correction['corrected_url'], 'corrected', correction.get('original_url', original_url) # No original URL from about page if not original_url: return None, 'none', None # Check verification status if custodian_name in verification: v = verification[custodian_name] if v.get('is_alive'): final_url = v.get('final_url') or original_url return final_url, 'verified', None if final_url == original_url else original_url else: # Dead link not in corrections return original_url, 'dead', None # Not verified return original_url, 'unverified', None def build_heritage_staff_list(staff_data: dict) -> list[dict]: """Build list of heritage-relevant staff members.""" staff = staff_data.get('staff', []) heritage_staff = [] for person in staff: if not person.get('heritage_relevant'): continue entry = { 'name': person.get('name'), 'headline': person.get('headline'), } if person.get('linkedin_profile_url'): entry['linkedin_url'] = person['linkedin_profile_url'] elif person.get('linkedin_slug'): entry['linkedin_url'] = f"https://www.linkedin.com/in/{person['linkedin_slug']}" if person.get('heritage_type'): entry['heritage_type'] = person['heritage_type'] heritage_staff.append(entry) return heritage_staff def generate_custodian_yaml( slug: str, staff_data: dict, about_data: dict, verification: dict, corrections: dict, no_website: set ) -> dict[str, Any]: """Generate YAML structure for a single custodian.""" metadata = staff_data.get('custodian_metadata', {}) source_meta = staff_data.get('source_metadata', {}) staff_analysis = staff_data.get('staff_analysis', {}) custodian_name = metadata.get('custodian_name') or metadata.get('name', slug) # Determine website website_url, website_status, original_url = determine_website_status( custodian_name, about_data, verification, corrections, no_website ) # Build YAML structure result = { 'ghcid_temp': f"ghcid:linkedin:{slug}", 'name': custodian_name, 'linkedin_slug': slug, } # Website info if website_url: result['website'] = website_url result['website_status'] = website_status if original_url: result['website_original'] = original_url # LinkedIn URL result['linkedin_url'] = f"https://www.linkedin.com/company/{slug}" # Industry industry = about_data.get('industry') or metadata.get('industry') if industry: result['industry'] = industry # Location location = metadata.get('location') if location and isinstance(location, dict): result['location'] = location # Institution type result['institution_type'] = infer_institution_type(staff_data, about_data) # Staff counts if metadata.get('follower_count'): result['follower_count'] = metadata['follower_count'] result['staff_count'] = staff_analysis.get('total_staff_extracted', len(staff_data.get('staff', []))) result['heritage_staff_count'] = staff_analysis.get('heritage_relevant_count', 0) # Heritage staff list heritage_staff = build_heritage_staff_list(staff_data) if heritage_staff: result['heritage_staff'] = heritage_staff # Provenance now = datetime.now(timezone.utc).isoformat() result['provenance'] = { 'schema_version': '1.0.0', 'generated_at': now, 'sources': { 'linkedin_people_page': [{ 'source_type': source_meta.get('source_type', 'linkedin_company_people_page_html'), 'source_file': source_meta.get('source_file'), 'extraction_timestamp': source_meta.get('registered_timestamp'), 'claims_extracted': ['staff', 'industry', 'location', 'follower_count'], }] } } # Add about page source if available if about_data and about_data.get('source_metadata'): about_source = about_data['source_metadata'] result['provenance']['sources']['linkedin_about_page'] = [{ 'source_type': about_source.get('source_type', 'linkedin_company_about_page_html'), 'source_file': about_source.get('source_file'), 'extraction_timestamp': about_source.get('extraction_timestamp'), 'claims_extracted': ['website_url', 'employee_count'], }] # Add verification source if website was verified if website_status in ('verified', 'corrected'): result['provenance']['sources']['url_verification'] = [{ 'source_type': 'http_verification' if website_status == 'verified' else 'manual_correction', 'verification_timestamp': now, 'claims_extracted': ['website_status', 'final_url'], }] return result def write_yaml(data: dict, path: Path): """Write YAML file.""" if USE_RUAMEL: with open(path, 'w', encoding='utf-8') as f: yaml.dump(data, f) else: with open(path, 'w', encoding='utf-8') as f: pyyaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) def main(): parser = argparse.ArgumentParser(description='Generate custodian YAML from LinkedIn data') parser.add_argument('--dry-run', action='store_true', help='Print what would be done without writing files') parser.add_argument('--limit', type=int, help='Limit number of custodians to process') parser.add_argument('--slug', type=str, help='Process only a specific custodian slug') args = parser.parse_args() print("Loading data sources...") # Load all data sources staff_by_slug = load_parsed_staff() print(f" Loaded {len(staff_by_slug)} parsed staff files") about_by_slug = load_about_data() print(f" Loaded {len(about_by_slug)} about page entries") verification_by_name = load_verification_results() print(f" Loaded {len(verification_by_name)} verification results") corrections, permanently_closed, no_website = load_corrections() print(f" Loaded {len(corrections)} URL corrections") print(f" Found {len(permanently_closed)} permanently closed") print(f" Found {len(no_website)} with no website") # Create output directory if not args.dry_run: OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # Process custodians processed = 0 skipped_closed = 0 skipped_errors = 0 slugs_to_process = [args.slug] if args.slug else sorted(staff_by_slug.keys()) for slug in slugs_to_process: if args.limit and processed >= args.limit: break if slug not in staff_by_slug: print(f"WARNING: Slug not found: {slug}", file=sys.stderr) continue staff_data = staff_by_slug[slug] metadata = staff_data.get('custodian_metadata', {}) name = metadata.get('custodian_name') or metadata.get('name', slug) # Skip permanently closed if name in permanently_closed: print(f"SKIPPED (permanently closed): {name}") skipped_closed += 1 continue try: # Get matching about data about_data = about_by_slug.get(slug, {}) # Generate YAML yaml_data = generate_custodian_yaml( slug, staff_data, about_data, verification_by_name, corrections, no_website ) if args.dry_run: print(f"Would generate: {slug}.yaml ({name})") if processed < 3: # Show first 3 examples if USE_RUAMEL: from io import StringIO stream = StringIO() yaml.dump(yaml_data, stream) print(stream.getvalue()[:500]) else: print(pyyaml.dump(yaml_data, default_flow_style=False)[:500]) print("---") else: output_path = OUTPUT_DIR / f"{slug}.yaml" write_yaml(yaml_data, output_path) print(f"Generated: {output_path.name}") processed += 1 except Exception as e: print(f"ERROR processing {slug}: {e}", file=sys.stderr) skipped_errors += 1 continue print("\n" + "="*60) print(f"SUMMARY:") print(f" Processed: {processed}") print(f" Skipped (closed): {skipped_closed}") print(f" Skipped (errors): {skipped_errors}") if not args.dry_run: print(f" Output directory: {OUTPUT_DIR}") if __name__ == "__main__": main()