#!/usr/bin/env python3
"""Merge LinkedIn custodian data into existing NL-* custodian files.

This script:
1. Loads all LinkedIn custodian YAML files from data/custodian/linkedin/
2. Matches them to existing NL-* files via:
   a. LinkedIn slug (from pre-built index)
   b. Name similarity (from _name_matches.json)
3. Adds linkedin_enrichment section to matched files (ADDITIVE ONLY per Rule 5)
4. Reports unmatched LinkedIn records for later processing

Usage:
    python scripts/merge_linkedin_to_custodians.py [--dry-run] [--limit N]
    python scripts/merge_linkedin_to_custodians.py --use-name-matches [--min-score 80]
"""

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Project root and well-known data locations (script lives in scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
LINKEDIN_DIR = PROJECT_ROOT / "data" / "custodian" / "linkedin"
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
INDEX_FILE = LINKEDIN_DIR / "_nl_file_index.json"
NAME_MATCHES_FILE = LINKEDIN_DIR / "_name_matches.json"


def load_yaml(filepath: Path) -> dict:
    """Load a YAML file, returning {} for an empty document."""
    with open(filepath, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def save_yaml(filepath: Path, data: dict) -> None:
    """Save data to a YAML file with nice formatting."""
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(
            data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )


def load_name_matches(min_score: int = 80) -> dict[str, Path]:
    """Load name-based matches from _name_matches.json.

    Args:
        min_score: Minimum similarity score (0-100) to include a match.

    Returns:
        Dict mapping LinkedIn slug to NL-* file path. Empty if the
        matches file does not exist.
    """
    if not NAME_MATCHES_FILE.exists():
        return {}

    with open(NAME_MATCHES_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    matches: dict[str, Path] = {}
    for match in data.get("matches", []):
        if match.get("score", 0) >= min_score:
            # NOTE(review): unlike load_nl_file_index(), these paths are used
            # as-is (not joined to PROJECT_ROOT) — confirm _name_matches.json
            # stores absolute or cwd-relative paths.
            matches[match["linkedin_slug"]] = Path(match["nl_file"])
    return matches


def load_nl_file_index() -> dict[str, Path]:
    """Load pre-built index mapping LinkedIn slugs to NL-* custodian files.

    Run scripts/build_linkedin_index.py first to create this index.
    Exits the process with status 1 if the index file is missing.
    """
    print("Loading NL-* file index...")
    if not INDEX_FILE.exists():
        print(f"  ERROR: Index file not found: {INDEX_FILE}")
        print("  Run: python scripts/build_linkedin_index.py")
        sys.exit(1)

    with open(INDEX_FILE, "r", encoding="utf-8") as f:
        raw_index = json.load(f)

    # Index stores project-relative paths; resolve against PROJECT_ROOT.
    index = {slug: PROJECT_ROOT / path for slug, path in raw_index.items()}
    print(f"  Loaded {len(index)} LinkedIn slug mappings")
    return index


def load_linkedin_custodians() -> list[tuple[Path, dict]]:
    """Load all LinkedIn custodian YAML files.

    Files whose names start with "_" (index/metadata files) are skipped.
    Unreadable files are reported on stderr and skipped. Results are
    sorted by filename so --limit behaves deterministically.
    """
    print("Loading LinkedIn custodian files...")
    custodians: list[tuple[Path, dict]] = []
    for filepath in sorted(LINKEDIN_DIR.glob("*.yaml")):
        if filepath.name.startswith("_"):
            continue  # Skip excluded files
        try:
            data = load_yaml(filepath)
            custodians.append((filepath, data))
        except Exception as e:
            print(f"Warning: Error loading {filepath}: {e}", file=sys.stderr)
    print(f"  Loaded {len(custodians)} LinkedIn custodians")
    return custodians


def create_linkedin_enrichment(linkedin_data: dict, source_file: Path) -> dict:
    """Create a linkedin_enrichment section from LinkedIn data.

    Args:
        linkedin_data: Parsed LinkedIn custodian YAML.
        source_file: Path of the LinkedIn file, recorded in provenance.

    Returns:
        Enrichment dict with top-level None values removed. (Nested
        provenance values are kept even if None.)
    """
    provenance = linkedin_data.get("provenance", {})
    enrichment = {
        "linkedin_url": linkedin_data.get("linkedin_url"),
        "linkedin_slug": linkedin_data.get("linkedin_slug"),
        "industry": linkedin_data.get("industry"),
        "follower_count": linkedin_data.get("follower_count"),
        "staff_count": linkedin_data.get("staff_count"),
        "heritage_staff_count": linkedin_data.get("heritage_staff_count"),
        "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
        "provenance": {
            "source": "linkedin_company_scrape",
            "original_file": str(source_file.relative_to(PROJECT_ROOT)),
            "schema_version": provenance.get("schema_version", "1.0.0"),
            "generated_at": provenance.get("generated_at"),
        },
    }

    # Add heritage_staff if present
    if heritage_staff := linkedin_data.get("heritage_staff"):
        enrichment["heritage_staff"] = heritage_staff

    # Remove None values (top level only)
    return {k: v for k, v in enrichment.items() if v is not None}


def merge_linkedin_data(
    nl_filepath: Path,
    linkedin_data: dict,
    linkedin_filepath: Path,
    dry_run: bool = False,
) -> bool:
    """Merge LinkedIn data into an existing NL-* file.

    Skips (returns False) when the file already carries an enrichment
    for the same slug; an enrichment for a *different* slug is replaced.
    The write is additive — only the linkedin_enrichment key is touched.

    Returns:
        True if a merge happened (or would happen in dry-run), False on
        skip or error.
    """
    try:
        nl_data = load_yaml(nl_filepath)

        # Check if already has linkedin_enrichment for this same slug.
        if "linkedin_enrichment" in nl_data:
            existing_slug = nl_data["linkedin_enrichment"].get("linkedin_slug")
            new_slug = linkedin_data.get("linkedin_slug")
            if existing_slug == new_slug:
                print(f"  Skipping {nl_filepath.name}: already has linkedin_enrichment for {new_slug}")
                return False

        # Create enrichment section
        enrichment = create_linkedin_enrichment(linkedin_data, linkedin_filepath)

        # Add to NL data (ADDITIVE - per Rule 5)
        nl_data["linkedin_enrichment"] = enrichment

        if dry_run:
            print(f"  [DRY-RUN] Would add linkedin_enrichment to {nl_filepath.name}")
            print(f"    LinkedIn: {linkedin_data.get('name')} ({linkedin_data.get('linkedin_slug')})")
            staff_count = linkedin_data.get("heritage_staff_count", 0)
            print(f"    Staff: {staff_count} heritage-relevant")
        else:
            save_yaml(nl_filepath, nl_data)
            print(f"  Merged: {linkedin_filepath.name} -> {nl_filepath.name}")
        return True

    except Exception as e:
        print(f"  Error merging {linkedin_filepath.name}: {e}", file=sys.stderr)
        return False


def main() -> int:
    """CLI entry point. Returns 0 on success (any merge, or dry-run), else 1."""
    parser = argparse.ArgumentParser(description="Merge LinkedIn custodian data into NL-* files")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing")
    parser.add_argument("--limit", type=int, help="Limit number of merges for testing")
    parser.add_argument("--use-name-matches", action="store_true",
                        help="Include name-based matches from _name_matches.json")
    parser.add_argument("--min-score", type=int, default=80,
                        help="Minimum name match score (0-100, default: 80)")
    args = parser.parse_args()

    print("=" * 60)
    print("LinkedIn Custodian Merge Script")
    print("=" * 60)

    if args.dry_run:
        print("*** DRY RUN MODE - No files will be modified ***\n")

    # Load pre-built index of LinkedIn slugs in existing NL-* files
    nl_index = load_nl_file_index()

    # Optionally add name-based matches
    if args.use_name_matches:
        name_matches = load_name_matches(args.min_score)
        print(f"  Adding {len(name_matches)} name-based matches (score >= {args.min_score})")
        # Name matches supplement slug matches (slug matches take priority)
        for slug, path in name_matches.items():
            if slug not in nl_index:
                nl_index[slug] = path
        print(f"  Total mappings after merge: {len(nl_index)}")

    # Load LinkedIn custodians
    linkedin_custodians = load_linkedin_custodians()

    # Match and merge
    print("\nMatching and merging...")
    matched = 0
    merged = 0
    unmatched: list[tuple[Path, dict]] = []
    already_enriched = 0

    for linkedin_path, linkedin_data in linkedin_custodians:
        if args.limit and merged >= args.limit:
            print(f"\nReached limit of {args.limit} merges")
            break

        # Fall back to the filename stem when the YAML lacks a slug.
        slug = linkedin_data.get("linkedin_slug") or linkedin_path.stem
        if slug in nl_index:
            matched += 1
            nl_path = nl_index[slug]
            if merge_linkedin_data(nl_path, linkedin_data, linkedin_path, args.dry_run):
                merged += 1
            else:
                already_enriched += 1
        else:
            unmatched.append((linkedin_path, linkedin_data))

    # Report
    print("\n" + "=" * 60)
    print("MERGE SUMMARY")
    print("=" * 60)
    print(f"Total LinkedIn custodians: {len(linkedin_custodians)}")
    print(f"Matched to NL-* files: {matched}")
    print(f"Successfully merged: {merged}")
    print(f"Already enriched (skipped): {already_enriched}")
    print(f"Unmatched (need new files): {len(unmatched)}")

    if unmatched:
        print("\n--- Top 20 Unmatched LinkedIn Custodians ---")
        for linkedin_path, linkedin_data in unmatched[:20]:
            name = linkedin_data.get("name", "Unknown")
            slug = linkedin_data.get("linkedin_slug", linkedin_path.stem)
            location = linkedin_data.get("location", {})
            city = location.get("city", "?")
            country = location.get("country", "?")
            print(f"  {slug}: {name} ({city}, {country})")
        if len(unmatched) > 20:
            print(f"  ... and {len(unmatched) - 20} more")

        # Save unmatched list for later processing — but honor --dry-run's
        # promise that no files are modified.
        unmatched_file = LINKEDIN_DIR / "_unmatched.txt"
        if args.dry_run:
            print(f"\n[DRY-RUN] Would save unmatched slugs to: "
                  f"{unmatched_file.relative_to(PROJECT_ROOT)}")
        else:
            with open(unmatched_file, "w", encoding="utf-8") as f:
                for linkedin_path, linkedin_data in unmatched:
                    f.write(f"{linkedin_path.stem}\n")
            print(f"\nUnmatched slugs saved to: {unmatched_file.relative_to(PROJECT_ROOT)}")

    return 0 if merged > 0 or args.dry_run else 1


if __name__ == "__main__":
    sys.exit(main())