#!/usr/bin/env python3
"""
Fast index builder for LinkedIn slug to NL-* file mapping.

Scans the raw text of every ``NL-*.yaml`` file in ``CUSTODIAN_DIR`` with
regular expressions (no YAML parsing, for speed) and writes a JSON object
mapping each LinkedIn company slug to the project-relative path of the
file that mentions it.
"""

import os
import re
import json
from pathlib import Path

PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
OUTPUT_FILE = PROJECT_ROOT / "data" / "custodian" / "linkedin" / "_nl_file_index.json"

# Regex to find LinkedIn slugs in file content
LINKEDIN_URL_PATTERN = re.compile(r'linkedin\.com/company/([a-z0-9-]+)', re.IGNORECASE)
# Only spaces/tabs may separate the key from its value. `\s*` would also
# cross a line break, so an empty `linkedin_company_id:` would capture the
# first token of the following line as if it were a slug.
LINKEDIN_ID_PATTERN = re.compile(r'linkedin_company_id:[ \t]*([a-z0-9-]+)', re.IGNORECASE)

# Strict ASCII check applied after lower-casing. With re.IGNORECASE on a str
# pattern, `[a-z]` can also match a handful of non-ASCII letters; those
# candidates are rejected here.
_SLUG_PATTERN = re.compile(r'[a-z0-9-]+')


def _extract_slugs(content):
    """Return the set of lower-cased LinkedIn slugs found in *content*.

    Both company URLs (``linkedin.com/company/<slug>``) and
    ``linkedin_company_id: <slug>`` entries are recognised.
    """
    slugs = set()
    for pattern in (LINKEDIN_URL_PATTERN, LINKEDIN_ID_PATTERN):
        for match in pattern.finditer(content):
            slug = match.group(1).lower()
            if _SLUG_PATTERN.fullmatch(slug):
                slugs.add(slug)
    return slugs


def main():
    """Build the slug -> file index and write it to ``OUTPUT_FILE``.

    Files are processed in sorted path order, and the first file that
    mentions a slug owns it, so the resulting index is reproducible.
    Files that cannot be read or decoded as UTF-8 are reported and skipped.
    """
    print("Building LinkedIn slug index from NL-* files...")

    index = {}  # slug -> filepath
    # glob() yields entries in an unspecified order; sort so that duplicate
    # slugs always resolve to the same file.
    nl_files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total = len(nl_files)
    print(f"Scanning {total} NL-* files...")

    for i, filepath in enumerate(nl_files):
        if i % 200 == 0:
            print(f" Progress: {i}/{total}")

        # Keep the try body minimal: only the read is expected to fail.
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f" Error reading {filepath.name}: {e}")
            continue

        # Add to index (first file wins for a given slug)
        rel_path = str(filepath.relative_to(PROJECT_ROOT))
        for slug in _extract_slugs(content):
            index.setdefault(slug, rel_path)

    print(f"\nIndexed {len(index)} LinkedIn slugs")

    # Save index; create the target directory so a fresh checkout does not
    # fail after the whole scan has already been done.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, sort_keys=True)

    print(f"Saved to: {OUTPUT_FILE.relative_to(PROJECT_ROOT)}")

    # Show some examples
    print("\nSample entries:")
    for slug, path in list(index.items())[:10]:
        print(f" {slug} -> {path}")


if __name__ == "__main__":
    main()