glam/scripts/build_linkedin_index.py
2025-12-16 20:27:39 +01:00

72 lines
2.3 KiB
Python

#!/usr/bin/env python3
"""
Fast index builder for LinkedIn slug to NL-* file mapping.
Uses regex on raw file content for speed.
"""
import os
import re
import json
from pathlib import Path
# The project root is the parent of the directory that contains this file.
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT.joinpath("data", "custodian")
# The generated index is written into the "linkedin" folder under CUSTODIAN_DIR.
OUTPUT_FILE = CUSTODIAN_DIR / "linkedin" / "_nl_file_index.json"

# Case-insensitive patterns; capture group 1 is the LinkedIn company slug,
# taken either from a company URL or from a `linkedin_company_id:` field.
LINKEDIN_URL_PATTERN = re.compile(
    r'linkedin\.com/company/([a-z0-9-]+)',
    flags=re.IGNORECASE,
)
LINKEDIN_ID_PATTERN = re.compile(
    r'linkedin_company_id:\s*([a-z0-9-]+)',
    flags=re.IGNORECASE,
)
# Accepts only slugs made of lower-case ASCII letters, digits and hyphens.
_VALID_SLUG = re.compile(r'^[a-z0-9-]+$')


def _extract_slugs(content):
    """Return the set of lower-cased LinkedIn slugs found in *content*.

    Both LINKEDIN_URL_PATTERN and LINKEDIN_ID_PATTERN are applied; capture
    group 1 of every match is lower-cased and kept if it is a valid slug.
    """
    slugs = set()
    for pattern in (LINKEDIN_URL_PATTERN, LINKEDIN_ID_PATTERN):
        for match in pattern.finditer(content):
            slug = match.group(1).lower()
            # Re-validate after lower-casing: with re.IGNORECASE on a str
            # pattern, [a-z] can also match a few non-ASCII letters, which
            # must not end up in the index.
            if _VALID_SLUG.match(slug):
                slugs.add(slug)
    return slugs


def main():
    """Build the LinkedIn slug -> NL-* file index and write it as JSON.

    Every ``NL-*.yaml`` file directly inside CUSTODIAN_DIR is read as UTF-8
    text and scanned for LinkedIn slugs. Each slug is mapped to the path
    (relative to PROJECT_ROOT) of the first file that mentions it; files are
    visited in sorted order so that "first" is the same on every run.
    Files that cannot be read or decoded are reported and skipped.

    The resulting mapping is written to OUTPUT_FILE (its parent directory is
    created if needed) and the first few entries are printed as a sample.
    """
    print("Building LinkedIn slug index from NL-* files...")
    index = {}  # slug -> file path relative to PROJECT_ROOT

    # Sort the matches: the index keeps the first file seen for a slug, so a
    # fixed visiting order keeps the output reproducible when two files
    # mention the same slug.
    nl_files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    print(f"Scanning {len(nl_files)} NL-* files...")

    for i, filepath in enumerate(nl_files):
        if i % 200 == 0:
            print(f" Progress: {i}/{len(nl_files)}")

        # Only the read itself is best-effort; anything else going wrong is a
        # programming error and should surface instead of being reported as
        # a read failure.
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f" Error reading {filepath.name}: {e}")
            continue

        rel_path = str(filepath.relative_to(PROJECT_ROOT))
        for slug in _extract_slugs(content):
            # First file wins: never overwrite an existing entry.
            index.setdefault(slug, rel_path)

    print(f"\nIndexed {len(index)} LinkedIn slugs")

    # Save index. The target directory may not exist yet (e.g. on a fresh
    # checkout), which would otherwise make open() fail.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, sort_keys=True)
    print(f"Saved to: {OUTPUT_FILE.relative_to(PROJECT_ROOT)}")

    # Show some examples
    print("\nSample entries:")
    for slug, path in list(index.items())[:10]:
        print(f" {slug} -> {path}")
# Build the index only when executed as a script, not when imported.
if __name__ == "__main__":
    main()