#!/usr/bin/env python3
"""
Fast index builder mapping LinkedIn company slugs to NL-* files.

Scans the raw file content with regular expressions for speed.
"""

import json
import os
import re
from pathlib import Path

# Project root: the parent of the directory that contains this script.
# resolve() makes the path absolute first, so the root is still correct when
# __file__ is a bare relative name (e.g. the script is started from its own
# directory), where ".parent.parent" would otherwise collapse to ".".
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Directory that holds the NL-*.yaml files to scan.
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
# Destination of the generated slug -> file index (JSON).
OUTPUT_FILE = CUSTODIAN_DIR / "linkedin" / "_nl_file_index.json"

# Regexes to find LinkedIn slugs in file content; group 1 is the slug.
LINKEDIN_URL_PATTERN = re.compile(r'linkedin\.com/company/([a-z0-9-]+)', re.IGNORECASE)
LINKEDIN_ID_PATTERN = re.compile(r'linkedin_company_id:\s*([a-z0-9-]+)', re.IGNORECASE)


def main():
    """Build the LinkedIn slug -> NL-* file index and save it as JSON.

    Every ``NL-*.yaml`` file directly inside ``CUSTODIAN_DIR`` is read as
    UTF-8 text and searched with ``LINKEDIN_URL_PATTERN`` and
    ``LINKEDIN_ID_PATTERN``. Each lower-cased slug is mapped to the path of
    the file (relative to ``PROJECT_ROOT``) in which it was found. If several
    files mention the same slug, the first file in sorted path order wins.

    The mapping is written to ``OUTPUT_FILE`` (its directory is created when
    missing); progress and up to ten sample entries are printed to stdout.
    Files that cannot be read or decoded are reported and skipped.
    """
    print("Building LinkedIn slug index from NL-* files...")

    index = {}  # slug -> file path relative to PROJECT_ROOT

    # glob() returns entries in no particular order; sorting makes the
    # "first file wins" rule below reproducible from run to run.
    nl_files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total = len(nl_files)
    print(f"Scanning {total} NL-* files...")

    # The search patterns are case-insensitive, which for str patterns also
    # lets a few non-ASCII letters through; this strict check keeps only
    # plain lowercase ASCII slugs after lower-casing.
    is_valid_slug = re.compile(r'[a-z0-9-]+').fullmatch

    for i, filepath in enumerate(nl_files):
        if i % 200 == 0:
            print(f" Progress: {i}/{total}")

        # Only the read can reasonably fail, so only the read is guarded.
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            print(f" Error reading {filepath.name}: {e}")
            continue

        # Find all LinkedIn slugs mentioned in this file
        slugs = set()
        for pattern in (LINKEDIN_URL_PATTERN, LINKEDIN_ID_PATTERN):
            for match in pattern.finditer(content):
                slug = match.group(1).lower()
                if is_valid_slug(slug):
                    slugs.add(slug)

        # Add to index; an earlier file's claim on a slug is never replaced
        rel_path = str(filepath.relative_to(PROJECT_ROOT))
        for slug in slugs:
            index.setdefault(slug, rel_path)

    print(f"\nIndexed {len(index)} LinkedIn slugs")

    # Save index; create the target directory so a fresh checkout works
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, sort_keys=True)

    print(f"Saved to: {OUTPUT_FILE.relative_to(PROJECT_ROOT)}")

    # Show some examples
    print("\nSample entries:")
    for slug, path in list(index.items())[:10]:
        print(f" {slug} -> {path}")


# Run the index build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()