# glam/scripts/test_linkedin_urls.py
# 2025-12-11 22:32:09 +01:00
# 82 lines, No EOL, 2.3 KiB, Python
#!/usr/bin/env python3
"""
Test script to show LinkedIn URLs that would be fetched from staff files.
"""
import json
import sys
from pathlib import Path
from typing import Dict, List
import re
def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Extract unique LinkedIn URLs from a parsed staff-file dict.

    Args:
        staff_data: Parsed JSON object; expected to hold a ``"staff"`` key
            mapping to a list of person dicts. Each person may carry the URL
            under either ``"linkedin_url"`` or ``"linkedin_profile_url"``.

    Returns:
        Unique URLs in first-seen order; empty list if ``"staff"`` is absent.
    """
    urls: List[str] = []
    # Track seen URLs in a set: O(1) membership instead of the O(n)
    # list scan per person, keeping the overall pass linear.
    seen = set()
    for person in staff_data.get('staff', []):
        # Check both possible field names used across staff files.
        url = person.get('linkedin_url') or person.get('linkedin_profile_url')
        if url and url not in seen:
            seen.add(url)
            urls.append(url)
    return urls
def load_staff_files(directory: Path) -> List[str]:
"""Load all staff files and extract LinkedIn URLs."""
all_urls = []
file_count = 0
for file_path in directory.glob("*.json"):
file_count += 1
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
urls = extract_linkedin_urls(data)
all_urls.extend(urls)
print(f"\n{file_path.name}: {len(urls)} LinkedIn URLs")
for url in urls[:3]: # Show first 3
print(f" - {url}")
if len(urls) > 3:
print(f" ... and {len(urls) - 3} more")
except Exception as e:
print(f"Error loading {file_path}: {e}")
# Remove duplicates while preserving order
seen = set()
unique_urls = []
for url in all_urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)
print(f"\n" + "="*60)
print(f"SUMMARY:")
print(f" Files processed: {file_count}")
print(f" Total URLs found: {len(all_urls)}")
print(f" Unique URLs: {len(unique_urls)}")
return unique_urls
def main():
    """Main entry point.

    Usage: ``python test_linkedin_urls.py <staff_directory>``

    Scans the given directory for staff JSON files, prints a report, and
    writes the unique URLs to ``linkedin_urls_to_fetch.txt`` in the CWD.
    Exits with status 1 on incorrect usage.
    """
    if len(sys.argv) != 2:
        print("Usage: python test_linkedin_urls.py <staff_directory>")
        sys.exit(1)
    staff_directory = sys.argv[1]
    urls = load_staff_files(Path(staff_directory))
    # Save URLs to a file for review. Explicit UTF-8: the platform default
    # encoding may not round-trip non-ASCII characters in URLs.
    with open("linkedin_urls_to_fetch.txt", "w", encoding="utf-8") as f:
        for url in urls:
            f.write(f"{url}\n")
    print("\nURLs saved to: linkedin_urls_to_fetch.txt")
if __name__ == "__main__":
    main()