#!/usr/bin/env python3
"""
Test script to show LinkedIn URLs that would be fetched from staff files.
"""
import json
import sys
from pathlib import Path
from typing import Dict, List
import re


def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Extract unique LinkedIn URLs from one parsed staff file.

    Args:
        staff_data: Parsed JSON object expected to contain a ``staff`` list of
            person dicts. Each person may carry the URL under either
            ``linkedin_url`` or ``linkedin_profile_url``.

    Returns:
        URLs in first-seen order with duplicates removed. Missing or empty
        ``staff`` yields an empty list.
    """
    urls: List[str] = []
    seen = set()  # O(1) membership check instead of scanning the list per URL
    for person in staff_data.get('staff', []):
        # Check both possible field names
        url = person.get('linkedin_url') or person.get('linkedin_profile_url')
        if url and url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


def load_staff_files(directory: Path) -> List[str]:
    """Load all ``*.json`` staff files in *directory* and collect LinkedIn URLs.

    Prints a short per-file report (first 3 URLs) and a summary, then returns
    the de-duplicated URLs in first-seen order. Files that fail to parse are
    reported and skipped (best-effort by design).

    Args:
        directory: Directory to scan (non-recursively) for ``*.json`` files.

    Returns:
        Unique LinkedIn URLs across all successfully parsed files.
    """
    all_urls: List[str] = []
    file_count = 0

    for file_path in directory.glob("*.json"):
        file_count += 1
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            urls = extract_linkedin_urls(data)
            all_urls.extend(urls)

            print(f"\n{file_path.name}: {len(urls)} LinkedIn URLs")
            for url in urls[:3]:  # Show first 3
                print(f"  - {url}")
            if len(urls) > 3:
                print(f"  ... and {len(urls) - 3} more")
        except Exception as e:
            # Best-effort: report the bad file and keep processing the rest.
            print(f"Error loading {file_path}: {e}")

    # Remove duplicates while preserving order
    seen = set()
    unique_urls: List[str] = []
    for url in all_urls:
        if url not in seen:
            seen.add(url)
            unique_urls.append(url)

    print("\n" + "=" * 60)
    print("SUMMARY:")
    print(f"  Files processed: {file_count}")
    print(f"  Total URLs found: {len(all_urls)}")
    print(f"  Unique URLs: {len(unique_urls)}")

    return unique_urls


def main():
    """Main entry point: read the staff directory from argv and dump URLs."""
    if len(sys.argv) != 2:
        # Fixed: the usage string previously omitted the required argument.
        print("Usage: python test_linkedin_urls.py <staff_directory>")
        sys.exit(1)

    staff_directory = sys.argv[1]
    urls = load_staff_files(Path(staff_directory))

    # Save URLs to a file for review (explicit encoding to match the readers)
    with open("linkedin_urls_to_fetch.txt", "w", encoding="utf-8") as f:
        for url in urls:
            f.write(f"{url}\n")

    print("\nURLs saved to: linkedin_urls_to_fetch.txt")


if __name__ == "__main__":
    main()