#!/usr/bin/env python3
|
|
"""
|
|
Test script to show LinkedIn URLs that would be fetched from staff files.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
import re
|
|
|
|
|
|
def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Collect unique LinkedIn URLs from a parsed staff-data dictionary.

    Each entry under the 'staff' key may carry the URL in either the
    'linkedin_url' or the 'linkedin_profile_url' field; the first non-empty
    one wins. Order of first appearance is preserved and duplicates are
    skipped.

    Args:
        staff_data: Parsed JSON object for one staff file.

    Returns:
        LinkedIn URLs in first-seen order, without duplicates.
    """
    collected: List[str] = []
    for person in staff_data.get('staff', []):
        # The staff files are inconsistent about the field name, so
        # check both spellings.
        profile = person.get('linkedin_url') or person.get('linkedin_profile_url')
        if profile and profile not in collected:
            collected.append(profile)
    return collected
def load_staff_files(directory: Path) -> List[str]:
    """Load every ``*.json`` staff file in *directory* and gather LinkedIn URLs.

    For each file a short report is printed (URL count, first three URLs);
    a summary with totals follows. Files that cannot be read or parsed are
    reported and skipped (best effort) rather than aborting the whole run.

    Args:
        directory: Folder containing staff JSON files.

    Returns:
        De-duplicated list of LinkedIn URLs in first-seen order.
    """
    all_urls: List[str] = []
    file_count = 0

    # Sort so output and URL order are deterministic across platforms
    # (Path.glob order is filesystem-dependent).
    for file_path in sorted(directory.glob("*.json")):
        file_count += 1
        # Keep the try body to just the operations that can plausibly fail:
        # reading the file and parsing/traversing its JSON.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            urls = extract_linkedin_urls(data)
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # OSError: unreadable file; ValueError covers JSONDecodeError;
            # TypeError/AttributeError: JSON of an unexpected shape.
            print(f"Error loading {file_path}: {e}")
            continue

        all_urls.extend(urls)

        print(f"\n{file_path.name}: {len(urls)} LinkedIn URLs")
        for url in urls[:3]:  # Show first 3
            print(f"  - {url}")
        if len(urls) > 3:
            print(f"  ... and {len(urls) - 3} more")

    # Remove duplicates while preserving order; dict.fromkeys keeps
    # insertion order and is O(n) versus the manual seen-set loop.
    unique_urls = list(dict.fromkeys(all_urls))

    print("\n" + "=" * 60)
    print("SUMMARY:")
    print(f"  Files processed: {file_count}")
    print(f"  Total URLs found: {len(all_urls)}")
    print(f"  Unique URLs: {len(unique_urls)}")

    return unique_urls
def main():
    """CLI entry point.

    Expects exactly one argument (the staff-file directory), collects the
    LinkedIn URLs, and writes them one per line to
    ``linkedin_urls_to_fetch.txt`` in the current directory.
    """
    if len(sys.argv) != 2:
        print("Usage: python test_linkedin_urls.py <staff_directory>")
        sys.exit(1)

    urls = load_staff_files(Path(sys.argv[1]))

    # Persist the collected URLs, one per line, for manual review.
    with open("linkedin_urls_to_fetch.txt", "w") as f:
        f.writelines(f"{url}\n" for url in urls)

    print("\nURLs saved to: linkedin_urls_to_fetch.txt")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()