# glam/scripts/test_linkedin_urls.py
# 2025-12-11 22:32:09 +01:00
# 82 lines, No EOL, 2.3 KiB, Python
#!/usr/bin/env python3
"""
Test script to show LinkedIn URLs that would be fetched from staff files.
"""
import json
import sys
from pathlib import Path
from typing import Dict, List
import re
def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Extract unique LinkedIn URLs from a parsed staff-file dict.

    Args:
        staff_data: Parsed JSON object; expected to hold a ``"staff"`` key
            mapping to a list of person dicts. Each person may carry the URL
            under either ``"linkedin_url"`` or ``"linkedin_profile_url"``.

    Returns:
        Unique URLs in first-seen order; empty list if ``"staff"`` is absent.
    """
    urls: List[str] = []
    # Track seen URLs in a set: O(1) membership instead of the O(n)
    # list scan per person, keeping the overall pass linear.
    seen = set()
    for person in staff_data.get('staff', []):
        # Check both possible field names used across staff files.
        url = person.get('linkedin_url') or person.get('linkedin_profile_url')
        if url and url not in seen:
            seen.add(url)
            urls.append(url)
    return urls
def load_staff_files(directory: Path) -> List[str]:
"""Load all staff files and extract LinkedIn URLs."""
all_urls = []
file_count = 0
for file_path in directory.glob("*.json"):
file_count += 1
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
urls = extract_linkedin_urls(data)
all_urls.extend(urls)
print(f"\n{file_path.name}: {len(urls)} LinkedIn URLs")
for url in urls[:3]: # Show first 3
print(f" - {url}")
if len(urls) > 3:
print(f" ... and {len(urls) - 3} more")
except Exception as e:
print(f"Error loading {file_path}: {e}")
# Remove duplicates while preserving order
seen = set()
unique_urls = []
for url in all_urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)
print(f"\n" + "="*60)
print(f"SUMMARY:")
print(f" Files processed: {file_count}")
print(f" Total URLs found: {len(all_urls)}")
print(f" Unique URLs: {len(unique_urls)}")
return unique_urls
def main():
    """Main entry point.

    Usage: ``python test_linkedin_urls.py <staff_directory>``

    Scans the given directory for staff JSON files, prints a report, and
    writes the unique URLs to ``linkedin_urls_to_fetch.txt`` in the CWD.
    Exits with status 1 on incorrect usage.
    """
    if len(sys.argv) != 2:
        print("Usage: python test_linkedin_urls.py <staff_directory>")
        sys.exit(1)
    staff_directory = sys.argv[1]
    urls = load_staff_files(Path(staff_directory))
    # Save URLs to a file for review. Explicit UTF-8: the platform default
    # encoding may not round-trip non-ASCII characters in URLs.
    with open("linkedin_urls_to_fetch.txt", "w", encoding="utf-8") as f:
        for url in urls:
            f.write(f"{url}\n")
    print("\nURLs saved to: linkedin_urls_to_fetch.txt")
if __name__ == "__main__":
    main()