glam/scripts/enrich_missing_websites.py
2025-11-30 23:30:29 +01:00

128 lines
3.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich skipped entries with websites found via web search.
Uses Exa API to find official websites for institutions.
"""
import os
import sys
import yaml
import time
import httpx
from pathlib import Path
from datetime import datetime, timezone
ENTRIES_DIR = Path('data/nde/enriched/entries')
STATE_FILE = ENTRIES_DIR / 'web' / '_archive_state.yaml'
# Exa API setup
EXA_API_KEY = os.environ.get('EXA_API_KEY')
EXA_ENDPOINT = "https://api.exa.ai/search"
def search_website(org_name: str) -> str | None:
    """Return the first website URL Exa's search finds for *org_name*.

    Sends a POST to the Exa ``/search`` endpoint asking for up to three
    results, and returns the URL of the top hit.

    Args:
        org_name: Organisation name to search for; the query is biased
            toward official Dutch sites ("official website Netherlands").

    Returns:
        The first result URL, or ``None`` when the API key is missing,
        the request fails, the response is not HTTP 200, or no results
        came back.
    """
    if not EXA_API_KEY:
        print("Error: EXA_API_KEY not set")
        return None

    query = f"{org_name} official website Netherlands"
    headers = {
        "Authorization": f"Bearer {EXA_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "query": query,
        "numResults": 3,
        "type": "auto",
    }
    try:
        response = httpx.post(EXA_ENDPOINT, json=payload, headers=headers, timeout=30)
        if response.status_code == 200:
            results = response.json().get('results', [])
            if results:
                # Trust Exa's ranking: take the top hit.
                return results[0].get('url')
        else:
            # Previously non-200 responses were silently dropped; log them
            # so rate limits / auth failures are visible.
            print(f"  Error searching: HTTP {response.status_code}")
    except (httpx.HTTPError, ValueError) as e:
        # httpx.HTTPError covers transport/timeout errors; ValueError
        # covers a non-JSON response body.
        print(f"  Error searching: {e}")
    return None
def main():
    """Enrich up to 20 skipped entries with websites found via Exa search.

    Reads the archive state file, searches for an official website for each
    skipped entry (unless its remarks say it has none or is defunct), writes
    any found URL back into the entry's YAML under ``web_enrichment``,
    removes enriched entries from the skipped list, and saves the state.
    """
    # Load state; an empty file makes safe_load return None, so default to {}.
    with open(STATE_FILE, encoding='utf-8') as f:
        state = yaml.safe_load(f) or {}
    skipped = state.get('skipped_entries', [])
    print(f"Found {len(skipped)} skipped entries")

    # Track results
    found_websites = []
    for entry_name in skipped[:20]:  # Process first 20
        filepath = ENTRIES_DIR / entry_name
        if not filepath.exists():
            continue
        with open(filepath, encoding='utf-8') as f:
            data = yaml.safe_load(f)
        oe = data.get('original_entry', {})
        org_name = oe.get('organisatie', '')
        remarks = oe.get('opmerkingen_inez', '') or ''

        # Skip if explicitly noted as no website or defunct
        # ("geen website" = "no website", "bestaat niet meer" = "no longer exists").
        if 'geen website' in remarks.lower():
            print(f"SKIP {entry_name}: No website noted")
            continue
        if 'bestaat niet meer' in remarks.lower():
            print(f"SKIP {entry_name}: Institution defunct")
            continue

        print(f"\nSearching: {org_name}")
        website = search_website(org_name)
        if website:
            print(f"  FOUND: {website}")
            found_websites.append({
                'entry': entry_name,
                'name': org_name,
                'website': website,
            })
            # Update entry file with provenance of the found URL.
            if 'web_enrichment' not in data:
                data['web_enrichment'] = {}
            data['web_enrichment']['source_url'] = website
            data['web_enrichment']['source_method'] = 'exa_web_search'
            data['web_enrichment']['search_timestamp'] = datetime.now(timezone.utc).isoformat()
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            # Remove from skipped list (safe: we iterate over a slice copy).
            if entry_name in state['skipped_entries']:
                state['skipped_entries'].remove(entry_name)
        else:
            print("  NOT FOUND")
        time.sleep(1)  # Rate limit

    # Save updated state
    with open(STATE_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(state, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\n=== SUMMARY ===")
    print(f"Found {len(found_websites)} websites")
    for fw in found_websites:
        print(f"  {fw['name']}: {fw['website']}")


if __name__ == '__main__':
    main()