#!/usr/bin/env python3
"""
Enrich skipped entries with websites found via web search.

Uses the Exa API to find official websites for institutions listed in the
archive state file, writes any hits back into the per-entry YAML files, and
removes successfully enriched entries from the skipped list.
"""
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

import httpx
import yaml

ENTRIES_DIR = Path('data/nde/enriched/entries')
STATE_FILE = ENTRIES_DIR / 'web' / '_archive_state.yaml'

# Exa API setup
EXA_API_KEY = os.environ.get('EXA_API_KEY')
EXA_ENDPOINT = "https://api.exa.ai/search"


def search_website(org_name: str) -> str | None:
    """Return the URL of the top Exa search hit for *org_name*, or None.

    Returns None when the API key is missing, the HTTP request fails, the
    API answers with a non-200 status, or the search produced no results.
    """
    if not EXA_API_KEY:
        print("Error: EXA_API_KEY not set")
        return None

    query = f"{org_name} official website Netherlands"
    headers = {
        "Authorization": f"Bearer {EXA_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "query": query,
        "numResults": 3,
        "type": "auto"
    }

    try:
        response = httpx.post(EXA_ENDPOINT, json=payload, headers=headers, timeout=30)
        if response.status_code == 200:
            data = response.json()
            results = data.get('results', [])
            if results:
                # Return first result URL
                return results[0].get('url')
        else:
            # Surface API failures instead of silently returning None.
            print(f"  Error: Exa API returned HTTP {response.status_code}")
    except httpx.HTTPError as e:
        # Narrowed from bare Exception: only network/timeout/protocol errors
        # are expected here; anything else is a bug and should propagate.
        print(f"  Error searching: {e}")
    return None


def main(limit: int = 20) -> None:
    """Process up to *limit* skipped entries, enriching them with websites.

    Reads STATE_FILE, searches for a website for each non-excluded skipped
    entry, writes hits into the entry YAML under ``web_enrichment``, removes
    enriched entries from the skipped list, and saves the updated state.
    """
    # Load state; an empty YAML file loads as None, so guard with {}.
    with open(STATE_FILE) as f:
        state = yaml.safe_load(f) or {}

    # setdefault also protects the state['skipped_entries'] lookups below
    # when the key is absent from the state file.
    skipped = state.setdefault('skipped_entries', [])
    print(f"Found {len(skipped)} skipped entries")

    # Track results
    found_websites = []

    # Iterate over a slice copy: entries may be removed from the live list.
    for entry_name in skipped[:limit]:
        filepath = ENTRIES_DIR / entry_name
        if not filepath.exists():
            continue

        with open(filepath) as f:
            data = yaml.safe_load(f)

        oe = data.get('original_entry', {})
        org_name = oe.get('organisatie', '')
        remarks = (oe.get('opmerkingen_inez', '') or '').lower()

        # Skip if explicitly noted as no website or defunct
        # ("geen website" = "no website", "bestaat niet meer" = "no longer exists").
        if 'geen website' in remarks:
            print(f"SKIP {entry_name}: No website noted")
            continue
        if 'bestaat niet meer' in remarks:
            print(f"SKIP {entry_name}: Institution defunct")
            continue

        print(f"\nSearching: {org_name}")
        website = search_website(org_name)

        if website:
            print(f"  FOUND: {website}")
            found_websites.append({
                'entry': entry_name,
                'name': org_name,
                'website': website
            })

            # Update entry file
            if 'web_enrichment' not in data:
                data['web_enrichment'] = {}
            data['web_enrichment']['source_url'] = website
            data['web_enrichment']['source_method'] = 'exa_web_search'
            data['web_enrichment']['search_timestamp'] = datetime.now(timezone.utc).isoformat()

            with open(filepath, 'w') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

            # Remove from skipped list
            if entry_name in state['skipped_entries']:
                state['skipped_entries'].remove(entry_name)
        else:
            print(f"  NOT FOUND")

        time.sleep(1)  # Rate limit

    # Save updated state
    with open(STATE_FILE, 'w') as f:
        yaml.dump(state, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"\n=== SUMMARY ===")
    print(f"Found {len(found_websites)} websites")
    for fw in found_websites:
        print(f"  {fw['name']}: {fw['website']}")


if __name__ == '__main__':
    main()