#!/usr/bin/env python3
"""
Enrich NDE entries with URLs using Exa web search.

Adds discovered URLs to YAML files with proper provenance.
"""

import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml

# Configuration
ENTRIES_DIR = Path("data/nde/enriched/entries")
NEEDS_URLS_FILE = Path("data/nde/enriched/_entries_needing_urls.yaml")
LOG_FILE = Path("data/nde/enriched/_url_enrichment_log.yaml")


def load_entries_needing_urls():
    """Load the list of entries that need URL enrichment.

    Returns:
        The parsed content of ``NEEDS_URLS_FILE``. An empty list is returned
        when the file is empty, because ``yaml.safe_load`` yields ``None``
        for an empty document and callers take ``len()`` of the result.
    """
    with open(NEEDS_URLS_FILE, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []


def search_url_with_exa(name: str, wikidata_id: Optional[str] = None) -> dict:
    """Build the Exa search request for an organization's website.

    No search is performed here; the function only assembles the query.

    Args:
        name: Organization name; it is quoted verbatim inside the query.
        wikidata_id: Optional Wikidata identifier. When truthy, the
            "homepage" query variant (without "Nederland") is used.

    Returns:
        A dict with the keys ``query``, ``name`` and ``wikidata_id``.
    """
    # Build search query
    if wikidata_id:
        query = f'"{name}" official website homepage'
    else:
        query = f'"{name}" Nederland official website'

    # Note: This would normally call the Exa MCP tool.
    # For now, we return a placeholder that will be filled by the agent.
    return {
        'query': query,
        'name': name,
        'wikidata_id': wikidata_id,
    }


def update_entry_with_url(entry_file: str, url: str, search_query: str):
    """Store a discovered URL and its provenance in a YAML entry file.

    The URL is written to ``original_entry.webadres_organisatie`` and a
    ``url_enrichment`` record (URL, query, UTC timestamp, method, status)
    replaces any previous one.

    Args:
        entry_file: File name of the entry, relative to ``ENTRIES_DIR``.
        url: The discovered URL.
        search_query: The query that produced ``url``.

    Returns:
        ``True`` once the file has been rewritten.

    Raises:
        OSError: If the entry file cannot be read or written.
        yaml.YAMLError: If the entry file is not valid YAML.
    """
    file_path = ENTRIES_DIR / entry_file

    # Read as UTF-8 explicitly: the file is written with allow_unicode=True,
    # so relying on the platform's locale encoding could corrupt it.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        data = {'original_entry': {}}

    # Make sure there is an original_entry mapping to hold the URL. A key
    # that is present but empty in the YAML parses as None and is treated
    # like a missing one, so the URL is not silently dropped.
    if data.get('original_entry') is None:
        data['original_entry'] = {}

    # Any other non-mapping value is left untouched rather than clobbered;
    # the URL is still recorded in url_enrichment below.
    if isinstance(data['original_entry'], dict):
        data['original_entry']['webadres_organisatie'] = url

    # Add enrichment record (replaces any earlier record wholesale)
    data['url_enrichment'] = {
        'discovered_url': url,
        'search_query': search_query,
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'exa_web_search',
        'status': 'SUCCESS',
    }

    # Serialize before opening the file for writing, so a serialization
    # error cannot leave a truncated entry on disk.
    serialized = yaml.dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    return True


def main():
    """Print the Exa search query for every entry that still needs a URL."""
    entries = load_entries_needing_urls()
    print(f"Found {len(entries)} entries needing URLs")

    # Print search queries for all entries
    print("\n" + "="*80)
    print("SEARCH QUERIES FOR EXA")
    print("="*80 + "\n")

    for number, entry in enumerate(entries, start=1):
        name = entry.get('name', 'Unknown')
        wikidata_id = entry.get('wikidata_id')
        # A missing 'file' key is reported the same way as a missing name
        # instead of aborting the whole listing with a KeyError.
        file_name = entry.get('file', 'Unknown')

        # NOTE(review): this query always includes "Nederland", whereas
        # search_url_with_exa() switches to a different query when a
        # Wikidata ID is present - confirm which form is intended.
        query = f'"{name}" Nederland official website'

        print(f"{number}. {file_name}")
        print(f" Name: {name}")
        if wikidata_id:
            print(f" Wikidata: {wikidata_id}")
        print(f" Search: {query}")
        print()


if __name__ == "__main__":
    main()