glam/scripts/enrich_urls_exa.py
2025-11-29 18:05:16 +01:00

102 lines
2.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich NDE entries with URLs using Exa web search.
Adds discovered URLs to YAML files with proper provenance.
"""
import yaml
import json
import sys
import os
from pathlib import Path
from datetime import datetime, timezone
import time
import subprocess
import re
# Configuration
ENTRIES_DIR = Path("data/nde/enriched/entries")
NEEDS_URLS_FILE = Path("data/nde/enriched/_entries_needing_urls.yaml")
LOG_FILE = Path("data/nde/enriched/_url_enrichment_log.yaml")
def load_entries_needing_urls():
"""Load list of entries that need URL enrichment."""
with open(NEEDS_URLS_FILE, 'r') as f:
return yaml.safe_load(f)
def search_url_with_exa(name: str, wikidata_id: str = None) -> dict:
"""
Search for organization URL using Exa.
Returns dict with url, title, and source info.
"""
# Build search query
query = f'"{name}" Nederland official website'
if wikidata_id:
query = f'"{name}" official website homepage'
# Note: This would normally call the Exa MCP tool
# For now, we return a placeholder that will be filled by the agent
return {
'query': query,
'name': name,
'wikidata_id': wikidata_id
}
def update_entry_with_url(entry_file: str, url: str, search_query: str):
"""Update YAML entry with discovered URL."""
file_path = ENTRIES_DIR / entry_file
with open(file_path, 'r') as f:
data = yaml.safe_load(f)
if not isinstance(data, dict):
data = {'original_entry': {}}
# Add URL to original_entry if not present
if 'original_entry' not in data:
data['original_entry'] = {}
if isinstance(data['original_entry'], dict):
data['original_entry']['webadres_organisatie'] = url
# Add enrichment record
if 'url_enrichment' not in data:
data['url_enrichment'] = {}
data['url_enrichment'] = {
'discovered_url': url,
'search_query': search_query,
'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
'enrichment_method': 'exa_web_search',
'status': 'SUCCESS'
}
# Save
with open(file_path, 'w') as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
return True
def main():
entries = load_entries_needing_urls()
print(f"Found {len(entries)} entries needing URLs")
# Print search queries for all entries
print("\n" + "="*80)
print("SEARCH QUERIES FOR EXA")
print("="*80 + "\n")
for i, entry in enumerate(entries):
name = entry.get('name', 'Unknown')
wikidata_id = entry.get('wikidata_id')
query = f'"{name}" Nederland official website'
print(f"{i+1}. {entry['file']}")
print(f" Name: {name}")
if wikidata_id:
print(f" Wikidata: {wikidata_id}")
print(f" Search: {query}")
print()
if __name__ == "__main__":
main()