#!/usr/bin/env python3
"""
Enrich NDE entries with URLs using Exa web search.

Adds discovered URLs to YAML files with proper provenance.
"""
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml
# Configuration
# Directory holding one enriched YAML file per NDE entry.
ENTRIES_DIR = Path("data/nde/enriched/entries")
# Work list: entries that still lack an organization URL.
NEEDS_URLS_FILE = Path("data/nde/enriched/_entries_needing_urls.yaml")
# NOTE(review): LOG_FILE is never written in this file — presumably consumed
# by the agent driving the enrichment; confirm before removing.
LOG_FILE = Path("data/nde/enriched/_url_enrichment_log.yaml")
def load_entries_needing_urls():
    """Parse and return the YAML work list of entries awaiting URL enrichment."""
    with NEEDS_URLS_FILE.open('r') as handle:
        return yaml.safe_load(handle)
def search_url_with_exa(name: str, wikidata_id: Optional[str] = None) -> dict:
    """
    Build the Exa search payload for an organization.

    Args:
        name: Organization name to search for.
        wikidata_id: Optional Wikidata QID. When present, the entity is
            already disambiguated, so the "Nederland" qualifier is dropped.

    Returns:
        dict with 'query', 'name' and 'wikidata_id' keys.

    Note:
        This would normally call the Exa MCP tool; for now it returns a
        placeholder payload that will be filled in by the agent.
    """
    # Build the query once per branch instead of building and overwriting.
    if wikidata_id:
        query = f'"{name}" official website homepage'
    else:
        query = f'"{name}" Nederland official website'

    return {
        'query': query,
        'name': name,
        'wikidata_id': wikidata_id,
    }
def update_entry_with_url(entry_file: str, url: str, search_query: str):
    """
    Persist a discovered URL into an entry's YAML file.

    Writes the URL into ``original_entry.webadres_organisatie`` and records
    a ``url_enrichment`` provenance block, then saves the file in place.

    Args:
        entry_file: Filename (relative to ENTRIES_DIR) of the entry to update.
        url: The URL discovered for the organization.
        search_query: The search query that produced the URL (provenance).

    Returns:
        True on success; I/O or YAML errors propagate to the caller.
    """
    file_path = ENTRIES_DIR / entry_file

    # Explicit UTF-8: the dump below uses allow_unicode=True, so entries may
    # contain non-ASCII text that the locale default encoding would mangle.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Guard against empty/malformed files (safe_load may return None).
    if not isinstance(data, dict):
        data = {'original_entry': {}}

    # Add URL to original_entry if not present.
    if 'original_entry' not in data:
        data['original_entry'] = {}

    if isinstance(data['original_entry'], dict):
        data['original_entry']['webadres_organisatie'] = url

    # Record provenance. Any previous url_enrichment block is replaced
    # wholesale (the original's "if not in data" initializer was dead code —
    # it was immediately overwritten by this assignment).
    data['url_enrichment'] = {
        'discovered_url': url,
        'search_query': search_query,
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'exa_web_search',
        'status': 'SUCCESS',
    }

    # Save back in place.
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True
def main():
    """Print the Exa search query for every entry that still needs a URL."""
    entries = load_entries_needing_urls()
    print(f"Found {len(entries)} entries needing URLs")

    # Print search queries for all entries
    print("\n" + "=" * 80)
    print("SEARCH QUERIES FOR EXA")
    print("=" * 80 + "\n")

    for i, entry in enumerate(entries):
        name = entry.get('name', 'Unknown')
        wikidata_id = entry.get('wikidata_id')
        # Use the shared query builder so the printed query matches what
        # search_url_with_exa() would send. The original duplicated the
        # query string here and ignored the wikidata_id variant.
        query = search_url_with_exa(name, wikidata_id)['query']

        print(f"{i+1}. {entry['file']}")
        print(f"   Name: {name}")
        if wikidata_id:
            print(f"   Wikidata: {wikidata_id}")
        print(f"   Search: {query}")
        print()
# Script entry point.
if __name__ == "__main__":
    main()