128 lines
3.7 KiB
Python
Executable file
128 lines
3.7 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich skipped entries with websites found via web search.
|
|
Uses Exa API to find official websites for institutions.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import time
|
|
import httpx
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Directory containing one YAML file per enriched entry.
ENTRIES_DIR = Path('data/nde/enriched/entries')
# Archive-state file; its 'skipped_entries' list names entries that
# still lack a website.
STATE_FILE = ENTRIES_DIR / 'web' / '_archive_state.yaml'

# Exa API setup
# Key comes from the environment; search_website() bails out politely
# when it is unset, so the script can be imported without credentials.
EXA_API_KEY = os.environ.get('EXA_API_KEY')
EXA_ENDPOINT = "https://api.exa.ai/search"
|
|
|
|
def search_website(org_name: str) -> str | None:
    """Search for an institution's official website using the Exa API.

    Args:
        org_name: Name of the organisation to look up. The query is
            biased toward Dutch institutions ("official website
            Netherlands").

    Returns:
        URL of the top-ranked search result, or None when the API key
        is missing, the request fails, the API returns a non-200
        status, or no results come back.
    """
    if not EXA_API_KEY:
        print("Error: EXA_API_KEY not set")
        return None

    query = f"{org_name} official website Netherlands"

    headers = {
        "Authorization": f"Bearer {EXA_API_KEY}",
        "Content-Type": "application/json",
    }

    payload = {
        "query": query,
        "numResults": 3,
        "type": "auto",
    }

    try:
        response = httpx.post(EXA_ENDPOINT, json=payload, headers=headers, timeout=30)
        if response.status_code == 200:
            data = response.json()
            results = data.get('results', [])
            if results:
                # Exa orders results by relevance; take the top hit.
                return results[0].get('url')
        else:
            # Surface API failures instead of silently returning None.
            print(f" Error: Exa API returned HTTP {response.status_code}")
    except Exception as e:  # network errors, timeouts, malformed JSON
        print(f" Error searching: {e}")

    return None
|
|
|
|
|
|
def main(limit: int = 20) -> None:
    """Find websites for skipped entries and write the results to disk.

    Loads the archive state, runs an Exa search for up to *limit*
    skipped entries (skipping any whose Dutch remarks mark them as
    website-less or defunct), writes each found URL into the entry's
    YAML file under ``web_enrichment``, removes the entry from the
    skipped list, and saves the updated state.

    Args:
        limit: Maximum number of skipped entries to process per run.
    """
    # Load archive state (holds the list of entries without websites).
    with open(STATE_FILE) as f:
        state = yaml.safe_load(f)

    skipped = state.get('skipped_entries', [])
    print(f"Found {len(skipped)} skipped entries")

    # Collects {entry, name, website} dicts for the final summary.
    found_websites = []

    # Iterate a slice copy so removing items from
    # state['skipped_entries'] below cannot disturb iteration.
    for entry_name in skipped[:limit]:
        filepath = ENTRIES_DIR / entry_name
        if not filepath.exists():
            continue

        with open(filepath) as f:
            data = yaml.safe_load(f)

        oe = data.get('original_entry', {})
        org_name = oe.get('organisatie', '')
        # Lowercase once; both skip checks below use the same text.
        remarks = (oe.get('opmerkingen_inez', '') or '').lower()

        # Honour explicit curator remarks (Dutch): "geen website" =
        # no website exists; "bestaat niet meer" = institution defunct.
        if 'geen website' in remarks:
            print(f"SKIP {entry_name}: No website noted")
            continue
        if 'bestaat niet meer' in remarks:
            print(f"SKIP {entry_name}: Institution defunct")
            continue

        print(f"\nSearching: {org_name}")

        website = search_website(org_name)

        if website:
            print(f" FOUND: {website}")
            found_websites.append({
                'entry': entry_name,
                'name': org_name,
                'website': website,
            })

            # Record the URL plus provenance metadata on the entry.
            enrichment = data.setdefault('web_enrichment', {})
            enrichment['source_url'] = website
            enrichment['source_method'] = 'exa_web_search'
            enrichment['search_timestamp'] = datetime.now(timezone.utc).isoformat()

            with open(filepath, 'w') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

            # Entry now has a website, so it is no longer "skipped".
            if entry_name in state['skipped_entries']:
                state['skipped_entries'].remove(entry_name)
        else:
            print(" NOT FOUND")

        time.sleep(1)  # Rate limit: be polite to the Exa API.

    # Persist the (possibly shrunken) skipped list.
    with open(STATE_FILE, 'w') as f:
        yaml.dump(state, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\n=== SUMMARY ===")
    print(f"Found {len(found_websites)} websites")
    for fw in found_websites:
        print(f" {fw['name']}: {fw['website']}")
|
|
|
|
|
|
# Script entry point: only run the enrichment when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|