glam/scripts/enrich_missing_websites.py
2025-11-30 23:30:29 +01:00

128 lines
3.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich skipped entries with websites found via web search.
Uses Exa API to find official websites for institutions.
"""
import os
import sys
import yaml
import time
import httpx
from pathlib import Path
from datetime import datetime, timezone
ENTRIES_DIR = Path('data/nde/enriched/entries')
STATE_FILE = ENTRIES_DIR / 'web' / '_archive_state.yaml'
# Exa API setup
EXA_API_KEY = os.environ.get('EXA_API_KEY')
EXA_ENDPOINT = "https://api.exa.ai/search"
def search_website(org_name: str) -> str | None:
    """Return the first website URL Exa's search finds for *org_name*.

    Sends a POST to the Exa ``/search`` endpoint asking for up to three
    results, and returns the URL of the top hit.

    Args:
        org_name: Organisation name to search for; the query is biased
            toward official Dutch sites ("official website Netherlands").

    Returns:
        The first result URL, or ``None`` when the API key is missing,
        the request fails, the response is not HTTP 200, or no results
        came back.
    """
    if not EXA_API_KEY:
        print("Error: EXA_API_KEY not set")
        return None

    query = f"{org_name} official website Netherlands"
    headers = {
        "Authorization": f"Bearer {EXA_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "query": query,
        "numResults": 3,
        "type": "auto",
    }
    try:
        response = httpx.post(EXA_ENDPOINT, json=payload, headers=headers, timeout=30)
        if response.status_code == 200:
            results = response.json().get('results', [])
            if results:
                # Trust Exa's ranking: take the top hit.
                return results[0].get('url')
        else:
            # Previously non-200 responses were silently dropped; log them
            # so rate limits / auth failures are visible.
            print(f"  Error searching: HTTP {response.status_code}")
    except (httpx.HTTPError, ValueError) as e:
        # httpx.HTTPError covers transport/timeout errors; ValueError
        # covers a non-JSON response body.
        print(f"  Error searching: {e}")
    return None
def main():
    """Enrich up to 20 skipped entries with websites found via Exa search.

    Reads the archive state file, searches for an official website for each
    skipped entry (unless its remarks say it has none or is defunct), writes
    any found URL back into the entry's YAML under ``web_enrichment``,
    removes enriched entries from the skipped list, and saves the state.
    """
    # Load state; an empty file makes safe_load return None, so default to {}.
    with open(STATE_FILE, encoding='utf-8') as f:
        state = yaml.safe_load(f) or {}
    skipped = state.get('skipped_entries', [])
    print(f"Found {len(skipped)} skipped entries")

    # Track results
    found_websites = []
    for entry_name in skipped[:20]:  # Process first 20
        filepath = ENTRIES_DIR / entry_name
        if not filepath.exists():
            continue
        with open(filepath, encoding='utf-8') as f:
            data = yaml.safe_load(f)
        oe = data.get('original_entry', {})
        org_name = oe.get('organisatie', '')
        remarks = oe.get('opmerkingen_inez', '') or ''

        # Skip if explicitly noted as no website or defunct
        # ("geen website" = "no website", "bestaat niet meer" = "no longer exists").
        if 'geen website' in remarks.lower():
            print(f"SKIP {entry_name}: No website noted")
            continue
        if 'bestaat niet meer' in remarks.lower():
            print(f"SKIP {entry_name}: Institution defunct")
            continue

        print(f"\nSearching: {org_name}")
        website = search_website(org_name)
        if website:
            print(f"  FOUND: {website}")
            found_websites.append({
                'entry': entry_name,
                'name': org_name,
                'website': website,
            })
            # Update entry file with provenance of the found URL.
            if 'web_enrichment' not in data:
                data['web_enrichment'] = {}
            data['web_enrichment']['source_url'] = website
            data['web_enrichment']['source_method'] = 'exa_web_search'
            data['web_enrichment']['search_timestamp'] = datetime.now(timezone.utc).isoformat()
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            # Remove from skipped list (safe: we iterate over a slice copy).
            if entry_name in state['skipped_entries']:
                state['skipped_entries'].remove(entry_name)
        else:
            print("  NOT FOUND")
        time.sleep(1)  # Rate limit

    # Save updated state
    with open(STATE_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(state, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\n=== SUMMARY ===")
    print(f"Found {len(found_websites)} websites")
    for fw in found_websites:
        print(f"  {fw['name']}: {fw['website']}")


if __name__ == '__main__':
    main()