glam/scripts/populate_cache.py
2025-12-21 00:01:54 +01:00

275 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""
Populate the Qdrant-backed semantic cache with common Dutch archive queries.
This script:
1. Makes real RAG streaming requests to /api/rag/dspy/query/stream
2. Stores the responses in the Qdrant cache via /api/cache/store
Usage:
python scripts/populate_cache.py [--base-url URL] [--dry-run]
Examples:
# Populate cache on production
python scripts/populate_cache.py --base-url https://archief.support
# Test locally
python scripts/populate_cache.py --base-url http://localhost:8010 --dry-run
"""
import argparse
import json
import sys
import time
from typing import Optional
import httpx
# Common Dutch archive queries to pre-cache.
# Grouped by theme; each entry is sent verbatim to the RAG endpoint and the
# resulting answer is stored in the semantic cache, so phrasing here should
# match what real users are likely to type.
COMMON_QUERIES: list[str] = [
    # Core archive questions
    "Wat is het Nationaal Archief?",
    "Hoe kan ik mijn stamboom onderzoeken?",
    "Waar vind ik geboorteaktes?",
    "Welke archieven zijn er in Nederland?",
    "Hoe kan ik een overlijdensakte opvragen?",
    "Wat is het VOC archief?",
    "Waar vind ik oude kadasterkaarten?",
    "Hoe zoek ik naar voorouders?",
    "Wat zijn Bevolkingsregisters?",
    "Hoe vind ik militaire stamboeken?",
    # Regional archives
    "Welke archieven zijn er in Amsterdam?",
    "Waar vind ik het stadsarchief van Rotterdam?",
    "Welke archieven zijn er in Den Haag?",
    "Waar is het Utrechts Archief?",
    "Welke archieven zijn er in Groningen?",
    # Specific document types
    "Waar vind ik trouwaktes?",
    "Hoe vind ik scheepvaartregisters?",
    "Waar zijn notariële aktes te vinden?",
    "Hoe zoek ik in de Burgerlijke Stand?",
    "Waar vind ik emigratieregisters?",
    # Research topics
    "Hoe onderzoek ik mijn familiegeschiedenis?",
    "Waar vind ik informatie over de Tweede Wereldoorlog?",
    "Hoe vind ik informatie over de slavernijgeschiedenis?",
    "Waar zijn koloniaal archieven te vinden?",
    # Museum/Library questions
    "Hoeveel musea zijn er in Nederland?",
    "Welke bibliotheken hebben oude manuscripten?",
    "Wat is het grootste museum van Nederland?",
    "Waar vind ik de Koninklijke Bibliotheek?",
]
def make_rag_request(base_url: str, query: str, timeout: float = 120.0) -> Optional[dict]:
    """
    Run one streaming RAG query and return the terminal payload.

    Consumes newline-delimited JSON events from /api/rag/dspy/query/stream,
    echoing "status" events to stdout as they arrive.

    Args:
        base_url: Base URL (e.g., https://archief.support)
        query: The query string
        timeout: Request timeout in seconds

    Returns:
        The ``data`` dict carried by the final "complete" event, or None on
        timeout, HTTP error, or an in-stream "error" event.
    """
    endpoint = f"{base_url}/api/rag/dspy/query/stream"
    body = {
        "question": query,
        "language": "nl",
        "include_visualization": False,
        "use_agent": False,
        "llm_provider": "zai",  # Free provider
        "llm_model": "glm-4.6",
    }
    result: Optional[dict] = None
    try:
        with httpx.stream("POST", endpoint, json=body, timeout=timeout) as resp:
            resp.raise_for_status()
            for raw in resp.iter_lines():
                if not raw.strip():
                    continue
                try:
                    evt = json.loads(raw)
                except json.JSONDecodeError:
                    # Tolerate malformed fragments in the stream.
                    continue
                kind = evt.get("type")
                if kind == "status":
                    stage = evt.get("stage", "")
                    message = evt.get("message", "")
                    print(f" [{stage}] {message}")
                elif kind == "error":
                    print(f" ERROR: {evt.get('error')}")
                    return None
                elif kind == "complete":
                    result = evt.get("data", {})
    except httpx.TimeoutException:
        print(f" TIMEOUT after {timeout}s")
        return None
    except httpx.HTTPStatusError as exc:
        print(f" HTTP ERROR: {exc.response.status_code}")
        return None
    except Exception as exc:  # best-effort CLI: report and move on
        print(f" ERROR: {exc}")
        return None
    return result
def store_in_cache(base_url: str, query: str, response_data: dict, timeout: float = 30.0) -> bool:
    """
    Persist one query/response pair in the Qdrant semantic cache.

    Args:
        base_url: Base URL (e.g., https://archief.support)
        query: The original query
        response_data: The RAG response data
        timeout: Request timeout

    Returns:
        True if stored successfully
    """
    def _institution(hit: dict) -> dict:
        # Collapse one retrieved result into the cache's institution shape.
        meta = hit.get("metadata", {})
        return {
            "name": hit.get("name", ""),
            "type": meta.get("institution_type"),
            "city": meta.get("city"),
            "country": meta.get("country"),
            "description": meta.get("description"),
            "website": hit.get("website"),
        }

    payload = {
        "query": query,
        "response": {
            "answer": response_data.get("answer", ""),
            "sources": [
                {"database": src, "name": src}
                for src in response_data.get("sources_used", [])
            ],
            "institutions": [
                _institution(hit)
                for hit in response_data.get("retrieved_results", [])
            ],
        },
        "language": "nl",
        "model": "glm-4.6",
    }
    try:
        reply = httpx.post(f"{base_url}/api/cache/store", json=payload, timeout=timeout)
        reply.raise_for_status()
    except Exception as exc:  # best-effort CLI: report and move on
        print(f" CACHE STORE ERROR: {exc}")
        return False
    return True
def get_cache_stats(base_url: str) -> dict:
    """Fetch /api/cache/stats; return {} on any failure (best effort)."""
    endpoint = f"{base_url}/api/cache/stats"
    try:
        reply = httpx.get(endpoint, timeout=10.0)
        reply.raise_for_status()
        return reply.json()
    except Exception:
        # Stats are informational only — never let them break the run.
        return {}
def main():
    """CLI entry point: warm the semantic cache with COMMON_QUERIES.

    For each query, makes a live RAG request and stores the answer via the
    cache API. Returns 0 when every query succeeded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Populate semantic cache with common queries")
    parser.add_argument("--base-url", default="https://archief.support", help="Base URL for API")
    parser.add_argument("--dry-run", action="store_true", help="Only show what would be done")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of queries to process")
    parser.add_argument("--delay", type=float, default=2.0, help="Delay between queries (seconds)")
    args = parser.parse_args()

    print("=== Semantic Cache Population Script ===")
    print(f"Base URL: {args.base_url}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Show the starting cache size so growth is visible in the summary.
    before = get_cache_stats(args.base_url)
    if before:
        print(f"Current cache: {before.get('total_entries', 0)} entries")
        print()

    # Truthy check on purpose: --limit 0 means "no limit", same as omitting it.
    selected = list(COMMON_QUERIES)
    if args.limit:
        selected = selected[:args.limit]
    total = len(selected)

    if args.dry_run:
        print(f"Would process {total} queries:")
        for idx, text in enumerate(selected, 1):
            print(f" {idx}. {text}")
        return 0

    succeeded = 0
    failed = 0
    for idx, question in enumerate(selected, 1):
        print(f"[{idx}/{total}] {question}")
        print(" Making RAG request...")
        started = time.time()
        data = make_rag_request(args.base_url, question)
        elapsed = time.time() - started
        if data:
            preview = data.get("answer", "")[:100]
            print(f" Got response ({elapsed:.1f}s): {preview}...")
            print(" Storing in cache...")
            if store_in_cache(args.base_url, question, data):
                print(" ✓ Cached successfully")
                succeeded += 1
            else:
                failed += 1
        else:
            print(" ✗ No response")
            failed += 1
        # Pause between requests to avoid rate limiting (skip after the last).
        if idx < total and args.delay > 0:
            print(f" Waiting {args.delay}s...")
            time.sleep(args.delay)
        print()

    print("=== Summary ===")
    print(f"Processed: {total} queries")
    print(f"Success: {succeeded}")
    print(f"Failed: {failed}")

    after = get_cache_stats(args.base_url)
    if after:
        print(f"Cache now has: {after.get('total_entries', 0)} entries")
    return 1 if failed else 0
if __name__ == "__main__":
    # Propagate main()'s status code (0 = all cached, 1 = failures) to the shell.
    sys.exit(main())