Some checks failed
DSPy RAG Evaluation / Layer 1 - Unit Tests (push) Failing after 5m9s
DSPy RAG Evaluation / Layer 3 - Integration Tests (push) Has been skipped
DSPy RAG Evaluation / Layer 2 - DSPy Module Tests (push) Has been skipped
DSPy RAG Evaluation / Layer 4 - Comprehensive Evaluation (push) Has been skipped
DSPy RAG Evaluation / Quality Gate (push) Failing after 2s
200 lines
5.5 KiB
Python
200 lines
5.5 KiB
Python
"""
|
|
Pytest fixtures for DSPy GitOps testing.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
# dspy is an optional dependency. When it cannot be imported, a MagicMock is
# bound to the name so that this module still loads, and DSPY_AVAILABLE tells
# the rest of the file which of the two it is dealing with.
try:
    import dspy
except ImportError:
    dspy = MagicMock()
    DSPY_AVAILABLE = False
else:
    DSPY_AVAILABLE = True
|
|
|
|
|
|
# Dataset files are looked up in the "datasets" directory next to this file.
DATASETS_DIR = Path(__file__).parent.joinpath("datasets")

# Oxigraph endpoint; the OXIGRAPH_ENDPOINT environment variable overrides the
# built-in default address.
OXIGRAPH_URL = os.getenv("OXIGRAPH_ENDPOINT", "http://91.98.224.44:7878")
|
|
|
|
|
|
# =============================================================================
|
|
# Skip markers
|
|
# =============================================================================
|
|
|
|
# Marker: skip the decorated test when the real dspy package is not importable.
requires_dspy = pytest.mark.skipif(not DSPY_AVAILABLE, reason="DSPy not installed")
|
|
|
|
# Marker: skip the decorated test unless at least one of the two accepted
# API-key environment variables holds a non-empty value.
requires_llm = pytest.mark.skipif(
    not any(
        os.environ.get(var_name)
        for var_name in ("ANTHROPIC_API_KEY", "CLAUDE_API_KEY")
    ),
    reason="ANTHROPIC_API_KEY or CLAUDE_API_KEY not set",
)
|
|
|
|
|
|
# =============================================================================
|
|
# Dataset loading
|
|
# =============================================================================
|
|
|
|
def load_examples_from_json(filename: str) -> list[dict[str, Any]]:
    """Load the ``examples`` list from a JSON dataset file.

    Args:
        filename: File name, resolved relative to ``DATASETS_DIR``.

    Returns:
        The value stored under the top-level ``"examples"`` key, or an empty
        list when the file does not exist or the key is absent.
    """
    filepath = DATASETS_DIR / filename

    try:
        # JSON is UTF-8 by specification. Decoding with the platform's locale
        # encoding would corrupt or reject non-ASCII text on some systems.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing dataset is not an error: callers get an empty example set.
        return []

    return data.get("examples", [])
|
|
|
|
|
|
def dict_to_dspy_example(ex: dict[str, Any]) -> Any:
    """Turn a raw example dict into a ``dspy.Example``.

    ``question``, ``language`` and ``expected_intent`` are read with
    subscripting and must be present. The remaining fields are optional and
    fall back to defaults. The resulting example is passed through
    ``with_inputs("question", "language")``.

    When DSPy is not available the dict is handed back unchanged.
    """
    if DSPY_AVAILABLE:
        fields = {
            "question": ex["question"],
            "language": ex["language"],
            "expected_intent": ex["expected_intent"],
            "expected_entities": ex.get("expected_entities", []),
            "expected_entity_type": ex.get("expected_entity_type", "institution"),
            "expected_sources": ex.get("expected_sources", []),
            "gold_answer": ex.get("gold_answer"),
        }
        return dspy.Example(**fields).with_inputs("question", "language")

    return ex
|
|
|
|
|
|
@pytest.fixture
def dev_set() -> list[Any]:
    """Provide the development examples from ``heritage_rag_dev.json``.

    The examples are converted with ``dict_to_dspy_example`` when DSPy is
    available; otherwise the raw dicts are returned as loaded.
    """
    raw_examples = load_examples_from_json("heritage_rag_dev.json")
    if not DSPY_AVAILABLE:
        return raw_examples
    return list(map(dict_to_dspy_example, raw_examples))
|
|
|
|
|
|
@pytest.fixture
def test_set() -> list[Any]:
    """Provide the final-evaluation examples from ``heritage_rag_test.json``.

    The examples are converted with ``dict_to_dspy_example`` when DSPy is
    available; otherwise the raw dicts are returned as loaded.
    """
    raw_examples = load_examples_from_json("heritage_rag_test.json")
    if not DSPY_AVAILABLE:
        return raw_examples
    return list(map(dict_to_dspy_example, raw_examples))
|
|
|
|
|
|
@pytest.fixture
def golden_tests() -> list[dict]:
    """Load the golden test cases from ``golden_queries.yaml``.

    Returns:
        The list stored under the top-level ``golden_tests`` key, or an empty
        list when the file is missing, empty, or lacks that key.
    """
    # Imported here rather than at module level, so importing this module
    # does not itself require PyYAML.
    import yaml

    filepath = DATASETS_DIR / "golden_queries.yaml"
    if not filepath.exists():
        return []

    # Read as UTF-8 explicitly instead of relying on the platform locale.
    with open(filepath, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document; calling .get on it would
    # raise AttributeError, so treat an empty file as "no golden tests".
    if not data:
        return []

    return data.get("golden_tests", [])
|
|
|
|
|
|
# =============================================================================
|
|
# API fixtures
|
|
# =============================================================================
|
|
|
|
@pytest.fixture
def oxigraph_url() -> str:
    """Expose the module-level ``OXIGRAPH_URL`` value to tests."""
    endpoint = OXIGRAPH_URL
    return endpoint
|
|
|
|
|
|
@pytest.fixture
def api_client():
    """Build an ``httpx.AsyncClient`` pointed at ``http://localhost:8000``.

    The client uses a 30 second timeout.
    """
    # NOTE(review): the client is returned without being closed here;
    # presumably the consuming test closes it - confirm.
    from httpx import AsyncClient

    client = AsyncClient(base_url="http://localhost:8000", timeout=30.0)
    return client
|
|
|
|
|
|
# =============================================================================
|
|
# DSPy fixtures
|
|
# =============================================================================
|
|
|
|
@pytest.fixture
def dspy_lm():
    """Create a Claude-backed ``dspy.LM`` and install it via ``dspy.configure``.

    The test is skipped when DSPy is not installed, or when neither
    ``ANTHROPIC_API_KEY`` nor ``CLAUDE_API_KEY`` holds a value.

    Returns:
        The configured language-model object.
    """
    if not DSPY_AVAILABLE:
        pytest.skip("DSPy not installed")

    # ANTHROPIC_API_KEY takes precedence; CLAUDE_API_KEY is the fallback name.
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        api_key = os.environ.get("CLAUDE_API_KEY")
    if not api_key:
        pytest.skip("ANTHROPIC_API_KEY or CLAUDE_API_KEY not set")

    language_model = dspy.LM(
        model="anthropic/claude-sonnet-4-20250514",
        api_key=api_key,
    )
    dspy.configure(lm=language_model)
    return language_model
|
|
|
|
|
|
@pytest.fixture
def heritage_pipeline(dspy_lm):
    """Build the Heritage RAG pipeline with ``use_tools=False``.

    ``dspy_lm`` is not used in the body; requesting it makes pytest run that
    fixture first. The test is skipped when importing or constructing the
    pipeline raises ``ImportError``.
    """
    try:
        from backend.rag.dspy_heritage_rag import (
            create_heritage_rag_pipeline as build_pipeline,
        )

        # Kept inside the try on purpose: an ImportError raised while the
        # pipeline is being built also results in a skip.
        pipeline = build_pipeline(use_tools=False)
    except ImportError:
        pytest.skip("Heritage RAG pipeline not available")
    return pipeline
|
|
|
|
|
|
@pytest.fixture
def query_router(dspy_lm):
    """Instantiate ``HeritageQueryRouter``.

    ``dspy_lm`` is not used in the body; requesting it makes pytest run that
    fixture first. The test is skipped when importing or constructing the
    router raises ``ImportError``.
    """
    try:
        from backend.rag.dspy_heritage_rag import HeritageQueryRouter as RouterClass

        # Kept inside the try on purpose: an ImportError raised during
        # construction also results in a skip.
        router = RouterClass()
    except ImportError:
        pytest.skip("Query router not available")
    return router
|
|
|
|
|
|
# =============================================================================
|
|
# Sample test data
|
|
# =============================================================================
|
|
|
|
# Three hand-written queries (two with language "nl", one with "en") used by
# the sample fixtures defined in this file.
SAMPLE_QUERIES = [
    dict(
        question="Hoeveel musea zijn er in Amsterdam?",
        language="nl",
        expected_intent="statistical",
        expected_entities=["amsterdam", "musea"],
    ),
    dict(
        question="Waar is het Rijksmuseum gevestigd?",
        language="nl",
        expected_intent="entity_lookup",
        expected_entities=["rijksmuseum"],
    ),
    dict(
        question="How many libraries are there in the Netherlands?",
        language="en",
        expected_intent="statistical",
        expected_entities=["libraries", "netherlands"],
    ),
]
|
|
|
|
|
|
@pytest.fixture
def sample_queries() -> list[dict]:
    """Return the sample test queries.

    Returns:
        A deep copy of ``SAMPLE_QUERIES``. Each test gets its own list and
        dicts, so a test that mutates them cannot change the module-level
        data seen by later tests.
    """
    import copy

    return copy.deepcopy(SAMPLE_QUERIES)
|
|
|
|
|
|
@pytest.fixture
def sample_dspy_examples() -> list[Any]:
    """Return the sample queries as DSPy examples.

    Returns:
        The sample queries converted with ``dict_to_dspy_example`` when DSPy
        is available, otherwise plain dicts. Both are built from a deep copy
        of ``SAMPLE_QUERIES``, so nothing a test mutates (including the nested
        ``expected_entities`` lists) is shared with the module-level data.
    """
    import copy

    queries = copy.deepcopy(SAMPLE_QUERIES)
    if DSPY_AVAILABLE:
        return [dict_to_dspy_example(q) for q in queries]
    return queries
|
|
# Trigger test
|