# Test layers:
#   - Layer 1: 35 unit tests (no LLM required)
#   - Layer 2: 56 DSPy module tests with LLM
#   - Layer 3: 10 integration tests with Oxigraph
#   - Layer 4: Comprehensive evaluation suite
#
# Fixed:
#   - Coordinate queries to use schema:location -> blank node pattern
#   - Golden query expected intent for location questions
#   - Health check test filtering in Layer 4
#
# Added GitHub Actions workflow for CI/CD evaluation.
# Original file listing: 52 lines, 1.4 KiB, YAML
# Golden Test Cases for Heritage RAG
# These tests MUST pass for any release
#
# Layout: `golden_tests` is a list of test entries. Question entries carry
# `question`, `language`, `expected_intent`, `expected_entity_type`,
# `max_latency_ms`, and `priority`, plus optional answer/source expectations.
# The final entry is a health check (`type: health_check`) that targets an
# API endpoint instead of a question.

golden_tests:
  - id: "golden_amsterdam_museums"
    question: "Hoeveel musea zijn er in Amsterdam?"
    language: nl
    expected_intent: statistical
    expected_entity_type: institution
    # NOTE(review): this entry uses `min_answer_contains`, while
    # "golden_rijksmuseum_location" uses `expected_answer_contains`. Confirm
    # that the test runner reads both keys; otherwise this check may be
    # silently ignored.
    min_answer_contains:
      - "musea"
      - "Amsterdam"
    max_latency_ms: 10000
    priority: critical

  - id: "golden_rijksmuseum_location"
    question: "Waar is het Rijksmuseum gevestigd?"
    language: nl
    # Note: geographic and entity_lookup are both valid for location questions
    expected_intent: geographic
    expected_entity_type: institution
    expected_answer_contains:
      - "Amsterdam"
    max_latency_ms: 10000
    priority: critical

  - id: "golden_nl_libraries_count"
    question: "How many libraries are there in the Netherlands?"
    language: en
    expected_intent: statistical
    expected_entity_type: institution
    max_latency_ms: 10000
    priority: high

  - id: "golden_nationaal_archief_staff"
    question: "Wie werkt bij het Nationaal Archief?"
    language: nl
    expected_intent: entity_lookup
    expected_entity_type: person
    expected_sources:
      - oxigraph
    max_latency_ms: 15000
    priority: high

  # Health check entry: no question; lists an endpoint, the expected status
  # code, and the fields expected in the response.
  - id: "golden_api_health"
    type: health_check
    endpoint: "/api/dspy/rag/health"
    expected_status: 200
    expected_fields:
      - status
      - components
    priority: critical