glam/tests/dspy_gitops/metrics/answer_relevance.py
2026-01-11 18:08:40 +01:00

198 lines
5.6 KiB
Python

"""
Answer Relevance Metrics
Evaluates answer quality using both heuristic and LLM-based methods.
"""
from typing import Any, Optional
import re
def answer_has_content(answer: str) -> bool:
    """Return True when *answer* contains meaningful content.

    An answer is considered empty when it is falsy, shorter than 10
    characters after stripping, or matches a known "no answer"
    placeholder phrase (English or Dutch).

    Args:
        answer: Answer string to inspect.

    Returns:
        True if the answer has real content, False otherwise.
    """
    if not answer:
        return False

    if len(answer.strip()) < 10:
        return False

    # Known non-answer phrases (English and Dutch); matching any of
    # them means the model effectively declined to answer.
    placeholder_patterns = (
        r"I don't know",
        r"I cannot",
        r"No information",
        r"Geen informatie",
        r"Ik weet het niet",
    )
    return not any(
        re.search(pattern, answer, re.IGNORECASE)
        for pattern in placeholder_patterns
    )
def answer_mentions_entities(answer: str, entities: list[str]) -> float:
    """Compute the fraction of expected entities present in the answer.

    Matching is case-insensitive substring containment.

    Args:
        answer: Answer string to scan.
        entities: Entity names expected to appear in the answer.

    Returns:
        Ratio of entities found, between 0.0 and 1.0.
    """
    if not entities:
        # Nothing was expected, so the answer trivially satisfies it.
        return 1.0

    haystack = answer.lower()
    hits = [entity for entity in entities if entity.lower() in haystack]
    return len(hits) / len(entities)
def answer_relevance_metric(example: Any, pred: Any, trace: Any = None) -> float:
    """DSPy-compatible answer relevance metric.

    Combines heuristics into a single 0.0-1.0 score: 0.5 base credit
    for non-placeholder content, up to 0.3 proportional to expected
    entities mentioned, 0.1 for having citations, and 0.1 for a
    confidence value in a plausible range.

    Args:
        example: DSPy Example; may carry an ``expected_entities`` list.
        pred: Prediction with an ``answer`` field (and optionally
            ``citations`` and ``confidence``).
        trace: Optional DSPy trace (unused).

    Returns:
        Relevance score between 0.0 and 1.0.
    """
    answer = getattr(pred, "answer", None)
    if not answer or not answer_has_content(answer):
        return 0.0

    # Base credit for producing real (non-placeholder) content.
    score = 0.5

    # Up to +0.3, proportional to the expected entities mentioned.
    score += 0.3 * answer_mentions_entities(
        answer, getattr(example, "expected_entities", [])
    )

    # +0.1 when the prediction carries at least one citation.
    if getattr(pred, "citations", []):
        score += 0.1

    # +0.1 when confidence is neither too low nor suspiciously high.
    if 0.5 <= getattr(pred, "confidence", 0.0) <= 0.95:
        score += 0.1

    return min(score, 1.0)
async def llm_as_judge_metric(
    question: str,
    answer: str,
    gold_answer: Optional[str] = None,
    context: str = "",
) -> dict[str, Any]:
    """Use an LLM to evaluate answer quality along four dimensions.

    Args:
        question: Original question.
        answer: Generated answer to evaluate.
        gold_answer: Optional gold standard answer.
        context: Retrieved context shown to the judge.

    Returns:
        Dict with per-dimension float scores ("relevance", "accuracy",
        "completeness", "clarity"), a weighted "overall" score, and the
        judge's "reasoning" string. On failure, a dict with a fallback
        "overall" score and an "error" message.  (Return type is
        ``dict[str, Any]`` because "reasoning"/"error" hold strings.)
    """
    try:
        import dspy
    except ImportError:
        # DSPy is optional; return a neutral score instead of crashing.
        return {"overall": 0.5, "error": "DSPy not available"}

    class AnswerJudge(dspy.Signature):
        """Judge the quality of an answer to a heritage question.

        Evaluate on:
        - Relevance: Does it answer the question?
        - Accuracy: Is the information correct?
        - Completeness: Does it cover all aspects?
        - Clarity: Is it well-written?
        """

        question: str = dspy.InputField(desc="Original question")
        answer: str = dspy.InputField(desc="Answer to evaluate")
        gold_answer: str = dspy.InputField(desc="Gold standard answer (if available)", default="")
        context: str = dspy.InputField(desc="Retrieved context", default="")
        relevance: float = dspy.OutputField(desc="Relevance score 0-1")
        accuracy: float = dspy.OutputField(desc="Accuracy score 0-1")
        completeness: float = dspy.OutputField(desc="Completeness score 0-1")
        clarity: float = dspy.OutputField(desc="Clarity score 0-1")
        reasoning: str = dspy.OutputField(desc="Explanation of scores")

    try:
        judge = dspy.Predict(AnswerJudge)
        result = judge(
            question=question,
            answer=answer,
            gold_answer=gold_answer or "",
            context=context,
        )
        # Convert each score exactly once; LLM outputs may arrive as
        # strings, and float() may itself raise (caught below).
        relevance = float(result.relevance)
        accuracy = float(result.accuracy)
        completeness = float(result.completeness)
        clarity = float(result.clarity)
        return {
            "relevance": relevance,
            "accuracy": accuracy,
            "completeness": completeness,
            "clarity": clarity,
            # Weighted average; accuracy is weighted highest.
            "overall": (
                relevance * 0.3
                + accuracy * 0.4
                + completeness * 0.2
                + clarity * 0.1
            ),
            "reasoning": result.reasoning,
        }
    except Exception as e:
        # Judge calls can fail for many reasons (network, parsing);
        # surface the error rather than raising inside an eval loop.
        return {
            "overall": 0.0,
            "error": str(e),
        }
def language_match_score(expected_lang: str, answer: str) -> float:
    """Check whether the answer appears to be in the expected language.

    Uses function-word counts as a lightweight Dutch-vs-English
    detector; a tie between the two counts defaults to English.

    Args:
        expected_lang: Expected language code ("nl" or "en").
        answer: Answer text to inspect.

    Returns:
        1.0 if the detected language matches ``expected_lang``,
        0.0 otherwise (including when the answer has no words at all).
    """
    # Sets give O(1) membership tests for the indicator lookups.
    dutch_indicators = {"het", "de", "een", "zijn", "er", "naar", "voor", "bij"}
    english_indicators = {"the", "a", "an", "is", "are", "for", "with", "to"}

    # Tokenize on word characters so trailing punctuation ("het.",
    # "the,") does not hide an indicator word; str.split() would miss
    # those tokens entirely.
    words = re.findall(r"\w+", answer.lower())
    if not words:
        # No tokens at all: the answer cannot match any language.
        return 0.0

    dutch_count = sum(1 for w in words if w in dutch_indicators)
    english_count = sum(1 for w in words if w in english_indicators)
    detected = "nl" if dutch_count > english_count else "en"
    return 1.0 if detected == expected_lang else 0.0