"""Answer Relevance Metrics.

Evaluates answer quality using both heuristic and LLM-based methods.
"""

import re
from typing import Any, Optional

def answer_has_content(answer: str) -> bool:
    """Check whether an answer contains meaningful content.

    An answer fails the check when it is empty, shorter than 10
    characters after stripping, or matches a known refusal phrase.

    Args:
        answer: Answer string to inspect.

    Returns:
        True if the answer looks substantive, False otherwise.
    """
    if not answer:
        return False

    # Too short to carry real information.
    if len(answer.strip()) < 10:
        return False

    # Refusal / placeholder phrases (English and Dutch).
    refusal_phrases = (
        r"I don't know",
        r"I cannot",
        r"No information",
        r"Geen informatie",
        r"Ik weet het niet",
    )
    return not any(
        re.search(phrase, answer, re.IGNORECASE) for phrase in refusal_phrases
    )
def answer_mentions_entities(answer: str, entities: list[str]) -> float:
    """Compute the fraction of expected entities present in the answer.

    Matching is case-insensitive substring containment.

    Args:
        answer: Answer string to scan.
        entities: Entity names that should appear in the answer.

    Returns:
        Ratio of mentioned entities in [0.0, 1.0]; 1.0 when no
        entities are expected.
    """
    if not entities:
        return 1.0  # Nothing expected, trivially satisfied.

    haystack = answer.lower()
    hits = [entity for entity in entities if entity.lower() in haystack]
    return len(hits) / len(entities)
def answer_relevance_metric(example: Any, pred: Any, trace: Any = None) -> float:
    """DSPy-compatible answer relevance metric.

    Heuristic scoring: 0.5 base for substantive content, up to 0.3 for
    expected-entity coverage, 0.1 for citations, and 0.1 for a
    confidence value in a plausible band.

    Args:
        example: DSPy Example (may carry ``expected_entities``).
        pred: Prediction with an ``answer`` field (optionally
            ``citations`` and ``confidence``).
        trace: Optional trace (unused; required by DSPy's metric interface).

    Returns:
        Relevance score in [0.0, 1.0].
    """
    answer = getattr(pred, "answer", None)

    # Empty or placeholder answers score zero outright.
    if not answer or not answer_has_content(answer):
        return 0.0

    # Base score for having substantive content.
    total = 0.5

    # Reward coverage of the entities the example expects.
    entity_ratio = answer_mentions_entities(
        answer, getattr(example, "expected_entities", [])
    )
    total += 0.3 * entity_ratio

    # Small bonus when the prediction carries citations.
    if getattr(pred, "citations", []):
        total += 0.1

    # Small bonus for a confidence that is neither too low nor suspiciously high.
    if 0.5 <= getattr(pred, "confidence", 0.0) <= 0.95:
        total += 0.1

    return min(total, 1.0)
async def llm_as_judge_metric(
    question: str,
    answer: str,
    gold_answer: Optional[str] = None,
    context: str = "",
) -> dict[str, Any]:
    """Use an LLM (via DSPy) to evaluate answer quality.

    Scores four dimensions and combines them into a weighted overall
    score (accuracy 0.4, relevance 0.3, completeness 0.2, clarity 0.1).

    Args:
        question: Original question.
        answer: Generated answer to evaluate.
        gold_answer: Optional gold standard answer.
        context: Retrieved context shown to the judge.

    Returns:
        Dict with per-dimension float scores, a float "overall" score,
        and a str "reasoning" entry. On failure the dict contains
        "overall" plus a str "error" message instead.
        (Annotation fixed from dict[str, float]: values include strings.)
    """
    try:
        import dspy
    except ImportError:
        # Neutral score when the judge backend is unavailable.
        return {"overall": 0.5, "error": "DSPy not available"}

    class AnswerJudge(dspy.Signature):
        """Judge the quality of an answer to a heritage question.

        Evaluate on:
        - Relevance: Does it answer the question?
        - Accuracy: Is the information correct?
        - Completeness: Does it cover all aspects?
        - Clarity: Is it well-written?
        """

        question: str = dspy.InputField(desc="Original question")
        answer: str = dspy.InputField(desc="Answer to evaluate")
        gold_answer: str = dspy.InputField(desc="Gold standard answer (if available)", default="")
        context: str = dspy.InputField(desc="Retrieved context", default="")

        relevance: float = dspy.OutputField(desc="Relevance score 0-1")
        accuracy: float = dspy.OutputField(desc="Accuracy score 0-1")
        completeness: float = dspy.OutputField(desc="Completeness score 0-1")
        clarity: float = dspy.OutputField(desc="Clarity score 0-1")
        reasoning: str = dspy.OutputField(desc="Explanation of scores")

    try:
        judge = dspy.Predict(AnswerJudge)
        result = judge(
            question=question,
            answer=answer,
            gold_answer=gold_answer or "",
            context=context,
        )

        return {
            "relevance": float(result.relevance),
            "accuracy": float(result.accuracy),
            "completeness": float(result.completeness),
            "clarity": float(result.clarity),
            # Weighted combination; accuracy weighs most.
            "overall": (
                float(result.relevance) * 0.3
                + float(result.accuracy) * 0.4
                + float(result.completeness) * 0.2
                + float(result.clarity) * 0.1
            ),
            "reasoning": result.reasoning,
        }
    except Exception as e:
        # Judge call or score parsing failed; surface the error with zero score.
        return {
            "overall": 0.0,
            "error": str(e),
        }
def language_match_score(expected_lang: str, answer: str) -> float:
    """Score whether the answer appears to be in the expected language.

    Detection is a crude stopword tally: whichever of Dutch or English
    has more indicator words wins, with ties (including empty input)
    resolved as English.

    Args:
        expected_lang: Expected language code ("nl" or "en").
        answer: Answer text.

    Returns:
        1.0 if the detected language matches, 0.0 otherwise.
    """
    nl_stopwords = {"het", "de", "een", "zijn", "er", "naar", "voor", "bij"}
    en_stopwords = {"the", "a", "an", "is", "are", "for", "with", "to"}

    tokens = answer.lower().split()
    nl_hits = sum(token in nl_stopwords for token in tokens)
    en_hits = sum(token in en_stopwords for token in tokens)

    # Ties (and empty text) fall through to English.
    detected = "nl" if nl_hits > en_hits else "en"
    return 1.0 if detected == expected_lang else 0.0