"""
Answer Relevance Metrics

Evaluates answer quality using both heuristic and LLM-based methods.
"""

from typing import Any, Optional
import re

# Answers shorter than this (after stripping whitespace) count as empty.
_MIN_CONTENT_LENGTH = 10

# English and Dutch phrases that mark a non-answer. Compiled once at import
# time; the apostrophe class also accepts the typographic form (’).
_PLACEHOLDER_RE = re.compile(
    r"I don['’]t know|I cannot|No information|Geen informatie|Ik weet het niet",
    re.IGNORECASE,
)

# Function words used by the Dutch-vs-English heuristic.
_DUTCH_INDICATORS = frozenset(
    {"het", "de", "een", "zijn", "er", "naar", "voor", "bij"}
)
_ENGLISH_INDICATORS = frozenset(
    {"the", "a", "an", "is", "are", "for", "with", "to"}
)

# Runs of letters only, so "het," and "the." still yield "het" and "the".
_WORD_RE = re.compile(r"[^\W\d_]+")


def _unit_score(raw: Any) -> float:
    """Convert a judge output to a float limited to the range 0.0-1.0."""
    return min(1.0, max(0.0, float(raw)))


def answer_has_content(answer: str) -> bool:
    """Check if answer has meaningful content.

    An answer has content when it is a string of at least 10
    non-whitespace-padded characters and contains none of the known
    placeholder phrases ("I don't know", "Geen informatie", ...).

    Args:
        answer: Answer string

    Returns:
        True if answer has content
    """
    # Non-string values cannot be inspected, so treat them as empty
    # instead of failing on .strip().
    if not answer or not isinstance(answer, str):
        return False

    if len(answer.strip()) < _MIN_CONTENT_LENGTH:
        return False

    # The phrase may appear anywhere in the answer, in any letter case.
    return _PLACEHOLDER_RE.search(answer) is None


def answer_mentions_entities(answer: str, entities: list[str]) -> float:
    """Check how many expected entities are mentioned in answer.

    Matching is a case-insensitive substring test of each entity against
    the answer.

    Args:
        answer: Answer string
        entities: Expected entities to mention

    Returns:
        Ratio of entities mentioned (0.0-1.0); 1.0 when no entities are
        expected
    """
    if not entities:
        return 1.0  # No entities expected

    haystack = answer.lower()
    hits = sum(1 for entity in entities if entity.lower() in haystack)
    return hits / len(entities)


def answer_relevance_metric(example: Any, pred: Any, trace: Any = None) -> float:
    """DSPy-compatible answer relevance metric.

    Uses heuristics to evaluate answer quality:

    - 0.5 base score for an answer that passes answer_has_content
    - up to 0.3 for the share of ``example.expected_entities`` mentioned
      (the full 0.3 when none are expected)
    - 0.1 when ``pred.citations`` is non-empty
    - 0.1 when ``pred.confidence`` lies in 0.5-0.95

    Args:
        example: DSPy Example
        pred: Prediction with answer field
        trace: Optional trace (unused)

    Returns:
        Relevance score 0.0-1.0
    """
    answer = getattr(pred, "answer", None)
    if not answer or not answer_has_content(answer):
        return 0.0

    score = 0.5  # Base score for having content

    # Bonus for mentioning expected entities; a missing or None attribute
    # means nothing is expected.
    expected_entities = getattr(example, "expected_entities", None) or []
    score += 0.3 * answer_mentions_entities(answer, expected_entities)

    # Bonus for having citations
    if getattr(pred, "citations", None):
        score += 0.1

    # Bonus for reasonable confidence. A None or otherwise non-comparable
    # confidence simply earns no bonus instead of raising TypeError.
    confidence = getattr(pred, "confidence", 0.0)
    try:
        confidence_in_range = 0.5 <= confidence <= 0.95
    except TypeError:
        confidence_in_range = False
    if confidence_in_range:
        score += 0.1

    return min(score, 1.0)


async def llm_as_judge_metric(
    question: str,
    answer: str,
    gold_answer: Optional[str] = None,
    context: str = "",
) -> dict[str, Any]:
    """Use LLM to evaluate answer quality.

    The judge is invoked directly (not awaited) inside this coroutine.

    Args:
        question: Original question
        answer: Generated answer
        gold_answer: Optional gold standard answer
        context: Retrieved context

    Returns:
        Dict with component scores. On success it holds "relevance",
        "accuracy", "completeness", "clarity" and the weighted "overall"
        (0.3 / 0.4 / 0.2 / 0.1), each limited to 0.0-1.0, plus the judge's
        "reasoning" text. When DSPy cannot be imported it holds
        "overall" 0.5 and an "error" message; when the judge call or score
        conversion fails it holds "overall" 0.0 and the "error" message.
    """
    try:
        import dspy
    except ImportError:
        return {"overall": 0.5, "error": "DSPy not available"}

    class AnswerJudge(dspy.Signature):
        """Judge the quality of an answer to a heritage question.

        Evaluate on:
        - Relevance: Does it answer the question?
        - Accuracy: Is the information correct?
        - Completeness: Does it cover all aspects?
        - Clarity: Is it well-written?
        """

        question: str = dspy.InputField(desc="Original question")
        answer: str = dspy.InputField(desc="Answer to evaluate")
        gold_answer: str = dspy.InputField(desc="Gold standard answer (if available)", default="")
        context: str = dspy.InputField(desc="Retrieved context", default="")

        relevance: float = dspy.OutputField(desc="Relevance score 0-1")
        accuracy: float = dspy.OutputField(desc="Accuracy score 0-1")
        completeness: float = dspy.OutputField(desc="Completeness score 0-1")
        clarity: float = dspy.OutputField(desc="Clarity score 0-1")
        reasoning: str = dspy.OutputField(desc="Explanation of scores")

    try:
        judge = dspy.Predict(AnswerJudge)
        result = judge(
            question=question,
            answer=answer,
            gold_answer=gold_answer or "",
            context=context,
        )

        # Convert each score once and keep it inside the documented range.
        relevance = _unit_score(result.relevance)
        accuracy = _unit_score(result.accuracy)
        completeness = _unit_score(result.completeness)
        clarity = _unit_score(result.clarity)

        return {
            "relevance": relevance,
            "accuracy": accuracy,
            "completeness": completeness,
            "clarity": clarity,
            "overall": (
                relevance * 0.3
                + accuracy * 0.4
                + completeness * 0.2
                + clarity * 0.1
            ),
            "reasoning": result.reasoning,
        }
    except Exception as e:
        # Best-effort metric: any judge or parsing failure is reported in
        # the result instead of propagating to the evaluation loop.
        return {
            "overall": 0.0,
            "error": str(e),
        }


def language_match_score(expected_lang: str, answer: str) -> float:
    """Check if answer is in expected language.

    Counts common Dutch and English function words in the answer. The
    answer is classified as Dutch only when Dutch indicators strictly
    outnumber English ones; ties (including an answer with no indicator
    words) are classified as English.

    Args:
        expected_lang: Expected language code (nl, en); compared
            case-insensitively on its primary subtag, so "NL" and "nl-NL"
            are treated as "nl"
        answer: Answer text

    Returns:
        1.0 if language matches, 0.0 otherwise
    """
    # Tokenise on letters so trailing punctuation does not hide a word.
    words = _WORD_RE.findall(answer.lower())
    dutch_count = sum(1 for word in words if word in _DUTCH_INDICATORS)
    english_count = sum(1 for word in words if word in _ENGLISH_INDICATORS)

    detected = "nl" if dutch_count > english_count else "en"

    expected = re.split(r"[-_]", expected_lang.strip().lower(), maxsplit=1)[0]
    return 1.0 if detected == expected else 0.0