"""Answer Relevance Metrics.

Evaluates answer quality using both heuristic and LLM-based methods.
"""

import re
from typing import Any, Optional

def answer_has_content(answer: str) -> bool:
    """Check whether an answer contains meaningful content.

    An answer fails the check when it is empty, shorter than 10
    characters after stripping, or matches a known refusal phrase.

    Args:
        answer: Answer string to inspect.

    Returns:
        True if the answer looks substantive, False otherwise.
    """
    if not answer:
        return False

    # Too short to carry real information.
    if len(answer.strip()) < 10:
        return False

    # Refusal / placeholder phrases (English and Dutch).
    refusal_phrases = (
        r"I don't know",
        r"I cannot",
        r"No information",
        r"Geen informatie",
        r"Ik weet het niet",
    )
    return not any(
        re.search(phrase, answer, re.IGNORECASE) for phrase in refusal_phrases
    )
def answer_mentions_entities(answer: str, entities: list[str]) -> float:
    """Compute the fraction of expected entities present in the answer.

    Matching is case-insensitive substring containment.

    Args:
        answer: Answer string to scan.
        entities: Entity names that should appear in the answer.

    Returns:
        Ratio of mentioned entities in [0.0, 1.0]; 1.0 when no
        entities are expected.
    """
    if not entities:
        return 1.0  # Nothing expected, trivially satisfied.

    haystack = answer.lower()
    hits = [entity for entity in entities if entity.lower() in haystack]
    return len(hits) / len(entities)
def answer_relevance_metric(example: Any, pred: Any, trace: Any = None) -> float:
    """DSPy-compatible answer relevance metric.

    Heuristic scoring: 0.5 base for substantive content, up to 0.3 for
    expected-entity coverage, 0.1 for citations, and 0.1 for a
    confidence value in a plausible band.

    Args:
        example: DSPy Example (may carry ``expected_entities``).
        pred: Prediction with an ``answer`` field (optionally
            ``citations`` and ``confidence``).
        trace: Optional trace (unused; required by DSPy's metric interface).

    Returns:
        Relevance score in [0.0, 1.0].
    """
    answer = getattr(pred, "answer", None)

    # Empty or placeholder answers score zero outright.
    if not answer or not answer_has_content(answer):
        return 0.0

    # Base score for having substantive content.
    total = 0.5

    # Reward coverage of the entities the example expects.
    entity_ratio = answer_mentions_entities(
        answer, getattr(example, "expected_entities", [])
    )
    total += 0.3 * entity_ratio

    # Small bonus when the prediction carries citations.
    if getattr(pred, "citations", []):
        total += 0.1

    # Small bonus for a confidence that is neither too low nor suspiciously high.
    if 0.5 <= getattr(pred, "confidence", 0.0) <= 0.95:
        total += 0.1

    return min(total, 1.0)
async def llm_as_judge_metric(
    question: str,
    answer: str,
    gold_answer: Optional[str] = None,
    context: str = "",
) -> dict[str, Any]:
    """Use an LLM (via DSPy) to evaluate answer quality.

    Scores four dimensions and combines them into a weighted overall
    score (accuracy 0.4, relevance 0.3, completeness 0.2, clarity 0.1).

    Args:
        question: Original question.
        answer: Generated answer to evaluate.
        gold_answer: Optional gold standard answer.
        context: Retrieved context shown to the judge.

    Returns:
        Dict with per-dimension float scores, a float "overall" score,
        and a str "reasoning" entry. On failure the dict contains
        "overall" plus a str "error" message instead.
        (Annotation fixed from dict[str, float]: values include strings.)
    """
    try:
        import dspy
    except ImportError:
        # Neutral score when the judge backend is unavailable.
        return {"overall": 0.5, "error": "DSPy not available"}

    class AnswerJudge(dspy.Signature):
        """Judge the quality of an answer to a heritage question.

        Evaluate on:
        - Relevance: Does it answer the question?
        - Accuracy: Is the information correct?
        - Completeness: Does it cover all aspects?
        - Clarity: Is it well-written?
        """

        question: str = dspy.InputField(desc="Original question")
        answer: str = dspy.InputField(desc="Answer to evaluate")
        gold_answer: str = dspy.InputField(desc="Gold standard answer (if available)", default="")
        context: str = dspy.InputField(desc="Retrieved context", default="")

        relevance: float = dspy.OutputField(desc="Relevance score 0-1")
        accuracy: float = dspy.OutputField(desc="Accuracy score 0-1")
        completeness: float = dspy.OutputField(desc="Completeness score 0-1")
        clarity: float = dspy.OutputField(desc="Clarity score 0-1")
        reasoning: str = dspy.OutputField(desc="Explanation of scores")

    try:
        judge = dspy.Predict(AnswerJudge)
        result = judge(
            question=question,
            answer=answer,
            gold_answer=gold_answer or "",
            context=context,
        )

        return {
            "relevance": float(result.relevance),
            "accuracy": float(result.accuracy),
            "completeness": float(result.completeness),
            "clarity": float(result.clarity),
            # Weighted combination; accuracy weighs most.
            "overall": (
                float(result.relevance) * 0.3
                + float(result.accuracy) * 0.4
                + float(result.completeness) * 0.2
                + float(result.clarity) * 0.1
            ),
            "reasoning": result.reasoning,
        }
    except Exception as e:
        # Judge call or score parsing failed; surface the error with zero score.
        return {
            "overall": 0.0,
            "error": str(e),
        }
def language_match_score(expected_lang: str, answer: str) -> float:
    """Score whether the answer appears to be in the expected language.

    Detection is a crude stopword tally: whichever of Dutch or English
    has more indicator words wins, with ties (including empty input)
    resolved as English.

    Args:
        expected_lang: Expected language code ("nl" or "en").
        answer: Answer text.

    Returns:
        1.0 if the detected language matches, 0.0 otherwise.
    """
    nl_stopwords = {"het", "de", "een", "zijn", "er", "naar", "voor", "bij"}
    en_stopwords = {"the", "a", "an", "is", "are", "for", "with", "to"}

    tokens = answer.lower().split()
    nl_hits = sum(token in nl_stopwords for token in tokens)
    en_hits = sum(token in en_stopwords for token in tokens)

    # Ties (and empty text) fall through to English.
    detected = "nl" if nl_hits > en_hits else "en"
    return 1.0 if detected == expected_lang else 0.0