glam/tests/dspy_gitops/metrics/intent_accuracy.py
2026-01-11 18:08:40 +01:00

76 lines
2 KiB
Python

"""
Intent Accuracy Metrics
Measures how accurately the system classifies query intent.
"""
from typing import Any
def intent_accuracy(expected: str, predicted: str) -> float:
    """Score an intent prediction by exact match.

    Both labels are lowercased and stripped of surrounding whitespace
    before being compared, so "Temporal " and "temporal" count as equal.

    Args:
        expected: Expected intent classification
        predicted: Predicted intent classification

    Returns:
        1.0 if the normalized labels are equal, 0.0 otherwise
    """
    want = expected.lower().strip()
    got = predicted.lower().strip()
    if want == got:
        return 1.0
    return 0.0
def intent_accuracy_metric(example: Any, pred: Any, trace: Any = None) -> float:
    """DSPy-style wrapper around intent_accuracy().

    Reads the gold label from ``example.expected_intent`` and the predicted
    label from ``pred.intent``, then scores them by exact match.

    Args:
        example: Object carrying an ``expected_intent`` attribute
        pred: Object carrying an ``intent`` attribute
        trace: Accepted for signature compatibility; not used here

    Returns:
        1.0 if the intents match, 0.0 if they differ or if either
        attribute is missing or None
    """
    gold = getattr(example, "expected_intent", None)
    guess = getattr(pred, "intent", None)
    # Only score when both sides actually carry a label; a missing label
    # can never count as a match.
    if gold is not None and guess is not None:
        return intent_accuracy(gold, guess)
    return 0.0
# Partial-credit scores for pairs of distinct intents. Each pair is stored
# once; intent_similarity_score() looks a pair up in both orders, so the
# table behaves as a symmetric mapping.
INTENT_SIMILARITY = {
    ("statistical", "exploration"): 0.3,
    ("geographic", "exploration"): 0.3,
    ("entity_lookup", "exploration"): 0.5,
    ("temporal", "entity_lookup"): 0.2,
    ("relational", "entity_lookup"): 0.3,
    ("comparative", "statistical"): 0.4,
}


def intent_similarity_score(expected: str, predicted: str) -> float:
    """Calculate intent similarity with partial credit.

    Labels are lowercased and stripped of surrounding whitespace before
    comparison, the same normalization intent_accuracy() applies, so a
    label that counts as an exact match there also scores 1.0 here.

    Args:
        expected: Expected intent
        predicted: Predicted intent

    Returns:
        1.0 for identical labels, the INTENT_SIMILARITY score when the pair
        is listed in either order, 0.0 otherwise
    """
    # Normalize once instead of re-lowercasing for every comparison.
    want = expected.lower().strip()
    got = predicted.lower().strip()
    if want == got:
        return 1.0

    # The table lists each pair in one order only, so try the given order
    # first and fall back to the reversed pair.
    forward = INTENT_SIMILARITY.get((want, got))
    if forward is not None:
        return forward
    return INTENT_SIMILARITY.get((got, want), 0.0)