# Evaluation Framework for Heritage Custodian RAG
|
|
|
|
## Overview
|
|
|
|
This document defines evaluation metrics, benchmarks, and gold standard datasets for the Heritage Custodian RAG pipeline. Evaluation covers all pipeline stages: entity extraction, type classification, entity linking, retrieval, and end-to-end question answering.
|
|
|
|
## Evaluation Architecture
|
|
|
|
```
|
|
┌─────────────────────────────────────────────────────────────────────┐
|
|
│ Evaluation Framework │
|
|
├─────────────────────────────────────────────────────────────────────┤
|
|
│ │
|
|
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
│ │ Task │ │ Gold │ │ Metrics │ │
|
|
│ │ Modules │ │ Standard │ │ Suite │ │
|
|
│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
|
|
│ │ │ │ │
|
|
│ ▼ ▼ ▼ │
|
|
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
│ │ Evaluation Runner │ │
|
|
│ │ • Batch evaluation across datasets │ │
|
|
│ │ • Stratified sampling (by type, country, tier) │ │
|
|
│ │ • Statistical significance testing │ │
|
|
│ └─────────────────────────────────────────────────────────┘ │
|
|
│ │ │
|
|
│ ▼ │
|
|
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
│ │ Results Dashboard │ │
|
|
│ │ • Per-task metrics breakdown │ │
|
|
│ │ • Error analysis and confusion matrices │ │
|
|
│ │ • Performance over time / model versions │ │
|
|
│ └─────────────────────────────────────────────────────────┘ │
|
|
│ │
|
|
└─────────────────────────────────────────────────────────────────────┘
|
|
```
|
|
|
|
## 1. Entity Extraction Metrics
|
|
|
|
### Token-Level Metrics
|
|
|
|
```python
|
|
from typing import List, Tuple
|
|
from collections import defaultdict
|
|
|
|
def token_level_metrics(
    pred_spans: List[Tuple[int, int, str]],  # (start, end, type)
    gold_spans: List[Tuple[int, int, str]],
    text: str,
) -> dict:
    """Compute token-level precision, recall, F1.

    Tokens are whitespace-delimited. A token receives a span's type when
    the token overlaps that span; when several spans overlap one token,
    the last span in the list wins (same tie-break as before).

    Args:
        pred_spans: Predicted entity spans with types.
        gold_spans: Gold standard entity spans with types.
        text: Original text for tokenization.

    Returns:
        Dictionary with P, R, F1, support per entity type, plus a
        "macro_avg" entry averaged over the observed types.
    """

    tokens = text.split()

    # Recover each token's true character offsets by searching forward in
    # the original text. The previous "char_pos += len(token) + 1"
    # accounting assumed single-space separation and drifted on runs of
    # whitespace, tabs, or leading spaces, mislabeling tokens.
    token_offsets: List[Tuple[int, int]] = []
    cursor = 0
    for token in tokens:
        start = text.find(token, cursor)
        token_offsets.append((start, start + len(token)))
        cursor = start + len(token)

    def spans_to_tokens(spans: List[Tuple[int, int, str]]) -> dict:
        """Map spans to token indices with types (overlap-based)."""
        token_labels = {}
        for idx, (tok_start, tok_end) in enumerate(token_offsets):
            for start, end, etype in spans:
                # Label a token when it overlaps the span at all.
                if tok_start < end and tok_end > start:
                    token_labels[idx] = etype
        return token_labels

    pred_tokens = spans_to_tokens(pred_spans)
    gold_tokens = spans_to_tokens(gold_spans)

    # Compute metrics per type (union of types seen on either side).
    all_types = set(pred_tokens.values()) | set(gold_tokens.values())
    metrics = {}

    for etype in all_types:
        pred_set = {k for k, v in pred_tokens.items() if v == etype}
        gold_set = {k for k, v in gold_tokens.items() if v == etype}

        tp = len(pred_set & gold_set)
        fp = len(pred_set - gold_set)
        fn = len(gold_set - pred_set)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics[etype] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": len(gold_set),
        }

    # Macro average over the observed types only.
    n_types = len(metrics)
    macro_p = sum(m["precision"] for m in metrics.values()) / n_types if n_types else 0.0
    macro_r = sum(m["recall"] for m in metrics.values()) / n_types if n_types else 0.0
    macro_f1 = sum(m["f1"] for m in metrics.values()) / n_types if n_types else 0.0

    metrics["macro_avg"] = {"precision": macro_p, "recall": macro_r, "f1": macro_f1}

    return metrics
|
|
```
|
|
|
|
### Entity-Level Metrics (Strict & Partial)
|
|
|
|
```python
|
|
def entity_level_metrics(
    pred_entities: List[dict],  # [{text, type, start, end}]
    gold_entities: List[dict],
    match_type: str = "strict",  # "strict" or "partial"
) -> dict:
    """Entity-level precision, recall, and F1.

    Strict matching requires an exact span and type match; partial
    matching accepts any character overlap, still with an identical type.
    Each gold entity can be claimed by at most one prediction (greedy
    first-match pairing).
    """

    def _is_match(pred: dict, gold: dict) -> bool:
        # A differing type is never a match, regardless of span mode.
        if pred["type"] != gold["type"]:
            return False
        if match_type == "strict":
            return (pred["start"], pred["end"]) == (gold["start"], gold["end"])
        # Partial mode: any overlap between the two spans.
        return pred["start"] < gold["end"] and gold["start"] < pred["end"]

    paired_pred = set()
    paired_gold = set()

    for p_idx, pred in enumerate(pred_entities):
        for g_idx, gold in enumerate(gold_entities):
            if g_idx in paired_gold or not _is_match(pred, gold):
                continue
            paired_pred.add(p_idx)
            paired_gold.add(g_idx)
            break

    tp = len(paired_gold)
    fp = len(pred_entities) - len(paired_pred)
    fn = len(gold_entities) - len(paired_gold)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
        "match_type": match_type,
    }
|
|
```
|
|
|
|
### Entity Type Distribution Analysis
|
|
|
|
```python
|
|
def type_distribution_analysis(
    pred_entities: List[dict],
    gold_entities: List[dict],
) -> dict:
    """Compare predicted vs. gold entity-type counts.

    Returns, per type: predicted count, gold count, the signed
    over-prediction (predicted - gold), and the predicted/gold ratio
    (inf when the type never occurs in gold).
    """

    from collections import Counter

    predicted_freq = Counter(entity["type"] for entity in pred_entities)
    gold_freq = Counter(entity["type"] for entity in gold_entities)

    report = {}
    for etype in predicted_freq.keys() | gold_freq.keys():
        n_pred = predicted_freq.get(etype, 0)
        n_gold = gold_freq.get(etype, 0)
        report[etype] = {
            "predicted": n_pred,
            "gold": n_gold,
            "over_prediction": n_pred - n_gold,
            "ratio": n_pred / n_gold if n_gold > 0 else float("inf"),
        }

    return report
|
|
```
|
|
|
|
## 2. Type Classification Metrics
|
|
|
|
### Multi-Class Classification Metrics
|
|
|
|
```python
|
|
import numpy as np
|
|
from sklearn.metrics import (
|
|
accuracy_score,
|
|
precision_recall_fscore_support,
|
|
confusion_matrix,
|
|
classification_report,
|
|
)
|
|
|
|
def type_classification_metrics(
    y_true: List[str],  # Gold labels
    y_pred: List[str],  # Predicted labels
    labels: List[str] = None,  # All possible labels
) -> dict:
    """Compute classification metrics for GLAMORCUBESFIXPHDNT types.

    Defaults to the 19 single-letter GLAMORCUBESFIXPHDNT labels when no
    label set is supplied. Returns accuracy, macro and support-weighted
    averages, a per-class breakdown, and the confusion matrix
    (rows/columns ordered by `labels`).
    """

    if labels is None:
        # One single-character label per letter of the scheme (19 types).
        labels = list("GLAMORCUBESFIXPHDNT")

    accuracy = accuracy_score(y_true, y_pred)

    # Per-class P/R/F1/support; classes absent from the data score 0
    # instead of triggering a zero-division warning.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, zero_division=0
    )

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    per_class = {
        label: {
            "precision": float(precision[i]),
            "recall": float(recall[i]),
            "f1": float(f1[i]),
            "support": int(support[i]),
        }
        for i, label in enumerate(labels)
    }

    return {
        "accuracy": accuracy,
        "macro_precision": float(np.mean(precision)),
        "macro_recall": float(np.mean(recall)),
        "macro_f1": float(np.mean(f1)),
        "weighted_f1": float(np.average(f1, weights=support)),
        "per_class": per_class,
        "confusion_matrix": cm.tolist(),
        "labels": labels,
    }
|
|
```
|
|
|
|
### Type Confusion Analysis
|
|
|
|
```python
|
|
def type_confusion_analysis(
    y_true: List[str],
    y_pred: List[str],
    top_k: int = 10,
) -> dict:
    """Report the most frequent (gold, predicted) type confusions.

    Only disagreements count; the top_k most common pairs are returned
    along with total error and unique-pair counts.
    """

    from collections import Counter

    error_pairs = Counter(
        (gold, pred)
        for gold, pred in zip(y_true, y_pred)
        if gold != pred
    )

    return {
        "top_confusions": [
            {"true_type": gold, "predicted_type": pred, "count": n}
            for (gold, pred), n in error_pairs.most_common(top_k)
        ],
        "total_errors": sum(error_pairs.values()),
        "unique_confusion_pairs": len(error_pairs),
    }
|
|
```
|
|
|
|
### Hierarchical Type Metrics
|
|
|
|
```python
|
|
def hierarchical_type_metrics(
    y_true: List[str],
    y_pred: List[str],
) -> dict:
    """Metrics accounting for type hierarchy relationships.

    GLAMORCUBESFIXPHDNT types can be grouped into higher-level categories:
    - CULTURAL: G, L, A, M (core GLAM)
    - INSTITUTIONAL: O, R, E
    - COMMUNITY: S, I, N
    - SPECIALIZED: B, H, T, F
    - DIGITAL: D
    - PRIVATE: C, P
    - UNKNOWN: U, X

    Reports exact accuracy, lenient category-level accuracy, and the gap
    between them (how often errors stay within the right category).
    """

    TYPE_HIERARCHY = {
        "CULTURAL": ["G", "L", "A", "M"],
        "INSTITUTIONAL": ["O", "R", "E"],
        "COMMUNITY": ["S", "I", "N"],
        "SPECIALIZED": ["B", "H", "T", "F"],
        "DIGITAL": ["D"],
        "PRIVATE": ["C", "P"],
        "UNKNOWN": ["U", "X"],
    }

    # Invert the hierarchy into a type -> category lookup table.
    type_to_category = {
        t: category
        for category, members in TYPE_HIERARCHY.items()
        for t in members
    }

    # Unrecognized type codes fall back to the UNKNOWN category.
    y_true_cat = [type_to_category.get(label, "UNKNOWN") for label in y_true]
    y_pred_cat = [type_to_category.get(label, "UNKNOWN") for label in y_pred]

    exact_accuracy = accuracy_score(y_true, y_pred)
    category_accuracy = accuracy_score(y_true_cat, y_pred_cat)

    return {
        "exact_accuracy": exact_accuracy,
        "category_accuracy": category_accuracy,
        "hierarchy_gap": category_accuracy - exact_accuracy,
    }
|
|
```
|
|
|
|
## 3. Entity Linking Metrics
|
|
|
|
### Linking Accuracy (Hits@K)
|
|
|
|
```python
|
|
def linking_accuracy(
    predictions: List[dict],  # [{mention, candidates: [{kb_id, score}]}]
    gold: List[dict],  # [{mention, gold_kb_id}]
    k_values: List[int] = None,
) -> dict:
    """Compute Hits@K for entity linking.

    A prediction scores a hit at K when the gold KB id appears among its
    top-K candidates. Rates are normalized by the number of gold
    mentions, so gold mentions without a prediction count as misses.

    Args:
        predictions: Per-mention ranked candidate lists.
        gold: Gold mention -> KB id pairs (mentions assumed unique).
        k_values: Cutoffs to report; defaults to [1, 5, 10].

    Returns:
        {"hits@k": rate} for each requested k.
    """

    # BUG FIX: the default was a shared mutable list ([1, 5, 10]); use a
    # None sentinel instead so callers can never mutate shared state.
    if k_values is None:
        k_values = [1, 5, 10]

    results = {f"hits@{k}": 0.0 for k in k_values}

    # Build gold lookup
    gold_lookup = {g["mention"]: g["gold_kb_id"] for g in gold}

    for pred in predictions:
        gold_id = gold_lookup.get(pred["mention"])
        if gold_id is None:
            # Mention not in the gold set: skip rather than penalize.
            continue

        candidate_ids = [c["kb_id"] for c in pred.get("candidates", [])]

        for k in k_values:
            if gold_id in candidate_ids[:k]:
                results[f"hits@{k}"] += 1

    # Normalize by gold size (guard against an empty gold set).
    n = len(gold)
    for k in k_values:
        results[f"hits@{k}"] /= n if n > 0 else 1

    return results
|
|
```
|
|
|
|
### Mean Reciprocal Rank (MRR)
|
|
|
|
```python
|
|
def mean_reciprocal_rank(
    predictions: List[dict],
    gold: List[dict],
) -> float:
    """Compute MRR for entity linking.

    For each prediction whose mention appears in the gold set, the score
    is 1/rank of the gold KB id within the candidate list (0 when
    absent); the result is the mean over the scored predictions.
    """

    gold_by_mention = {entry["mention"]: entry["gold_kb_id"] for entry in gold}

    scores = []
    for pred in predictions:
        target = gold_by_mention.get(pred["mention"])
        if target is None:
            # No gold annotation for this mention; it does not count.
            continue

        ranked_ids = [cand["kb_id"] for cand in pred.get("candidates", [])]
        try:
            scores.append(1.0 / (ranked_ids.index(target) + 1))
        except ValueError:
            # Gold id missing from the candidate list entirely.
            scores.append(0.0)

    return sum(scores) / len(scores) if scores else 0.0
|
|
```
|
|
|
|
### NIL Detection Metrics
|
|
|
|
```python
|
|
def nil_detection_metrics(
    predictions: List[dict],  # [{mention, is_nil_pred}]
    gold: List[dict],  # [{mention, is_nil_gold}]
) -> dict:
    """Metrics for NIL entity detection.

    NOTE(review): predictions and gold are paired positionally, so both
    lists are assumed to be in the same mention order — confirm callers
    guarantee this.
    """

    pairs = list(zip(
        (g["is_nil_gold"] for g in gold),
        (p["is_nil_pred"] for p in predictions),
    ))

    tp = sum(1 for actual, predicted in pairs if actual and predicted)
    fp = sum(1 for actual, predicted in pairs if predicted and not actual)
    fn = sum(1 for actual, predicted in pairs if actual and not predicted)
    tn = sum(1 for actual, predicted in pairs if not actual and not predicted)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "nil_precision": precision,
        "nil_recall": recall,
        "nil_f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
        "true_negatives": tn,
    }
|
|
```
|
|
|
|
### Cross-KB Linking Consistency
|
|
|
|
```python
|
|
def cross_kb_consistency(
    linked_entities: List[dict],  # [{wikidata_id, viaf_id, isil_code}]
) -> dict:
    """Check consistency of cross-KB identifier linking.

    Entities carrying both a Wikidata and a VIAF id are cross-checked via
    get_wikidata_viaf; entities with only some identifiers count as
    "partial" coverage and are not consistency-checked.
    """

    tallies = {"consistent": 0, "inconsistent": 0, "partial": 0}

    for entity in linked_entities:
        wikidata_id = entity.get("wikidata_id")
        viaf_id = entity.get("viaf_id")
        isil_code = entity.get("isil_code")

        if wikidata_id and viaf_id:
            # Verify the claimed VIAF id against what Wikidata records.
            if get_wikidata_viaf(wikidata_id) == viaf_id:
                tallies["consistent"] += 1
            else:
                tallies["inconsistent"] += 1
        elif wikidata_id or viaf_id or isil_code:
            tallies["partial"] += 1

    consistent = tallies["consistent"]
    inconsistent = tallies["inconsistent"]
    partial = tallies["partial"]
    checked = consistent + inconsistent
    total = checked + partial

    return {
        "consistent": consistent,
        "inconsistent": inconsistent,
        "partial": partial,
        "consistency_rate": consistent / checked if checked > 0 else 1.0,
        "coverage_rate": (consistent + partial) / total if total > 0 else 0.0,
    }
|
|
```
|
|
|
|
## 4. Retrieval Metrics
|
|
|
|
### Normalized Discounted Cumulative Gain (NDCG)
|
|
|
|
```python
|
|
import numpy as np
|
|
|
|
def ndcg_at_k(
    relevance_scores: List[float],  # Graded relevance (0-3)
    k: int = 10,
) -> float:
    """Compute NDCG@K for retrieval results.

    Uses exponential gain (2^rel - 1) with a log2 position discount; the
    ideal ordering for normalization sorts all judged scores descending.
    Returns 0.0 when the ideal DCG is zero (no relevant items).
    """

    # Discounted cumulative gain of the ranking as given.
    top = np.array(relevance_scores[:k])
    position_discounts = np.log2(np.arange(2, len(top) + 2))
    dcg = np.sum((2 ** top - 1) / position_discounts)

    # Ideal DCG: the best achievable ordering of the judgment list.
    best = np.sort(relevance_scores)[::-1][:k]
    idcg = np.sum((2 ** np.array(best) - 1) / position_discounts[: len(best)])

    return dcg / idcg if idcg > 0 else 0.0
|
|
|
|
|
|
def compute_retrieval_metrics(
    queries: List[dict],  # [{query, results: [{doc_id, score}], gold_relevant: [doc_ids]}]
    k_values: List[int] = None,
) -> dict:
    """Compute comprehensive retrieval metrics (NDCG@K, P@K, R@K, MRR).

    Args:
        queries: Per-query ranked results plus the gold relevant doc ids.
        k_values: Cutoffs to evaluate; defaults to [5, 10, 20].

    Returns:
        Mapping of metric name to its mean across all queries.
    """

    # BUG FIX: the default was a shared mutable list ([5, 10, 20]); use a
    # None sentinel instead so callers can never mutate shared state.
    if k_values is None:
        k_values = [5, 10, 20]

    metrics = {f"ndcg@{k}": [] for k in k_values}
    metrics.update({f"precision@{k}": [] for k in k_values})
    metrics.update({f"recall@{k}": [] for k in k_values})
    metrics["mrr"] = []

    for query in queries:
        result_ids = [r["doc_id"] for r in query["results"]]
        gold_set = set(query["gold_relevant"])

        # Binary relevance for each ranked result
        relevance = [1 if rid in gold_set else 0 for rid in result_ids]

        # MRR: reciprocal rank of the first relevant result (0 if none).
        for i, rel in enumerate(relevance):
            if rel == 1:
                metrics["mrr"].append(1.0 / (i + 1))
                break
        else:
            metrics["mrr"].append(0.0)

        # Metrics at each K
        for k in k_values:
            metrics[f"ndcg@{k}"].append(ndcg_at_k(relevance, k))

            retrieved_relevant = sum(relevance[:k])
            metrics[f"precision@{k}"].append(retrieved_relevant / k)

            # Recall is 1.0 by convention when there is nothing to find.
            if gold_set:
                metrics[f"recall@{k}"].append(retrieved_relevant / len(gold_set))
            else:
                metrics[f"recall@{k}"].append(1.0)

    # Average across queries
    return {
        metric: np.mean(values) for metric, values in metrics.items()
    }
|
|
```
|
|
|
|
### Retrieval Source Analysis
|
|
|
|
```python
|
|
def retrieval_source_analysis(
    query_results: List[dict],  # [{results: [{doc_id, source, score}]}]
) -> dict:
    """Analyze contribution of different retrieval sources.

    Counts, over the top-10 results of each query, how often each source
    appears overall, at rank 1, and within the top 5. Top-1 rates are
    per query; top-5 rates are normalized by 5 slots per query.
    """

    overall = defaultdict(int)
    at_rank1 = defaultdict(int)
    in_top5 = defaultdict(int)

    for query in query_results:
        for rank, result in enumerate(query["results"][:10]):
            src = result["source"]
            overall[src] += 1
            if rank == 0:
                at_rank1[src] += 1
            if rank < 5:
                in_top5[src] += 1

    n_queries = len(query_results)

    return {
        "source_distribution": dict(overall),
        "top1_by_source": {s: c / n_queries for s, c in at_rank1.items()},
        "top5_by_source": {s: c / (n_queries * 5) for s, c in in_top5.items()},
    }
|
|
```
|
|
|
|
## 5. End-to-End RAG Metrics
|
|
|
|
### Answer Quality Metrics
|
|
|
|
```python
|
|
class RAGEvaluator:
    """End-to-end RAG evaluation.

    Scores a generated answer on lexical overlap (ROUGE), semantic
    similarity, faithfulness to the retrieved context, relevance to the
    question, and — when a judge is configured — an LLM-as-judge score.
    """

    def __init__(self, llm_judge=None):
        # BUG FIX: the original called self._default_judge(), which was
        # never defined, so constructing the evaluator always raised
        # AttributeError. With no judge supplied we now simply run
        # without one; evaluate_answer() already guards on self.llm_judge.
        self.llm_judge = llm_judge
        # Lazily-created sentence-transformer model, cached across calls
        # instead of being reloaded on every similarity computation.
        self._embedder = None

    def evaluate_answer(
        self,
        question: str,
        generated_answer: str,
        gold_answer: str,
        retrieved_context: str,
    ) -> dict:
        """Evaluate a single RAG response.

        Returns a dict with rouge, semantic_similarity, faithfulness,
        relevance, and (when a judge is configured) llm_judge scores.
        """

        metrics = {}

        # 1. Lexical overlap
        metrics["rouge"] = self._compute_rouge(generated_answer, gold_answer)

        # 2. Semantic similarity
        metrics["semantic_similarity"] = self._semantic_similarity(
            generated_answer, gold_answer
        )

        # 3. Faithfulness (answer grounded in context)
        metrics["faithfulness"] = self._compute_faithfulness(
            generated_answer, retrieved_context
        )

        # 4. Relevance (answer addresses question)
        metrics["relevance"] = self._compute_relevance(
            question, generated_answer
        )

        # 5. LLM-as-judge (if available)
        # NOTE(review): _llm_judge_score is expected to be provided by a
        # subclass or injected judge — it is not defined in this class.
        if self.llm_judge:
            metrics["llm_judge"] = self._llm_judge_score(
                question, generated_answer, gold_answer
            )

        return metrics

    def _compute_rouge(self, generated: str, reference: str) -> dict:
        """ROUGE-1/2/L F-measures of the generated answer vs. the gold."""
        from rouge_score import rouge_scorer

        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
        scores = scorer.score(reference, generated)

        return {
            "rouge1": scores["rouge1"].fmeasure,
            "rouge2": scores["rouge2"].fmeasure,
            "rougeL": scores["rougeL"].fmeasure,
        }

    def _semantic_similarity(self, text1: str, text2: str) -> float:
        """Cosine similarity of sentence embeddings (model cached)."""
        from sentence_transformers import SentenceTransformer, util

        if self._embedder is None:
            # Load once; the original re-instantiated the model per call,
            # paying the full model-load cost on every comparison.
            self._embedder = SentenceTransformer(
                "sentence-transformers/all-MiniLM-L6-v2"
            )

        emb1 = self._embedder.encode(text1)
        emb2 = self._embedder.encode(text2)

        return float(util.cos_sim(emb1, emb2)[0][0])

    def _compute_faithfulness(self, answer: str, context: str) -> float:
        """Check if answer claims are supported by context.

        Simplified heuristic: a sentence counts as supported when any of
        its words longer than 4 characters occurs in the context.
        """
        answer_sents = answer.split(". ")
        context_lower = context.lower()

        supported = sum(
            1
            for sent in answer_sents
            if any(
                word in context_lower
                for word in sent.lower().split()
                if len(word) > 4
            )
        )

        return supported / len(answer_sents) if answer_sents else 0.0

    def _compute_relevance(self, question: str, answer: str) -> float:
        """Check if answer addresses the question (embedding similarity)."""
        return self._semantic_similarity(question, answer)
|
|
```
|
|
|
|
### Factual Accuracy
|
|
|
|
```python
|
|
def factual_accuracy(
    generated_claims: List[dict],  # [{claim, verifiable}]
    gold_facts: List[dict],  # [{fact, source}]
) -> dict:
    """Evaluate factual accuracy of generated answers.

    A verifiable claim counts as correct when its text occurs
    (case-insensitively) inside any gold fact; unverifiable claims are
    tallied separately and excluded from the accuracy denominator.
    """

    fact_texts = [fact["fact"].lower() for fact in gold_facts]

    correct = 0
    incorrect = 0
    unverifiable = 0

    for claim in generated_claims:
        if not claim["verifiable"]:
            unverifiable += 1
            continue
        claim_text = claim["claim"].lower()
        if any(claim_text in fact for fact in fact_texts):
            correct += 1
        else:
            incorrect += 1

    checked = correct + incorrect

    return {
        "factual_accuracy": correct / checked if checked > 0 else 0.0,
        "correct_claims": correct,
        "incorrect_claims": incorrect,
        "unverifiable_claims": unverifiable,
        "total_claims": len(generated_claims),
    }
|
|
```
|
|
|
|
## 6. Gold Standard Datasets
|
|
|
|
### Dataset Requirements
|
|
|
|
```yaml
|
|
# Gold Standard Dataset Schema
|
|
gold_standard_dataset:
|
|
metadata:
|
|
name: "Heritage Custodian Evaluation v1.0"
|
|
created_date: "2025-12-01"
|
|
languages: ["nl", "en", "de", "fr"]
|
|
domains: ["museums", "archives", "libraries"]
|
|
|
|
ner_annotations:
|
|
format: "BRAT/IOB2"
|
|
entity_types:
|
|
- "GRP.HER.MUS"
|
|
- "GRP.HER.ARC"
|
|
- "GRP.HER.LIB"
|
|
- "GRP.HER.SOC"
|
|
- "TOP"
|
|
- "TMP"
|
|
- "IDENTIFIER"
|
|
samples_per_type: 100
|
|
inter_annotator_agreement: ">0.85 Cohen's Kappa"
|
|
|
|
type_classification:
|
|
format: "TSV"
|
|
columns: ["text", "gold_type", "secondary_types"]
|
|
samples_per_type: 50
|
|
balanced: true
|
|
|
|
entity_linking:
|
|
format: "JSON"
|
|
fields: ["mention", "context", "gold_wikidata", "gold_viaf", "gold_isil"]
|
|
nil_ratio: 0.15
|
|
samples: 500
|
|
|
|
retrieval:
|
|
format: "TREC"
|
|
queries: 200
|
|
judgments_per_query: 50
|
|
graded_relevance: [0, 1, 2, 3]
|
|
|
|
qa:
|
|
format: "JSON"
|
|
fields: ["question", "gold_answer", "supporting_facts", "answer_type"]
|
|
question_types:
|
|
- "factual"
|
|
- "comparative"
|
|
- "relationship"
|
|
- "temporal"
|
|
samples: 300
|
|
```
|
|
|
|
### Dataset Splits
|
|
|
|
```python
|
|
# Fractional splits of the gold standard dataset; the values sum to 1.0.
DATASET_SPLITS = {
    "train": 0.70,
    "dev": 0.15,
    "test": 0.15,
}

# Factors used for stratified sampling so that each split mirrors the
# overall corpus composition along these dimensions.
STRATIFICATION_FACTORS = [
    "custodian_type", # GLAMORCUBESFIXPHDNT distribution
    "country", # Geographic coverage
    "data_tier", # Quality tier (1-4)
    "source_type", # Conversation, web, CSV
]
|
|
```
|
|
|
|
### Sample Gold Standard Entry
|
|
|
|
```json
|
|
{
|
|
"id": "eval_001",
|
|
"text": "Het Rijksmuseum Amsterdam (ISIL: NL-AmRM) werd opgericht in 1800 en beheert de grootste collectie Nederlandse kunst.",
|
|
|
|
"ner_annotations": [
|
|
{"text": "Rijksmuseum Amsterdam", "type": "GRP.HER.MUS", "start": 4, "end": 25},
|
|
{"text": "NL-AmRM", "type": "IDENTIFIER", "start": 33, "end": 40},
|
|
{"text": "1800", "type": "TMP", "start": 58, "end": 62},
|
|
{"text": "Nederlandse", "type": "TOP", "start": 93, "end": 104}
|
|
],
|
|
|
|
"type_classification": {
|
|
"primary_type": "M",
|
|
"secondary_types": [],
|
|
"rationale": "Art museum with cultural heritage collections"
|
|
},
|
|
|
|
"entity_linking": {
|
|
"Rijksmuseum Amsterdam": {
|
|
"wikidata_id": "Q190804",
|
|
"viaf_id": "148691498",
|
|
"isil_code": "NL-AmRM",
|
|
"ghcid": "NL-NH-AMS-M-RM"
|
|
}
|
|
},
|
|
|
|
"qa_pairs": [
|
|
{
|
|
"question": "Wanneer is het Rijksmuseum opgericht?",
|
|
"answer": "Het Rijksmuseum werd opgericht in 1800.",
|
|
"answer_type": "factual",
|
|
"supporting_facts": ["opgericht in 1800"]
|
|
}
|
|
]
|
|
}
|
|
```
|
|
|
|
## 7. Evaluation Runner
|
|
|
|
### Batch Evaluation Pipeline
|
|
|
|
```python
|
|
class EvaluationRunner:
    """Run comprehensive evaluation across all tasks.

    NOTE(review): only _evaluate_ner and _compute_summary are defined
    here; the remaining _evaluate_* task methods referenced by
    run_full_evaluation are expected to be supplied elsewhere (or by a
    subclass) — confirm before running.
    """

    def __init__(self, config: dict):
        self.config = config
        self.metrics = {}

    def run_full_evaluation(
        self,
        model,
        gold_dataset: List[dict],
    ) -> dict:
        """Run every evaluation task and return per-task + summary results."""

        results = {}

        # Each stage announces itself, then stores its metrics under the
        # task-specific key that _compute_summary later reads.
        print("Evaluating NER...")
        results["ner"] = self._evaluate_ner(model, gold_dataset)

        print("Evaluating Type Classification...")
        results["type_classification"] = self._evaluate_type_classification(
            model, gold_dataset
        )

        print("Evaluating Entity Linking...")
        results["entity_linking"] = self._evaluate_entity_linking(
            model, gold_dataset
        )

        print("Evaluating Retrieval...")
        results["retrieval"] = self._evaluate_retrieval(model, gold_dataset)

        print("Evaluating End-to-End QA...")
        results["qa"] = self._evaluate_qa(model, gold_dataset)

        results["summary"] = self._compute_summary(results)
        return results

    def _evaluate_ner(self, model, dataset) -> dict:
        """Pool predictions over the dataset, then score strict/partial NER."""
        pooled_pred = []
        pooled_gold = []

        for sample in dataset:
            pooled_pred.extend(model.extract_entities(sample["text"]))
            pooled_gold.extend(sample["ner_annotations"])

        return {
            "strict": entity_level_metrics(pooled_pred, pooled_gold, "strict"),
            "partial": entity_level_metrics(pooled_pred, pooled_gold, "partial"),
            "type_distribution": type_distribution_analysis(
                pooled_pred, pooled_gold
            ),
        }

    def _compute_summary(self, results: dict) -> dict:
        """Flatten the headline metric of each task into a single dict."""

        return {
            "ner_f1_strict": results["ner"]["strict"]["f1"],
            "ner_f1_partial": results["ner"]["partial"]["f1"],
            "type_classification_accuracy": results["type_classification"]["accuracy"],
            "entity_linking_mrr": results["entity_linking"]["mrr"],
            "entity_linking_hits@1": results["entity_linking"]["hits@1"],
            "retrieval_ndcg@10": results["retrieval"]["ndcg@10"],
            "qa_faithfulness": results["qa"]["avg_faithfulness"],
            "qa_relevance": results["qa"]["avg_relevance"],
        }
|
|
```
|
|
|
|
### Statistical Significance Testing
|
|
|
|
```python
|
|
from scipy import stats
|
|
|
|
def significance_test(
    scores_a: List[float],
    scores_b: List[float],
    test_type: str = "paired_t",
    alpha: float = 0.05,
) -> dict:
    """Test statistical significance between two model scores.

    Args:
        scores_a: Per-sample scores for model A.
        scores_b: Per-sample scores for model B (paired with scores_a).
        test_type: "paired_t", "wilcoxon", or "bootstrap".
        alpha: Significance level for the parametric/rank tests.

    Returns:
        For "paired_t"/"wilcoxon": statistic, p-value, significance at
        alpha. For "bootstrap": mean difference with a 95% percentile CI;
        significant when the CI excludes zero.

    Raises:
        ValueError: If test_type is not one of the supported tests.
    """

    if test_type == "paired_t":
        statistic, p_value = stats.ttest_rel(scores_a, scores_b)
    elif test_type == "wilcoxon":
        statistic, p_value = stats.wilcoxon(scores_a, scores_b)
    elif test_type == "bootstrap":
        # Bootstrap 95% CI of the mean paired difference.
        diff = np.array(scores_a) - np.array(scores_b)
        bootstrap_diffs = []
        for _ in range(10000):
            sample = np.random.choice(diff, size=len(diff), replace=True)
            bootstrap_diffs.append(np.mean(sample))

        ci_lower = np.percentile(bootstrap_diffs, 2.5)
        ci_upper = np.percentile(bootstrap_diffs, 97.5)

        return {
            "mean_difference": np.mean(diff),
            "ci_95_lower": ci_lower,
            "ci_95_upper": ci_upper,
            "significant": not (ci_lower <= 0 <= ci_upper),
        }
    else:
        # BUG FIX: an unknown test_type previously fell through to the
        # final return and raised UnboundLocalError on `statistic`;
        # fail fast with a clear error instead.
        raise ValueError(f"Unknown test_type: {test_type!r}")

    return {
        "test": test_type,
        "statistic": statistic,
        "p_value": p_value,
        "significant": p_value < alpha,
        "alpha": alpha,
    }
|
|
```
|
|
|
|
## 8. Performance Benchmarks
|
|
|
|
### Target Metrics
|
|
|
|
| Task | Metric | Target | State-of-Art |
|
|
|------|--------|--------|--------------|
|
|
| **NER (strict)** | F1 | ≥0.85 | 0.92 (CoNLL) |
|
|
| **NER (partial)** | F1 | ≥0.90 | 0.95 |
|
|
| **Type Classification** | Accuracy | ≥0.80 | 0.85 |
|
|
| **Type Classification** | Macro-F1 | ≥0.75 | 0.80 |
|
|
| **Entity Linking** | Hits@1 | ≥0.70 | 0.75 |
|
|
| **Entity Linking** | MRR | ≥0.75 | 0.82 |
|
|
| **NIL Detection** | F1 | ≥0.65 | 0.72 |
|
|
| **Retrieval** | NDCG@10 | ≥0.60 | 0.68 |
|
|
| **Retrieval** | MRR | ≥0.55 | 0.62 |
|
|
| **QA Faithfulness** | Score | ≥0.80 | 0.85 |
|
|
| **QA Relevance** | Score | ≥0.75 | 0.82 |
|
|
|
|
### Baseline Models
|
|
|
|
```python
|
|
# Reference baselines per pipeline task, mapping a short baseline name to
# the concrete model/strategy identifier used when running comparisons.
BASELINE_MODELS = {
    "ner": {
        "spacy_nl": "nl_core_news_lg",
        "spacy_en": "en_core_web_trf",
        "regex_only": "pattern_based",
    },
    "type_classification": {
        "keyword_based": "rule_based_classifier",
        "zero_shot": "bart-large-mnli",
    },
    "entity_linking": {
        "exact_match": "string_matching",
        "fuzzy": "rapidfuzz_based",
        "wikidata_api": "wikidata_search",
    },
    "retrieval": {
        "bm25": "elasticsearch_bm25",
        "dense": "all-MiniLM-L6-v2",
    },
}
|
|
```
|
|
|
|
## See Also
|
|
|
|
- [02-dspy-signatures.md](./02-dspy-signatures.md) - DSPy module definitions
|
|
- [04-entity-extraction.md](./04-entity-extraction.md) - NER patterns
|
|
- [05-entity-linking.md](./05-entity-linking.md) - Entity linking strategies
|
|
- [06-retrieval-patterns.md](./06-retrieval-patterns.md) - Retrieval strategies
|
|
- [AGENTS.md](../../AGENTS.md) - Project conventions and rules
|