31 KiB
31 KiB
Evaluation Framework for Heritage Custodian RAG
Overview
This document defines evaluation metrics, benchmarks, and gold standard datasets for the Heritage Custodian RAG pipeline. Evaluation covers all pipeline stages: entity extraction, type classification, entity linking, retrieval, and end-to-end question answering.
Evaluation Architecture
┌─────────────────────────────────────────────────────────────────────┐
│ Evaluation Framework │
├─────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Task │ │ Gold │ │ Metrics │ │
│ │ Modules │ │ Standard │ │ Suite │ │
│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Evaluation Runner │ │
│ │ • Batch evaluation across datasets │ │
│ │ • Stratified sampling (by type, country, tier) │ │
│ │ • Statistical significance testing │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Results Dashboard │ │
│ │ • Per-task metrics breakdown │ │
│ │ • Error analysis and confusion matrices │ │
│ │ • Performance over time / model versions │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────┘
1. Entity Extraction Metrics
Token-Level Metrics
from typing import List, Tuple
from collections import defaultdict
def token_level_metrics(
    pred_spans: List[Tuple[int, int, str]],  # (start, end, type)
    gold_spans: List[Tuple[int, int, str]],
    text: str,
) -> dict:
    """Compute token-level precision, recall, F1.

    Tokens are whitespace-delimited; a token receives an entity label when
    its character span overlaps the entity's character span.

    Args:
        pred_spans: Predicted entity spans with types
        gold_spans: Gold standard entity spans with types
        text: Original text for tokenization

    Returns:
        Dictionary with P, R, F1 (and support) per entity type plus a
        "macro_avg" entry averaged over the types present.
    """
    import re

    # True character offsets for each whitespace-delimited token.  Using
    # re.finditer (instead of accumulating len(token) + 1) stays correct
    # when tokens are separated by runs of spaces, tabs, or newlines.
    token_offsets = [(m.start(), m.end()) for m in re.finditer(r"\S+", text)]

    def spans_to_tokens(spans: List[Tuple[int, int, str]]) -> dict:
        """Map token indices to entity types via span overlap."""
        token_labels = {}
        for idx, (tok_start, tok_end) in enumerate(token_offsets):
            for start, end, etype in spans:
                # Half-open interval intersection: the token overlaps the
                # entity span (not just "token starts inside the span").
                if tok_start < end and tok_end > start:
                    token_labels[idx] = etype
                    break  # first matching span wins
        return token_labels

    pred_tokens = spans_to_tokens(pred_spans)
    gold_tokens = spans_to_tokens(gold_spans)

    # Compute metrics per type.
    all_types = set(pred_tokens.values()) | set(gold_tokens.values())
    metrics = {}
    for etype in all_types:
        pred_set = {k for k, v in pred_tokens.items() if v == etype}
        gold_set = {k for k, v in gold_tokens.items() if v == etype}
        tp = len(pred_set & gold_set)
        fp = len(pred_set - gold_set)
        fn = len(gold_set - pred_set)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        metrics[etype] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": len(gold_set),
        }

    # Macro average over types (guard against the no-entity case).
    macro_p = sum(m["precision"] for m in metrics.values()) / len(metrics) if metrics else 0.0
    macro_r = sum(m["recall"] for m in metrics.values()) / len(metrics) if metrics else 0.0
    macro_f1 = sum(m["f1"] for m in metrics.values()) / len(metrics) if metrics else 0.0
    metrics["macro_avg"] = {"precision": macro_p, "recall": macro_r, "f1": macro_f1}
    return metrics
Entity-Level Metrics (Strict & Partial)
def entity_level_metrics(
    pred_entities: List[dict],  # [{text, type, start, end}]
    gold_entities: List[dict],
    match_type: str = "strict",  # "strict" or "partial"
) -> dict:
    """Compute entity-level precision, recall, F1.

    Strict: Exact span and type match
    Partial: Overlapping span and type match
    """

    def is_match(p: dict, g: dict) -> bool:
        # Candidate pairs must agree on type regardless of match mode.
        if p["type"] != g["type"]:
            return False
        if match_type == "strict":
            return (p["start"], p["end"]) == (g["start"], g["end"])
        # Partial mode: the two half-open spans intersect.
        return p["start"] < g["end"] and g["start"] < p["end"]

    used_gold = set()
    used_pred = set()
    # Greedy one-to-one alignment: each prediction claims the first
    # still-unclaimed gold entity it matches.
    for p_idx, pred in enumerate(pred_entities):
        for g_idx, gold in enumerate(gold_entities):
            if g_idx in used_gold or not is_match(pred, gold):
                continue
            used_pred.add(p_idx)
            used_gold.add(g_idx)
            break

    tp = len(used_gold)
    fp = len(pred_entities) - len(used_pred)
    fn = len(gold_entities) - len(used_gold)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if precision + recall else 0.0
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
        "match_type": match_type,
    }
Entity Type Distribution Analysis
def type_distribution_analysis(
    pred_entities: List[dict],
    gold_entities: List[dict],
) -> dict:
    """Analyze entity type prediction distribution."""
    from collections import Counter

    predicted = Counter(entity["type"] for entity in pred_entities)
    expected = Counter(entity["type"] for entity in gold_entities)

    analysis = {}
    for etype in predicted.keys() | expected.keys():
        n_pred = predicted[etype]  # Counter returns 0 for missing keys
        n_gold = expected[etype]
        analysis[etype] = {
            "predicted": n_pred,
            "gold": n_gold,
            "over_prediction": n_pred - n_gold,
            # Unbounded when the type never occurs in gold.
            "ratio": n_pred / n_gold if n_gold > 0 else float('inf'),
        }
    return analysis
2. Type Classification Metrics
Multi-Class Classification Metrics
import numpy as np
from sklearn.metrics import (
accuracy_score,
precision_recall_fscore_support,
confusion_matrix,
classification_report,
)
def type_classification_metrics(
    y_true: List[str],  # Gold labels
    y_pred: List[str],  # Predicted labels
    labels: List[str] = None,  # All possible labels
) -> dict:
    """Compute classification metrics for GLAMORCUBESFIXPHDNT types."""
    if labels is None:
        labels = list("GLAMORCUBESFIXPHDNT")  # 19 types

    # Overall accuracy plus per-class P/R/F1/support and the confusion
    # matrix, all restricted to the fixed label inventory.
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, zero_division=0
    )
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Per-class breakdown keyed by label, as plain Python scalars.
    per_class = {
        label: {
            "precision": float(precision[i]),
            "recall": float(recall[i]),
            "f1": float(f1[i]),
            "support": int(support[i]),
        }
        for i, label in enumerate(labels)
    }

    return {
        "accuracy": accuracy,
        "macro_precision": float(np.mean(precision)),
        "macro_recall": float(np.mean(recall)),
        "macro_f1": float(np.mean(f1)),
        "weighted_f1": float(np.average(f1, weights=support)),
        "per_class": per_class,
        "confusion_matrix": cm.tolist(),
        "labels": labels,
    }
Type Confusion Analysis
def type_confusion_analysis(
    y_true: List[str],
    y_pred: List[str],
    top_k: int = 10,
) -> dict:
    """Identify most common type confusions."""
    from collections import Counter

    # Count (gold, predicted) pairs for mismatched labels only.
    confusions = Counter(
        (gold, pred)
        for gold, pred in zip(y_true, y_pred)
        if gold != pred
    )

    return {
        "top_confusions": [
            {"true_type": gold, "predicted_type": pred, "count": count}
            for (gold, pred), count in confusions.most_common(top_k)
        ],
        "total_errors": sum(confusions.values()),
        "unique_confusion_pairs": len(confusions),
    }
Hierarchical Type Metrics
def hierarchical_type_metrics(
    y_true: List[str],
    y_pred: List[str],
) -> dict:
    """Metrics accounting for type hierarchy relationships.

    GLAMORCUBESFIXPHDNT types can be grouped into higher-level categories:
    - CULTURAL: G, L, A, M (core GLAM)
    - INSTITUTIONAL: O, R, E
    - COMMUNITY: S, I, N
    - SPECIALIZED: B, H, T, F
    - DIGITAL: D
    - PRIVATE: C, P
    - UNKNOWN: U, X
    """
    TYPE_HIERARCHY = {
        "CULTURAL": ["G", "L", "A", "M"],
        "INSTITUTIONAL": ["O", "R", "E"],
        "COMMUNITY": ["S", "I", "N"],
        "SPECIALIZED": ["B", "H", "T", "F"],
        "DIGITAL": ["D"],
        "PRIVATE": ["C", "P"],
        "UNKNOWN": ["U", "X"],
    }

    # Invert the hierarchy: leaf type -> category name.
    type_to_category = {
        leaf: category
        for category, leaves in TYPE_HIERARCHY.items()
        for leaf in leaves
    }

    def lift(labels: List[str]) -> List[str]:
        # Unrecognised types fall back to the UNKNOWN category.
        return [type_to_category.get(label, "UNKNOWN") for label in labels]

    exact_accuracy = accuracy_score(y_true, y_pred)
    category_accuracy = accuracy_score(lift(y_true), lift(y_pred))

    return {
        "exact_accuracy": exact_accuracy,
        "category_accuracy": category_accuracy,
        # Credit gained when same-category near misses count as correct.
        "hierarchy_gap": category_accuracy - exact_accuracy,
    }
3. Entity Linking Metrics
Linking Accuracy (Hits@K)
def linking_accuracy(
    predictions: List[dict],  # [{mention, candidates: [{kb_id, score}]}]
    gold: List[dict],  # [{mention, gold_kb_id}]
    k_values: Tuple[int, ...] = (1, 5, 10),  # tuple: no mutable default
) -> dict:
    """Compute Hits@K for entity linking.

    Args:
        predictions: One entry per mention with ranked candidate KB ids.
        gold: Gold KB id per mention.
        k_values: Cutoffs at which to report Hits@K.

    Returns:
        {"hits@k": fraction of gold mentions whose gold id appears in the
        top-k candidates}.  The denominator is the number of gold entries,
        so predicted mentions absent from gold are ignored and gold
        mentions absent from predictions count as misses.
    """
    results = {f"hits@{k}": 0.0 for k in k_values}
    # Index gold by mention string for O(1) lookup.
    gold_lookup = {g["mention"]: g["gold_kb_id"] for g in gold}

    for pred in predictions:
        gold_id = gold_lookup.get(pred["mention"])
        if gold_id is None:
            # Mention not in the gold set: not scored.
            continue
        candidate_ids = [c["kb_id"] for c in pred.get("candidates", [])]
        for k in k_values:
            if gold_id in candidate_ids[:k]:
                results[f"hits@{k}"] += 1

    # Normalize by the number of gold mentions (skip when gold is empty,
    # leaving all counts at 0.0).
    n = len(gold)
    if n > 0:
        for k in k_values:
            results[f"hits@{k}"] /= n
    return results
Mean Reciprocal Rank (MRR)
def mean_reciprocal_rank(
    predictions: List[dict],
    gold: List[dict],
) -> float:
    """Compute MRR for entity linking."""
    gold_lookup = {entry["mention"]: entry["gold_kb_id"] for entry in gold}

    reciprocal_ranks = []
    for pred in predictions:
        target = gold_lookup.get(pred["mention"])
        if target is None:
            # No gold id for this mention: excluded from the average.
            continue
        ranked_ids = [cand["kb_id"] for cand in pred.get("candidates", [])]
        try:
            rank = ranked_ids.index(target) + 1
        except ValueError:
            # Gold id never retrieved: contributes zero.
            reciprocal_ranks.append(0.0)
        else:
            reciprocal_ranks.append(1.0 / rank)

    if not reciprocal_ranks:
        return 0.0
    return sum(reciprocal_ranks) / len(reciprocal_ranks)
NIL Detection Metrics
def nil_detection_metrics(
    predictions: List[dict],  # [{mention, is_nil_pred}]
    gold: List[dict],  # [{mention, is_nil_gold}]
) -> dict:
    """Metrics for NIL entity detection.

    Predictions and gold are aligned by mention string rather than by list
    position (the original zipped the two lists, silently producing wrong
    counts whenever the orders differed).  Gold mentions without a
    prediction are skipped.
    """
    pred_lookup = {p["mention"]: p["is_nil_pred"] for p in predictions}

    tp = fp = fn = tn = 0
    for g in gold:
        if g["mention"] not in pred_lookup:
            continue  # no prediction for this mention
        is_nil_true = g["is_nil_gold"]
        is_nil_pred = pred_lookup[g["mention"]]
        if is_nil_true and is_nil_pred:
            tp += 1
        elif not is_nil_true and is_nil_pred:
            fp += 1
        elif is_nil_true and not is_nil_pred:
            fn += 1
        else:
            tn += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return {
        "nil_precision": precision,
        "nil_recall": recall,
        "nil_f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
        "true_negatives": tn,
    }
Cross-KB Linking Consistency
def cross_kb_consistency(
    linked_entities: List[dict],  # [{wikidata_id, viaf_id, isil_code}]
) -> dict:
    """Check consistency of cross-KB identifier linking."""
    consistent = inconsistent = partial = 0

    for record in linked_entities:
        wikidata_id = record.get("wikidata_id")
        viaf_id = record.get("viaf_id")
        isil_code = record.get("isil_code")

        if wikidata_id and viaf_id:
            # Cross-check: the VIAF id recorded on the Wikidata item must
            # agree with the independently linked VIAF id.
            if get_wikidata_viaf(wikidata_id) == viaf_id:
                consistent += 1
            else:
                inconsistent += 1
        elif wikidata_id or viaf_id or isil_code:
            # Only some identifiers present: coverage, not a conflict.
            partial += 1

    checked = consistent + inconsistent
    total = checked + partial
    return {
        "consistent": consistent,
        "inconsistent": inconsistent,
        "partial": partial,
        "consistency_rate": consistent / checked if checked > 0 else 1.0,
        "coverage_rate": (consistent + partial) / total if total > 0 else 0.0,
    }
4. Retrieval Metrics
Normalized Discounted Cumulative Gain (NDCG)
import numpy as np
def ndcg_at_k(
    relevance_scores: List[float],  # Graded relevance (0-3)
    k: int = 10,
) -> float:
    """Compute NDCG@K for retrieval results."""
    top = np.asarray(relevance_scores[:k], dtype=float)
    # Log-rank discounts: log2(rank + 1) for ranks 1..len(top).
    discounts = np.log2(np.arange(2, top.size + 2))

    # DCG with exponential gain: (2^rel - 1) / discount.
    dcg = np.sum((np.power(2.0, top) - 1.0) / discounts)

    # Ideal ordering: the same scores sorted best-first, truncated at k.
    best = np.sort(np.asarray(relevance_scores, dtype=float))[::-1][:k]
    idcg = np.sum((np.power(2.0, best) - 1.0) / discounts[:best.size])

    return dcg / idcg if idcg > 0 else 0.0
def compute_retrieval_metrics(
    queries: List[dict],  # [{query, results: [{doc_id, score}], gold_relevant: [doc_ids]}]
    k_values: Tuple[int, ...] = (5, 10, 20),  # tuple: no mutable default
) -> dict:
    """Compute comprehensive retrieval metrics.

    Each query's ranked results are reduced to binary relevance against
    its gold set, then NDCG@K, Precision@K, Recall@K, and MRR are
    averaged over all queries.

    Args:
        queries: Per-query ranked results plus gold relevant doc ids.
        k_values: Cutoffs for the @K metrics.

    Returns:
        Mapping of metric name to its mean over queries; 0.0 (not NaN)
        when the query list is empty.
    """
    per_query = {f"ndcg@{k}": [] for k in k_values}
    per_query.update({f"precision@{k}": [] for k in k_values})
    per_query.update({f"recall@{k}": [] for k in k_values})
    per_query["mrr"] = []

    for query in queries:
        result_ids = [r["doc_id"] for r in query["results"]]
        gold_set = set(query["gold_relevant"])
        # Binary relevance for each ranked result.
        relevance = [1 if rid in gold_set else 0 for rid in result_ids]

        # MRR: reciprocal rank of the first relevant hit (0 when none).
        for i, rel in enumerate(relevance):
            if rel == 1:
                per_query["mrr"].append(1.0 / (i + 1))
                break
        else:
            per_query["mrr"].append(0.0)

        for k in k_values:
            per_query[f"ndcg@{k}"].append(ndcg_at_k(relevance, k))
            retrieved_relevant = sum(relevance[:k])
            # Precision is measured against the cutoff k, even when fewer
            # than k results were returned.
            per_query[f"precision@{k}"].append(retrieved_relevant / k)
            # Recall is 1.0 by convention when there is nothing to find.
            if gold_set:
                per_query[f"recall@{k}"].append(retrieved_relevant / len(gold_set))
            else:
                per_query[f"recall@{k}"].append(1.0)

    # Average across queries; avoid np.mean([]) -> NaN on empty input.
    return {
        metric: float(np.mean(values)) if values else 0.0
        for metric, values in per_query.items()
    }
Retrieval Source Analysis
def retrieval_source_analysis(
    query_results: List[dict],  # [{results: [{doc_id, source, score}]}]
) -> dict:
    """Analyze contribution of different retrieval sources."""
    source_counts = defaultdict(int)
    source_at_top1 = defaultdict(int)
    source_at_top5 = defaultdict(int)

    for query in query_results:
        # Only the top 10 results per query are attributed.
        for rank, result in enumerate(query["results"][:10]):
            origin = result["source"]
            source_counts[origin] += 1
            if rank == 0:
                source_at_top1[origin] += 1
            if rank < 5:
                source_at_top5[origin] += 1

    n_queries = len(query_results)
    return {
        "source_distribution": dict(source_counts),
        # Fraction of queries whose #1 result came from each source.
        "top1_by_source": {src: hits / n_queries for src, hits in source_at_top1.items()},
        # Share of all top-5 slots occupied by each source.
        "top5_by_source": {src: hits / (n_queries * 5) for src, hits in source_at_top5.items()},
    }
5. End-to-End RAG Metrics
Answer Quality Metrics
class RAGEvaluator:
    """End-to-end RAG evaluation.

    Combines lexical overlap (ROUGE), embedding similarity, a faithfulness
    heuristic, relevance, and an optional LLM-as-judge score.
    """

    # Lazily-loaded sentence encoder, shared across instances so the model
    # is not reloaded on every similarity call.
    _embedder = None

    def __init__(self, llm_judge=None):
        # BUG FIX: the original called self._default_judge(), which was
        # never defined, so constructing the evaluator always raised
        # AttributeError.  The judge is now genuinely optional.
        self.llm_judge = llm_judge if llm_judge is not None else self._default_judge()

    def _default_judge(self):
        """Return the default judge (None: judge scoring is skipped)."""
        return None

    def evaluate_answer(
        self,
        question: str,
        generated_answer: str,
        gold_answer: str,
        retrieved_context: str,
    ) -> dict:
        """Evaluate a single RAG response."""
        metrics = {}
        # 1. Lexical overlap
        metrics["rouge"] = self._compute_rouge(generated_answer, gold_answer)
        # 2. Semantic similarity
        metrics["semantic_similarity"] = self._semantic_similarity(
            generated_answer, gold_answer
        )
        # 3. Faithfulness (answer grounded in context)
        metrics["faithfulness"] = self._compute_faithfulness(
            generated_answer, retrieved_context
        )
        # 4. Relevance (answer addresses question)
        metrics["relevance"] = self._compute_relevance(
            question, generated_answer
        )
        # 5. LLM-as-judge (if available)
        if self.llm_judge:
            metrics["llm_judge"] = self._llm_judge_score(
                question, generated_answer, gold_answer
            )
        return metrics

    def _llm_judge_score(self, question: str, generated: str, gold: str):
        """Delegate scoring to the injected judge.

        NOTE(review): assumes the judge is a callable taking
        (question, generated, gold) — confirm against the judge actually
        injected; this method was referenced but undefined before.
        """
        return self.llm_judge(question, generated, gold)

    def _compute_rouge(self, generated: str, reference: str) -> dict:
        """ROUGE-1/2/L F-measures of the generated answer vs the gold."""
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
        scores = scorer.score(reference, generated)
        return {
            "rouge1": scores["rouge1"].fmeasure,
            "rouge2": scores["rouge2"].fmeasure,
            "rougeL": scores["rougeL"].fmeasure,
        }

    def _get_embedder(self):
        """Load the sentence encoder once and cache it on the class."""
        if RAGEvaluator._embedder is None:
            from sentence_transformers import SentenceTransformer
            RAGEvaluator._embedder = SentenceTransformer(
                "sentence-transformers/all-MiniLM-L6-v2"
            )
        return RAGEvaluator._embedder

    def _semantic_similarity(self, text1: str, text2: str) -> float:
        """Cosine similarity between sentence embeddings of two texts."""
        from sentence_transformers import util
        model = self._get_embedder()
        emb1 = model.encode(text1)
        emb2 = model.encode(text2)
        return float(util.cos_sim(emb1, emb2)[0][0])

    def _compute_faithfulness(self, answer: str, context: str) -> float:
        """Check if answer claims are supported by context.

        Simplified heuristic: a sentence counts as supported when any of
        its words longer than 4 characters appears in the context.
        """
        answer_sents = answer.split(". ")
        supported = 0
        for sent in answer_sents:
            if any(word in context.lower() for word in sent.lower().split() if len(word) > 4):
                supported += 1
        return supported / len(answer_sents) if answer_sents else 0.0

    def _compute_relevance(self, question: str, answer: str) -> float:
        """Check if answer addresses the question (embedding similarity)."""
        return self._semantic_similarity(question, answer)
Factual Accuracy
def factual_accuracy(
    generated_claims: List[dict],  # [{claim, verifiable}]
    gold_facts: List[dict],  # [{fact, source}]
) -> dict:
    """Evaluate factual accuracy of generated answers."""
    # Lower-cased gold facts for case-insensitive substring matching.
    fact_texts = [fact["fact"].lower() for fact in gold_facts]

    correct = incorrect = 0
    n_verifiable = 0
    for claim in generated_claims:
        if not claim["verifiable"]:
            continue
        n_verifiable += 1
        claim_text = claim["claim"].lower()
        # A claim is "correct" when it appears verbatim in some gold fact.
        if any(claim_text in fact for fact in fact_texts):
            correct += 1
        else:
            incorrect += 1

    checked = correct + incorrect
    return {
        "factual_accuracy": correct / checked if checked > 0 else 0.0,
        "correct_claims": correct,
        "incorrect_claims": incorrect,
        "unverifiable_claims": len(generated_claims) - n_verifiable,
        "total_claims": len(generated_claims),
    }
6. Gold Standard Datasets
Dataset Requirements
# Gold Standard Dataset Schema
gold_standard_dataset:
metadata:
name: "Heritage Custodian Evaluation v1.0"
created_date: "2025-12-01"
languages: ["nl", "en", "de", "fr"]
domains: ["museums", "archives", "libraries"]
ner_annotations:
format: "BRAT/IOB2"
entity_types:
- "GRP.HER.MUS"
- "GRP.HER.ARC"
- "GRP.HER.LIB"
- "GRP.HER.SOC"
- "TOP"
- "TMP"
- "IDENTIFIER"
samples_per_type: 100
inter_annotator_agreement: ">0.85 Cohen's Kappa"
type_classification:
format: "TSV"
columns: ["text", "gold_type", "secondary_types"]
samples_per_type: 50
balanced: true
entity_linking:
format: "JSON"
fields: ["mention", "context", "gold_wikidata", "gold_viaf", "gold_isil"]
nil_ratio: 0.15
samples: 500
retrieval:
format: "TREC"
queries: 200
judgments_per_query: 50
graded_relevance: [0, 1, 2, 3]
qa:
format: "JSON"
fields: ["question", "gold_answer", "supporting_facts", "answer_type"]
question_types:
- "factual"
- "comparative"
- "relationship"
- "temporal"
samples: 300
Dataset Splits
# Fractional allocation of gold-standard samples to splits (sums to 1.0).
DATASET_SPLITS = {
    "train": 0.70,
    "dev": 0.15,
    "test": 0.15,
}
# Dimensions used for stratified sampling so each split mirrors the
# overall distribution of the dataset.
STRATIFICATION_FACTORS = [
    "custodian_type",  # GLAMORCUBESFIXPHDNT distribution
    "country",  # Geographic coverage
    "data_tier",  # Quality tier (1-4)
    "source_type",  # Conversation, web, CSV
]
Sample Gold Standard Entry
{
"id": "eval_001",
"text": "Het Rijksmuseum Amsterdam (ISIL: NL-AmRM) werd opgericht in 1800 en beheert de grootste collectie Nederlandse kunst.",
"ner_annotations": [
{"text": "Rijksmuseum Amsterdam", "type": "GRP.HER.MUS", "start": 4, "end": 25},
{"text": "NL-AmRM", "type": "IDENTIFIER", "start": 33, "end": 40},
{"text": "1800", "type": "TMP", "start": 58, "end": 62},
{"text": "Nederlandse", "type": "TOP", "start": 93, "end": 104}
],
"type_classification": {
"primary_type": "M",
"secondary_types": [],
"rationale": "Art museum with cultural heritage collections"
},
"entity_linking": {
"Rijksmuseum Amsterdam": {
"wikidata_id": "Q190804",
"viaf_id": "148691498",
"isil_code": "NL-AmRM",
"ghcid": "NL-NH-AMS-M-RM"
}
},
"qa_pairs": [
{
"question": "Wanneer is het Rijksmuseum opgericht?",
"answer": "Het Rijksmuseum werd opgericht in 1800.",
"answer_type": "factual",
"supporting_facts": ["opgericht in 1800"]
}
]
}
7. Evaluation Runner
Batch Evaluation Pipeline
class EvaluationRunner:
    """Run comprehensive evaluation across all tasks."""

    def __init__(self, config: dict):
        # Runner-wide configuration plus a scratch area for metrics.
        self.config = config
        self.metrics = {}

    def run_full_evaluation(
        self,
        model,
        gold_dataset: List[dict],
    ) -> dict:
        """Run all evaluation tasks."""
        results = {}

        # 1. NER Evaluation
        print("Evaluating NER...")
        results["ner"] = self._evaluate_ner(model, gold_dataset)

        # 2. Type Classification Evaluation
        print("Evaluating Type Classification...")
        results["type_classification"] = self._evaluate_type_classification(
            model, gold_dataset
        )

        # 3. Entity Linking Evaluation
        print("Evaluating Entity Linking...")
        results["entity_linking"] = self._evaluate_entity_linking(model, gold_dataset)

        # 4. Retrieval Evaluation
        print("Evaluating Retrieval...")
        results["retrieval"] = self._evaluate_retrieval(model, gold_dataset)

        # 5. End-to-End QA Evaluation
        print("Evaluating End-to-End QA...")
        results["qa"] = self._evaluate_qa(model, gold_dataset)

        # Cross-task headline numbers.
        results["summary"] = self._compute_summary(results)
        return results

    def _evaluate_ner(self, model, dataset) -> dict:
        """Pool NER predictions across the dataset and score them."""
        predicted = []
        expected = []
        for sample in dataset:
            predicted.extend(model.extract_entities(sample["text"]))
            expected.extend(sample["ner_annotations"])
        return {
            "strict": entity_level_metrics(predicted, expected, "strict"),
            "partial": entity_level_metrics(predicted, expected, "partial"),
            "type_distribution": type_distribution_analysis(predicted, expected),
        }

    def _compute_summary(self, results: dict) -> dict:
        """Compute overall summary metrics."""
        ner = results["ner"]
        return {
            "ner_f1_strict": ner["strict"]["f1"],
            "ner_f1_partial": ner["partial"]["f1"],
            "type_classification_accuracy": results["type_classification"]["accuracy"],
            "entity_linking_mrr": results["entity_linking"]["mrr"],
            "entity_linking_hits@1": results["entity_linking"]["hits@1"],
            "retrieval_ndcg@10": results["retrieval"]["ndcg@10"],
            "qa_faithfulness": results["qa"]["avg_faithfulness"],
            "qa_relevance": results["qa"]["avg_relevance"],
        }
Statistical Significance Testing
from scipy import stats
def significance_test(
    scores_a: List[float],
    scores_b: List[float],
    test_type: str = "paired_t",
    alpha: float = 0.05,
) -> dict:
    """Test statistical significance between two model scores.

    Args:
        scores_a: Per-item scores for system A.
        scores_b: Per-item scores for system B (paired with scores_a).
        test_type: "paired_t", "wilcoxon", or "bootstrap".
        alpha: Significance level for the p-value based tests.

    Returns:
        For "paired_t"/"wilcoxon": statistic, p-value, and significance
        flag.  For "bootstrap": mean difference with a 95% percentile
        confidence interval.

    Raises:
        ValueError: If test_type is not one of the supported tests.
    """
    if test_type == "paired_t":
        statistic, p_value = stats.ttest_rel(scores_a, scores_b)
    elif test_type == "wilcoxon":
        statistic, p_value = stats.wilcoxon(scores_a, scores_b)
    elif test_type == "bootstrap":
        # Percentile bootstrap over the paired score differences.
        diff = np.array(scores_a) - np.array(scores_b)
        bootstrap_diffs = []
        for _ in range(10000):
            sample = np.random.choice(diff, size=len(diff), replace=True)
            bootstrap_diffs.append(np.mean(sample))
        ci_lower = np.percentile(bootstrap_diffs, 2.5)
        ci_upper = np.percentile(bootstrap_diffs, 97.5)
        return {
            "mean_difference": np.mean(diff),
            "ci_95_lower": ci_lower,
            "ci_95_upper": ci_upper,
            # Significant when the 95% CI excludes zero.
            "significant": not (ci_lower <= 0 <= ci_upper),
        }
    else:
        # BUG FIX: an unknown test_type previously fell through to the
        # return below with `statistic` unbound, raising NameError.
        raise ValueError(f"Unknown test_type: {test_type!r}")
    return {
        "test": test_type,
        "statistic": statistic,
        "p_value": p_value,
        "significant": p_value < alpha,
        "alpha": alpha,
    }
8. Performance Benchmarks
Target Metrics
| Task | Metric | Target | State-of-Art |
|---|---|---|---|
| NER (strict) | F1 | ≥0.85 | 0.92 (CoNLL) |
| NER (partial) | F1 | ≥0.90 | 0.95 |
| Type Classification | Accuracy | ≥0.80 | 0.85 |
| Type Classification | Macro-F1 | ≥0.75 | 0.80 |
| Entity Linking | Hits@1 | ≥0.70 | 0.75 |
| Entity Linking | MRR | ≥0.75 | 0.82 |
| NIL Detection | F1 | ≥0.65 | 0.72 |
| Retrieval | NDCG@10 | ≥0.60 | 0.68 |
| Retrieval | MRR | ≥0.55 | 0.62 |
| QA Faithfulness | Score | ≥0.80 | 0.85 |
| QA Relevance | Score | ≥0.75 | 0.82 |
Baseline Models
# Baseline systems to compare against, keyed by pipeline task.  Each value
# names the concrete model or strategy used as the baseline implementation.
BASELINE_MODELS = {
    "ner": {
        "spacy_nl": "nl_core_news_lg",  # Dutch spaCy pipeline
        "spacy_en": "en_core_web_trf",  # English transformer spaCy pipeline
        "regex_only": "pattern_based",  # hand-written extraction patterns
    },
    "type_classification": {
        "keyword_based": "rule_based_classifier",
        "zero_shot": "bart-large-mnli",  # zero-shot NLI classifier
    },
    "entity_linking": {
        "exact_match": "string_matching",
        "fuzzy": "rapidfuzz_based",  # fuzzy string similarity
        "wikidata_api": "wikidata_search",
    },
    "retrieval": {
        "bm25": "elasticsearch_bm25",  # lexical baseline
        "dense": "all-MiniLM-L6-v2",  # dense embedding baseline
    },
}
See Also
- 02-dspy-signatures.md - DSPy module definitions
- 04-entity-extraction.md - NER patterns
- 05-entity-linking.md - Entity linking strategies
- 06-retrieval-patterns.md - Retrieval strategies
- AGENTS.md - Project conventions and rules