# Evaluation Framework for Heritage Custodian RAG ## Overview This document defines evaluation metrics, benchmarks, and gold standard datasets for the Heritage Custodian RAG pipeline. Evaluation covers all pipeline stages: entity extraction, type classification, entity linking, retrieval, and end-to-end question answering. ## Evaluation Architecture ``` ┌─────────────────────────────────────────────────────────────────────┐ │ Evaluation Framework │ ├─────────────────────────────────────────────────────────────────────┤ │ │ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ │ Task │ │ Gold │ │ Metrics │ │ │ │ Modules │ │ Standard │ │ Suite │ │ │ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ │ │ │ │ │ │ ▼ ▼ ▼ │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ Evaluation Runner │ │ │ │ • Batch evaluation across datasets │ │ │ │ • Stratified sampling (by type, country, tier) │ │ │ │ • Statistical significance testing │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ Results Dashboard │ │ │ │ • Per-task metrics breakdown │ │ │ │ • Error analysis and confusion matrices │ │ │ │ • Performance over time / model versions │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ └─────────────────────────────────────────────────────────────────────┘ ``` ## 1. Entity Extraction Metrics ### Token-Level Metrics ```python from typing import List, Tuple from collections import defaultdict def token_level_metrics( pred_spans: List[Tuple[int, int, str]], # (start, end, type) gold_spans: List[Tuple[int, int, str]], text: str, ) -> dict: """Compute token-level precision, recall, F1. 
    Args:
        pred_spans: Predicted entity spans with types
        gold_spans: Gold standard entity spans with types
        text: Original text for tokenization

    Returns:
        Dictionary with P, R, F1 for each entity type and overall
    """
    # Tokenize and create token-to-span mapping
    tokens = text.split()

    def spans_to_tokens(spans: List[Tuple[int, int, str]]) -> dict:
        """Map spans to token indices with types."""
        token_labels = {}
        char_pos = 0
        for idx, token in enumerate(tokens):
            token_end = char_pos + len(token)
            for start, end, etype in spans:
                # Label the token if it overlaps the span at all. Checking only
                # `char_pos >= start and char_pos < end` would miss tokens that
                # begin before the span but extend into it.
                if char_pos < end and token_end > start:
                    token_labels[idx] = etype
            # NOTE(review): assumes tokens are separated by exactly one space;
            # runs of whitespace would desynchronize char offsets — confirm
            # inputs are single-spaced.
            char_pos = token_end + 1
        return token_labels

    pred_tokens = spans_to_tokens(pred_spans)
    gold_tokens = spans_to_tokens(gold_spans)

    # Compute metrics per type
    all_types = set(list(pred_tokens.values()) + list(gold_tokens.values()))
    metrics = {}
    for etype in all_types:
        pred_set = {k for k, v in pred_tokens.items() if v == etype}
        gold_set = {k for k, v in gold_tokens.items() if v == etype}

        tp = len(pred_set & gold_set)
        fp = len(pred_set - gold_set)
        fn = len(gold_set - pred_set)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics[etype] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": len(gold_set),
        }

    # Macro average
    macro_p = sum(m["precision"] for m in metrics.values()) / len(metrics) if metrics else 0.0
    macro_r = sum(m["recall"] for m in metrics.values()) / len(metrics) if metrics else 0.0
    macro_f1 = sum(m["f1"] for m in metrics.values()) / len(metrics) if metrics else 0.0

    metrics["macro_avg"] = {"precision": macro_p, "recall": macro_r, "f1": macro_f1}

    return metrics
```

### Entity-Level Metrics (Strict & Partial)

```python
def entity_level_metrics(
    pred_entities: List[dict],  # [{text, type, start, end}]
    gold_entities: List[dict],
    match_type: str = "strict",  # "strict" or "partial"
) -> dict:
    """Compute entity-level precision, recall, F1.
Strict: Exact span and type match Partial: Overlapping span and type match """ def spans_match(pred: dict, gold: dict, match_type: str) -> bool: # Type must always match if pred["type"] != gold["type"]: return False if match_type == "strict": return pred["start"] == gold["start"] and pred["end"] == gold["end"] else: # partial # Spans overlap return not (pred["end"] <= gold["start"] or pred["start"] >= gold["end"]) matched_gold = set() matched_pred = set() for i, pred in enumerate(pred_entities): for j, gold in enumerate(gold_entities): if j not in matched_gold and spans_match(pred, gold, match_type): matched_pred.add(i) matched_gold.add(j) break tp = len(matched_gold) fp = len(pred_entities) - len(matched_pred) fn = len(gold_entities) - len(matched_gold) precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 return { "precision": precision, "recall": recall, "f1": f1, "true_positives": tp, "false_positives": fp, "false_negatives": fn, "match_type": match_type, } ``` ### Entity Type Distribution Analysis ```python def type_distribution_analysis( pred_entities: List[dict], gold_entities: List[dict], ) -> dict: """Analyze entity type prediction distribution.""" from collections import Counter pred_types = Counter(e["type"] for e in pred_entities) gold_types = Counter(e["type"] for e in gold_entities) all_types = set(pred_types.keys()) | set(gold_types.keys()) analysis = {} for etype in all_types: pred_count = pred_types.get(etype, 0) gold_count = gold_types.get(etype, 0) analysis[etype] = { "predicted": pred_count, "gold": gold_count, "over_prediction": pred_count - gold_count, "ratio": pred_count / gold_count if gold_count > 0 else float('inf'), } return analysis ``` ## 2. 
Type Classification Metrics ### Multi-Class Classification Metrics ```python import numpy as np from sklearn.metrics import ( accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report, ) def type_classification_metrics( y_true: List[str], # Gold labels y_pred: List[str], # Predicted labels labels: List[str] = None, # All possible labels ) -> dict: """Compute classification metrics for GLAMORCUBESFIXPHDNT types.""" if labels is None: labels = list("GLAMORCUBESFIXPHDNT") # 19 types # Overall accuracy accuracy = accuracy_score(y_true, y_pred) # Per-class metrics precision, recall, f1, support = precision_recall_fscore_support( y_true, y_pred, labels=labels, zero_division=0 ) # Confusion matrix cm = confusion_matrix(y_true, y_pred, labels=labels) # Build results per_class = {} for i, label in enumerate(labels): per_class[label] = { "precision": float(precision[i]), "recall": float(recall[i]), "f1": float(f1[i]), "support": int(support[i]), } return { "accuracy": accuracy, "macro_precision": float(np.mean(precision)), "macro_recall": float(np.mean(recall)), "macro_f1": float(np.mean(f1)), "weighted_f1": float(np.average(f1, weights=support)), "per_class": per_class, "confusion_matrix": cm.tolist(), "labels": labels, } ``` ### Type Confusion Analysis ```python def type_confusion_analysis( y_true: List[str], y_pred: List[str], top_k: int = 10, ) -> dict: """Identify most common type confusions.""" from collections import Counter confusions = Counter() for true_label, pred_label in zip(y_true, y_pred): if true_label != pred_label: confusions[(true_label, pred_label)] += 1 top_confusions = confusions.most_common(top_k) return { "top_confusions": [ { "true_type": pair[0], "predicted_type": pair[1], "count": count, } for pair, count in top_confusions ], "total_errors": sum(confusions.values()), "unique_confusion_pairs": len(confusions), } ``` ### Hierarchical Type Metrics ```python def hierarchical_type_metrics( y_true: List[str], y_pred: 
List[str], ) -> dict: """Metrics accounting for type hierarchy relationships. GLAMORCUBESFIXPHDNT types can be grouped into higher-level categories: - CULTURAL: G, L, A, M (core GLAM) - INSTITUTIONAL: O, R, E - COMMUNITY: S, I, N - SPECIALIZED: B, H, T, F - DIGITAL: D - PRIVATE: C, P - UNKNOWN: U, X """ TYPE_HIERARCHY = { "CULTURAL": ["G", "L", "A", "M"], "INSTITUTIONAL": ["O", "R", "E"], "COMMUNITY": ["S", "I", "N"], "SPECIALIZED": ["B", "H", "T", "F"], "DIGITAL": ["D"], "PRIVATE": ["C", "P"], "UNKNOWN": ["U", "X"], } # Reverse mapping type_to_category = {} for category, types in TYPE_HIERARCHY.items(): for t in types: type_to_category[t] = category # Category-level predictions y_true_cat = [type_to_category.get(t, "UNKNOWN") for t in y_true] y_pred_cat = [type_to_category.get(t, "UNKNOWN") for t in y_pred] # Exact match exact_accuracy = accuracy_score(y_true, y_pred) # Category match (lenient) category_accuracy = accuracy_score(y_true_cat, y_pred_cat) return { "exact_accuracy": exact_accuracy, "category_accuracy": category_accuracy, "hierarchy_gap": category_accuracy - exact_accuracy, } ``` ## 3. 
Entity Linking Metrics ### Linking Accuracy (Hits@K) ```python def linking_accuracy( predictions: List[dict], # [{mention, candidates: [{kb_id, score}]}] gold: List[dict], # [{mention, gold_kb_id}] k_values: List[int] = [1, 5, 10], ) -> dict: """Compute Hits@K for entity linking.""" results = {f"hits@{k}": 0.0 for k in k_values} # Build gold lookup gold_lookup = {g["mention"]: g["gold_kb_id"] for g in gold} for pred in predictions: mention = pred["mention"] gold_id = gold_lookup.get(mention) if gold_id is None: continue candidates = pred.get("candidates", []) candidate_ids = [c["kb_id"] for c in candidates] for k in k_values: if gold_id in candidate_ids[:k]: results[f"hits@{k}"] += 1 # Normalize n = len(gold) for k in k_values: results[f"hits@{k}"] /= n if n > 0 else 1 return results ``` ### Mean Reciprocal Rank (MRR) ```python def mean_reciprocal_rank( predictions: List[dict], gold: List[dict], ) -> float: """Compute MRR for entity linking.""" gold_lookup = {g["mention"]: g["gold_kb_id"] for g in gold} reciprocal_ranks = [] for pred in predictions: mention = pred["mention"] gold_id = gold_lookup.get(mention) if gold_id is None: continue candidates = pred.get("candidates", []) candidate_ids = [c["kb_id"] for c in candidates] if gold_id in candidate_ids: rank = candidate_ids.index(gold_id) + 1 reciprocal_ranks.append(1.0 / rank) else: reciprocal_ranks.append(0.0) return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0 ``` ### NIL Detection Metrics ```python def nil_detection_metrics( predictions: List[dict], # [{mention, is_nil_pred}] gold: List[dict], # [{mention, is_nil_gold}] ) -> dict: """Metrics for NIL entity detection.""" y_true = [g["is_nil_gold"] for g in gold] y_pred = [p["is_nil_pred"] for p in predictions] tp = sum(1 for t, p in zip(y_true, y_pred) if t and p) fp = sum(1 for t, p in zip(y_true, y_pred) if not t and p) fn = sum(1 for t, p in zip(y_true, y_pred) if t and not p) tn = sum(1 for t, p in zip(y_true, y_pred) if not t 
and not p) precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 return { "nil_precision": precision, "nil_recall": recall, "nil_f1": f1, "true_positives": tp, "false_positives": fp, "false_negatives": fn, "true_negatives": tn, } ``` ### Cross-KB Linking Consistency ```python def cross_kb_consistency( linked_entities: List[dict], # [{wikidata_id, viaf_id, isil_code}] ) -> dict: """Check consistency of cross-KB identifier linking.""" consistent = 0 inconsistent = 0 partial = 0 for entity in linked_entities: wd = entity.get("wikidata_id") viaf = entity.get("viaf_id") isil = entity.get("isil_code") if wd and viaf: # Verify VIAF in Wikidata wd_viaf = get_wikidata_viaf(wd) if wd_viaf == viaf: consistent += 1 else: inconsistent += 1 elif wd or viaf or isil: partial += 1 total = consistent + inconsistent + partial return { "consistent": consistent, "inconsistent": inconsistent, "partial": partial, "consistency_rate": consistent / (consistent + inconsistent) if (consistent + inconsistent) > 0 else 1.0, "coverage_rate": (consistent + partial) / total if total > 0 else 0.0, } ``` ## 4. 
Retrieval Metrics ### Normalized Discounted Cumulative Gain (NDCG) ```python import numpy as np def ndcg_at_k( relevance_scores: List[float], # Graded relevance (0-3) k: int = 10, ) -> float: """Compute NDCG@K for retrieval results.""" # DCG relevance = np.array(relevance_scores[:k]) gains = 2**relevance - 1 discounts = np.log2(np.arange(2, len(relevance) + 2)) dcg = np.sum(gains / discounts) # Ideal DCG ideal_relevance = np.sort(relevance_scores)[::-1][:k] ideal_gains = 2**np.array(ideal_relevance) - 1 idcg = np.sum(ideal_gains / discounts[:len(ideal_relevance)]) return dcg / idcg if idcg > 0 else 0.0 def compute_retrieval_metrics( queries: List[dict], # [{query, results: [{doc_id, score}], gold_relevant: [doc_ids]}] k_values: List[int] = [5, 10, 20], ) -> dict: """Compute comprehensive retrieval metrics.""" metrics = {f"ndcg@{k}": [] for k in k_values} metrics.update({f"precision@{k}": [] for k in k_values}) metrics.update({f"recall@{k}": [] for k in k_values}) metrics["mrr"] = [] for query in queries: result_ids = [r["doc_id"] for r in query["results"]] gold_set = set(query["gold_relevant"]) # Binary relevance for each result relevance = [1 if rid in gold_set else 0 for rid in result_ids] # MRR for i, rel in enumerate(relevance): if rel == 1: metrics["mrr"].append(1.0 / (i + 1)) break else: metrics["mrr"].append(0.0) # Metrics at each K for k in k_values: # NDCG ndcg = ndcg_at_k(relevance, k) metrics[f"ndcg@{k}"].append(ndcg) # Precision retrieved_relevant = sum(relevance[:k]) metrics[f"precision@{k}"].append(retrieved_relevant / k) # Recall if gold_set: metrics[f"recall@{k}"].append(retrieved_relevant / len(gold_set)) else: metrics[f"recall@{k}"].append(1.0) # Average across queries return { metric: np.mean(values) for metric, values in metrics.items() } ``` ### Retrieval Source Analysis ```python def retrieval_source_analysis( query_results: List[dict], # [{results: [{doc_id, source, score}]}] ) -> dict: """Analyze contribution of different retrieval 
sources."""
    source_counts = defaultdict(int)
    source_at_top1 = defaultdict(int)
    source_at_top5 = defaultdict(int)

    for query in query_results:
        results = query["results"]
        for i, result in enumerate(results[:10]):
            source = result["source"]
            source_counts[source] += 1
            if i == 0:
                source_at_top1[source] += 1
            if i < 5:
                source_at_top5[source] += 1

    total_queries = len(query_results)

    return {
        "source_distribution": dict(source_counts),
        "top1_by_source": {s: c / total_queries for s, c in source_at_top1.items()},
        "top5_by_source": {s: c / (total_queries * 5) for s, c in source_at_top5.items()},
    }
```

## 5. End-to-End RAG Metrics

### Answer Quality Metrics

```python
class RAGEvaluator:
    """End-to-end RAG evaluation."""

    def __init__(self, llm_judge=None):
        # Store the (optional) judge directly. The previous
        # `llm_judge or self._default_judge()` called a method that is not
        # defined anywhere, so constructing the evaluator without a judge
        # raised AttributeError; `evaluate_answer` already guards on
        # `if self.llm_judge:`.
        self.llm_judge = llm_judge

    def evaluate_answer(
        self,
        question: str,
        generated_answer: str,
        gold_answer: str,
        retrieved_context: str,
    ) -> dict:
        """Evaluate a single RAG response."""
        metrics = {}

        # 1. Lexical overlap
        metrics["rouge"] = self._compute_rouge(generated_answer, gold_answer)

        # 2. Semantic similarity
        metrics["semantic_similarity"] = self._semantic_similarity(
            generated_answer, gold_answer
        )

        # 3. Faithfulness (answer grounded in context)
        metrics["faithfulness"] = self._compute_faithfulness(
            generated_answer, retrieved_context
        )

        # 4. Relevance (answer addresses question)
        metrics["relevance"] = self._compute_relevance(
            question, generated_answer
        )

        # 5.
LLM-as-judge (if available) if self.llm_judge: metrics["llm_judge"] = self._llm_judge_score( question, generated_answer, gold_answer ) return metrics def _compute_rouge(self, generated: str, reference: str) -> dict: from rouge_score import rouge_scorer scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL']) scores = scorer.score(reference, generated) return { "rouge1": scores["rouge1"].fmeasure, "rouge2": scores["rouge2"].fmeasure, "rougeL": scores["rougeL"].fmeasure, } def _semantic_similarity(self, text1: str, text2: str) -> float: from sentence_transformers import SentenceTransformer, util model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") emb1 = model.encode(text1) emb2 = model.encode(text2) return float(util.cos_sim(emb1, emb2)[0][0]) def _compute_faithfulness(self, answer: str, context: str) -> float: """Check if answer claims are supported by context.""" # Simplified: check sentence overlap answer_sents = answer.split(". ") supported = 0 for sent in answer_sents: if any(word in context.lower() for word in sent.lower().split() if len(word) > 4): supported += 1 return supported / len(answer_sents) if answer_sents else 0.0 def _compute_relevance(self, question: str, answer: str) -> float: """Check if answer addresses the question.""" return self._semantic_similarity(question, answer) ``` ### Factual Accuracy ```python def factual_accuracy( generated_claims: List[dict], # [{claim, verifiable}] gold_facts: List[dict], # [{fact, source}] ) -> dict: """Evaluate factual accuracy of generated answers.""" verifiable_claims = [c for c in generated_claims if c["verifiable"]] correct = 0 incorrect = 0 unverifiable = len(generated_claims) - len(verifiable_claims) for claim in verifiable_claims: if any(claim["claim"].lower() in fact["fact"].lower() for fact in gold_facts): correct += 1 else: incorrect += 1 total = correct + incorrect accuracy = correct / total if total > 0 else 0.0 return { "factual_accuracy": accuracy, "correct_claims": 
correct, "incorrect_claims": incorrect, "unverifiable_claims": unverifiable, "total_claims": len(generated_claims), } ``` ## 6. Gold Standard Datasets ### Dataset Requirements ```yaml # Gold Standard Dataset Schema gold_standard_dataset: metadata: name: "Heritage Custodian Evaluation v1.0" created_date: "2025-12-01" languages: ["nl", "en", "de", "fr"] domains: ["museums", "archives", "libraries"] ner_annotations: format: "BRAT/IOB2" entity_types: - "GRP.HER.MUS" - "GRP.HER.ARC" - "GRP.HER.LIB" - "GRP.HER.SOC" - "TOP" - "TMP" - "IDENTIFIER" samples_per_type: 100 inter_annotator_agreement: ">0.85 Cohen's Kappa" type_classification: format: "TSV" columns: ["text", "gold_type", "secondary_types"] samples_per_type: 50 balanced: true entity_linking: format: "JSON" fields: ["mention", "context", "gold_wikidata", "gold_viaf", "gold_isil"] nil_ratio: 0.15 samples: 500 retrieval: format: "TREC" queries: 200 judgments_per_query: 50 graded_relevance: [0, 1, 2, 3] qa: format: "JSON" fields: ["question", "gold_answer", "supporting_facts", "answer_type"] question_types: - "factual" - "comparative" - "relationship" - "temporal" samples: 300 ``` ### Dataset Splits ```python DATASET_SPLITS = { "train": 0.70, "dev": 0.15, "test": 0.15, } STRATIFICATION_FACTORS = [ "custodian_type", # GLAMORCUBESFIXPHDNT distribution "country", # Geographic coverage "data_tier", # Quality tier (1-4) "source_type", # Conversation, web, CSV ] ``` ### Sample Gold Standard Entry ```json { "id": "eval_001", "text": "Het Rijksmuseum Amsterdam (ISIL: NL-AmRM) werd opgericht in 1800 en beheert de grootste collectie Nederlandse kunst.", "ner_annotations": [ {"text": "Rijksmuseum Amsterdam", "type": "GRP.HER.MUS", "start": 4, "end": 25}, {"text": "NL-AmRM", "type": "IDENTIFIER", "start": 33, "end": 40}, {"text": "1800", "type": "TMP", "start": 58, "end": 62}, {"text": "Nederlandse", "type": "TOP", "start": 93, "end": 104} ], "type_classification": { "primary_type": "M", "secondary_types": [], "rationale": "Art 
museum with cultural heritage collections" }, "entity_linking": { "Rijksmuseum Amsterdam": { "wikidata_id": "Q190804", "viaf_id": "148691498", "isil_code": "NL-AmRM", "ghcid": "NL-NH-AMS-M-RM" } }, "qa_pairs": [ { "question": "Wanneer is het Rijksmuseum opgericht?", "answer": "Het Rijksmuseum werd opgericht in 1800.", "answer_type": "factual", "supporting_facts": ["opgericht in 1800"] } ] } ``` ## 7. Evaluation Runner ### Batch Evaluation Pipeline ```python class EvaluationRunner: """Run comprehensive evaluation across all tasks.""" def __init__(self, config: dict): self.config = config self.metrics = {} def run_full_evaluation( self, model, gold_dataset: List[dict], ) -> dict: """Run all evaluation tasks.""" results = {} # 1. NER Evaluation print("Evaluating NER...") ner_results = self._evaluate_ner(model, gold_dataset) results["ner"] = ner_results # 2. Type Classification Evaluation print("Evaluating Type Classification...") type_results = self._evaluate_type_classification(model, gold_dataset) results["type_classification"] = type_results # 3. Entity Linking Evaluation print("Evaluating Entity Linking...") linking_results = self._evaluate_entity_linking(model, gold_dataset) results["entity_linking"] = linking_results # 4. Retrieval Evaluation print("Evaluating Retrieval...") retrieval_results = self._evaluate_retrieval(model, gold_dataset) results["retrieval"] = retrieval_results # 5. 
End-to-End QA Evaluation print("Evaluating End-to-End QA...") qa_results = self._evaluate_qa(model, gold_dataset) results["qa"] = qa_results # Summary results["summary"] = self._compute_summary(results) return results def _evaluate_ner(self, model, dataset) -> dict: all_pred_entities = [] all_gold_entities = [] for sample in dataset: pred = model.extract_entities(sample["text"]) all_pred_entities.extend(pred) all_gold_entities.extend(sample["ner_annotations"]) strict = entity_level_metrics(all_pred_entities, all_gold_entities, "strict") partial = entity_level_metrics(all_pred_entities, all_gold_entities, "partial") return { "strict": strict, "partial": partial, "type_distribution": type_distribution_analysis( all_pred_entities, all_gold_entities ), } def _compute_summary(self, results: dict) -> dict: """Compute overall summary metrics.""" return { "ner_f1_strict": results["ner"]["strict"]["f1"], "ner_f1_partial": results["ner"]["partial"]["f1"], "type_classification_accuracy": results["type_classification"]["accuracy"], "entity_linking_mrr": results["entity_linking"]["mrr"], "entity_linking_hits@1": results["entity_linking"]["hits@1"], "retrieval_ndcg@10": results["retrieval"]["ndcg@10"], "qa_faithfulness": results["qa"]["avg_faithfulness"], "qa_relevance": results["qa"]["avg_relevance"], } ``` ### Statistical Significance Testing ```python from scipy import stats def significance_test( scores_a: List[float], scores_b: List[float], test_type: str = "paired_t", alpha: float = 0.05, ) -> dict: """Test statistical significance between two model scores.""" if test_type == "paired_t": statistic, p_value = stats.ttest_rel(scores_a, scores_b) elif test_type == "wilcoxon": statistic, p_value = stats.wilcoxon(scores_a, scores_b) elif test_type == "bootstrap": # Bootstrap confidence interval diff = np.array(scores_a) - np.array(scores_b) bootstrap_diffs = [] for _ in range(10000): sample = np.random.choice(diff, size=len(diff), replace=True) 
            bootstrap_diffs.append(np.mean(sample))
        ci_lower = np.percentile(bootstrap_diffs, 2.5)
        ci_upper = np.percentile(bootstrap_diffs, 97.5)
        return {
            "mean_difference": np.mean(diff),
            "ci_95_lower": ci_lower,
            "ci_95_upper": ci_upper,
            "significant": not (ci_lower <= 0 <= ci_upper),
        }
    else:
        # Fail loudly on an unknown test type: previously `statistic` and
        # `p_value` were left unbound here, so the return below raised
        # NameError instead of reporting the misuse.
        raise ValueError(f"Unknown test_type: {test_type!r}")

    return {
        "test": test_type,
        "statistic": statistic,
        "p_value": p_value,
        "significant": p_value < alpha,
        "alpha": alpha,
    }
```

## 8. Performance Benchmarks

### Target Metrics

| Task | Metric | Target | State-of-Art |
|------|--------|--------|--------------|
| **NER (strict)** | F1 | ≥0.85 | 0.92 (CoNLL) |
| **NER (partial)** | F1 | ≥0.90 | 0.95 |
| **Type Classification** | Accuracy | ≥0.80 | 0.85 |
| **Type Classification** | Macro-F1 | ≥0.75 | 0.80 |
| **Entity Linking** | Hits@1 | ≥0.70 | 0.75 |
| **Entity Linking** | MRR | ≥0.75 | 0.82 |
| **NIL Detection** | F1 | ≥0.65 | 0.72 |
| **Retrieval** | NDCG@10 | ≥0.60 | 0.68 |
| **Retrieval** | MRR | ≥0.55 | 0.62 |
| **QA Faithfulness** | Score | ≥0.80 | 0.85 |
| **QA Relevance** | Score | ≥0.75 | 0.82 |

### Baseline Models

```python
BASELINE_MODELS = {
    "ner": {
        "spacy_nl": "nl_core_news_lg",
        "spacy_en": "en_core_web_trf",
        "regex_only": "pattern_based",
    },
    "type_classification": {
        "keyword_based": "rule_based_classifier",
        "zero_shot": "bart-large-mnli",
    },
    "entity_linking": {
        "exact_match": "string_matching",
        "fuzzy": "rapidfuzz_based",
        "wikidata_api": "wikidata_search",
    },
    "retrieval": {
        "bm25": "elasticsearch_bm25",
        "dense": "all-MiniLM-L6-v2",
    },
}
```

## See Also

- [02-dspy-signatures.md](./02-dspy-signatures.md) - DSPy module definitions
- [04-entity-extraction.md](./04-entity-extraction.md) - NER patterns
- [05-entity-linking.md](./05-entity-linking.md) - Entity linking strategies
- [06-retrieval-patterns.md](./06-retrieval-patterns.md) - Retrieval strategies
- [AGENTS.md](../../AGENTS.md) - Project conventions and rules