glam/docs/dspy_rag/08-evaluation.md
2025-12-12 12:51:10 +01:00

31 KiB

Evaluation Framework for Heritage Custodian RAG

Overview

This document defines evaluation metrics, benchmarks, and gold standard datasets for the Heritage Custodian RAG pipeline. Evaluation covers all pipeline stages: entity extraction, type classification, entity linking, retrieval, and end-to-end question answering.

Evaluation Architecture

┌─────────────────────────────────────────────────────────────────────┐
│                    Evaluation Framework                             │
├─────────────────────────────────────────────────────────────────────┤
│                                                                     │
│  ┌─────────────┐   ┌─────────────┐   ┌─────────────┐               │
│  │   Task      │   │   Gold      │   │   Metrics   │               │
│  │   Modules   │   │   Standard  │   │   Suite     │               │
│  └──────┬──────┘   └──────┬──────┘   └──────┬──────┘               │
│         │                 │                 │                       │
│         ▼                 ▼                 ▼                       │
│  ┌─────────────────────────────────────────────────────────┐       │
│  │              Evaluation Runner                           │       │
│  │  • Batch evaluation across datasets                      │       │
│  │  • Stratified sampling (by type, country, tier)         │       │
│  │  • Statistical significance testing                      │       │
│  └─────────────────────────────────────────────────────────┘       │
│                              │                                      │
│                              ▼                                      │
│  ┌─────────────────────────────────────────────────────────┐       │
│  │              Results Dashboard                           │       │
│  │  • Per-task metrics breakdown                           │       │
│  │  • Error analysis and confusion matrices                │       │
│  │  • Performance over time / model versions               │       │
│  └─────────────────────────────────────────────────────────┘       │
│                                                                     │
└─────────────────────────────────────────────────────────────────────┘

1. Entity Extraction Metrics

Token-Level Metrics

from typing import List, Tuple
from collections import defaultdict

def token_level_metrics(
    pred_spans: List[Tuple[int, int, str]],  # (start, end, type)
    gold_spans: List[Tuple[int, int, str]],
    text: str,
) -> dict:
    """Compute token-level precision, recall, F1.

    Tokenization is whitespace-based (`str.split`), so span character
    offsets are interpreted against tokens separated by single spaces.

    Args:
        pred_spans: Predicted entity spans with types, as (start, end, type).
        gold_spans: Gold standard entity spans with types.
        text: Original text for tokenization.

    Returns:
        Dictionary with precision/recall/F1/support for each entity type,
        plus a "macro_avg" entry averaged over the observed types.
    """

    # Tokenize and create token-to-span mapping
    tokens = text.split()

    def spans_to_tokens(spans: List[Tuple[int, int, str]]) -> dict:
        """Map character spans to token indices with types.

        A token is labeled with a span's type when the token's character
        range [char_pos, char_pos + len(token)) overlaps the span at all.
        If several spans overlap one token, the last span in the list wins.
        """
        token_labels = {}
        char_pos = 0
        for idx, token in enumerate(tokens):
            token_end = char_pos + len(token)
            for start, end, etype in spans:
                # Proper interval-overlap test. Previously only the token's
                # *first* character was checked (char_pos >= start), so a
                # token whose start lies before the span but which extends
                # into it was silently left unlabeled.
                if char_pos < end and token_end > start:
                    token_labels[idx] = etype
            char_pos = token_end + 1  # +1 for the separating space
        return token_labels

    pred_tokens = spans_to_tokens(pred_spans)
    gold_tokens = spans_to_tokens(gold_spans)

    # Compute metrics per type (every type seen on either side)
    all_types = set(pred_tokens.values()) | set(gold_tokens.values())
    metrics = {}

    for etype in all_types:
        pred_set = {k for k, v in pred_tokens.items() if v == etype}
        gold_set = {k for k, v in gold_tokens.items() if v == etype}

        tp = len(pred_set & gold_set)
        fp = len(pred_set - gold_set)
        fn = len(gold_set - pred_set)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics[etype] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": len(gold_set),
        }

    # Macro average over the per-type entries (computed before the
    # "macro_avg" key is inserted, so it never averages over itself).
    n_types = len(metrics)
    macro_p = sum(m["precision"] for m in metrics.values()) / n_types if n_types else 0.0
    macro_r = sum(m["recall"] for m in metrics.values()) / n_types if n_types else 0.0
    macro_f1 = sum(m["f1"] for m in metrics.values()) / n_types if n_types else 0.0

    metrics["macro_avg"] = {"precision": macro_p, "recall": macro_r, "f1": macro_f1}

    return metrics

Entity-Level Metrics (Strict & Partial)

def entity_level_metrics(
    pred_entities: List[dict],  # [{text, type, start, end}]
    gold_entities: List[dict],
    match_type: str = "strict",  # "strict" or "partial"
) -> dict:
    """Entity-level precision, recall and F1.

    "strict" requires an exact span-and-type match; "partial" accepts any
    overlapping span with the same type. Matching is greedy in list order:
    each gold entity can be consumed by at most one prediction.
    """

    def _is_match(p: dict, g: dict) -> bool:
        # Types must agree regardless of the matching mode.
        if p["type"] != g["type"]:
            return False
        if match_type == "strict":
            return (p["start"], p["end"]) == (g["start"], g["end"])
        # Partial: the two half-open spans intersect.
        return p["start"] < g["end"] and g["start"] < p["end"]

    used_gold = set()
    used_pred = set()

    for p_idx, p_ent in enumerate(pred_entities):
        for g_idx, g_ent in enumerate(gold_entities):
            if g_idx in used_gold or not _is_match(p_ent, g_ent):
                continue
            used_pred.add(p_idx)
            used_gold.add(g_idx)
            break

    tp = len(used_gold)
    fp = len(pred_entities) - len(used_pred)
    fn = len(gold_entities) - len(used_gold)

    prec = tp / (tp + fp) if tp + fp else 0.0
    rec = tp / (tp + fn) if tp + fn else 0.0
    f1 = (2 * prec * rec / (prec + rec)) if prec + rec else 0.0

    return {
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
        "match_type": match_type,
    }

Entity Type Distribution Analysis

def type_distribution_analysis(
    pred_entities: List[dict],
    gold_entities: List[dict],
) -> dict:
    """Compare predicted vs. gold entity-type frequencies.

    For every type seen on either side, report the raw counts, the signed
    over-prediction delta, and the predicted/gold ratio (inf when the type
    never occurs in the gold data).
    """

    from collections import Counter

    predicted = Counter(ent["type"] for ent in pred_entities)
    reference = Counter(ent["type"] for ent in gold_entities)

    report = {}
    for etype in predicted.keys() | reference.keys():
        n_pred = predicted.get(etype, 0)
        n_gold = reference.get(etype, 0)
        report[etype] = {
            "predicted": n_pred,
            "gold": n_gold,
            "over_prediction": n_pred - n_gold,
            "ratio": n_pred / n_gold if n_gold else float("inf"),
        }

    return report

2. Type Classification Metrics

Multi-Class Classification Metrics

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)

def type_classification_metrics(
    y_true: List[str],  # Gold labels
    y_pred: List[str],  # Predicted labels
    labels: List[str] = None,  # All possible labels
) -> dict:
    """Multi-class metrics for GLAMORCUBESFIXPHDNT type prediction.

    Returns overall accuracy, macro and weighted aggregates, per-class
    precision/recall/F1/support, and the raw confusion matrix (row = gold
    label, column = predicted label, in `labels` order).
    """

    if labels is None:
        # One single-letter code per custodian type (19 in total).
        labels = list("GLAMORCUBESFIXPHDNT")

    accuracy = accuracy_score(y_true, y_pred)

    # Per-class scores; zero_division=0 silences warnings for absent classes.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, zero_division=0
    )

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    per_class = {
        label: {
            "precision": float(precision[i]),
            "recall": float(recall[i]),
            "f1": float(f1[i]),
            "support": int(support[i]),
        }
        for i, label in enumerate(labels)
    }

    return {
        "accuracy": accuracy,
        "macro_precision": float(np.mean(precision)),
        "macro_recall": float(np.mean(recall)),
        "macro_f1": float(np.mean(f1)),
        "weighted_f1": float(np.average(f1, weights=support)),
        "per_class": per_class,
        "confusion_matrix": cm.tolist(),
        "labels": labels,
    }

Type Confusion Analysis

def type_confusion_analysis(
    y_true: List[str],
    y_pred: List[str],
    top_k: int = 10,
) -> dict:
    """Summarize the most frequent (gold, predicted) type confusions.

    Only mismatched pairs count as errors; the top_k most common pairs are
    listed, plus the total error count and the number of distinct pairs.
    """

    from collections import Counter

    error_pairs = Counter(
        (gold, pred)
        for gold, pred in zip(y_true, y_pred)
        if gold != pred
    )

    return {
        "top_confusions": [
            {"true_type": gold, "predicted_type": pred, "count": n}
            for (gold, pred), n in error_pairs.most_common(top_k)
        ],
        "total_errors": sum(error_pairs.values()),
        "unique_confusion_pairs": len(error_pairs),
    }

Hierarchical Type Metrics

def hierarchical_type_metrics(
    y_true: List[str],
    y_pred: List[str],
) -> dict:
    """Metrics accounting for type hierarchy relationships.

    GLAMORCUBESFIXPHDNT letters roll up into coarser categories, so a
    wrong letter within the right category is "less wrong" than a
    cross-category error:
    - CULTURAL: G, L, A, M (core GLAM)
    - INSTITUTIONAL: O, R, E
    - COMMUNITY: S, I, N
    - SPECIALIZED: B, H, T, F
    - DIGITAL: D
    - PRIVATE: C, P
    - UNKNOWN: U, X
    """

    TYPE_HIERARCHY = {
        "CULTURAL": ["G", "L", "A", "M"],
        "INSTITUTIONAL": ["O", "R", "E"],
        "COMMUNITY": ["S", "I", "N"],
        "SPECIALIZED": ["B", "H", "T", "F"],
        "DIGITAL": ["D"],
        "PRIVATE": ["C", "P"],
        "UNKNOWN": ["U", "X"],
    }

    # Invert the hierarchy: letter code -> category name.
    type_to_category = {
        code: category
        for category, codes in TYPE_HIERARCHY.items()
        for code in codes
    }

    def _lift(labels: List[str]) -> List[str]:
        # Unmapped codes are treated as UNKNOWN.
        return [type_to_category.get(code, "UNKNOWN") for code in labels]

    # Exact (letter-level) vs. lenient (category-level) accuracy.
    exact_accuracy = accuracy_score(y_true, y_pred)
    category_accuracy = accuracy_score(_lift(y_true), _lift(y_pred))

    return {
        "exact_accuracy": exact_accuracy,
        "category_accuracy": category_accuracy,
        # How much credit the coarse grouping recovers over exact matching.
        "hierarchy_gap": category_accuracy - exact_accuracy,
    }

3. Entity Linking Metrics

Linking Accuracy (Hits@K)

def linking_accuracy(
    predictions: List[dict],  # [{mention, candidates: [{kb_id, score}]}]
    gold: List[dict],  # [{mention, gold_kb_id}]
    k_values: List[int] = None,
) -> dict:
    """Compute Hits@K for entity linking.

    A prediction scores a hit at K when the gold KB id for its mention
    appears among its top-K ranked candidates. Rates are normalized by the
    number of gold entries.

    Args:
        predictions: One entry per mention, with candidates ranked best-first.
        gold: Gold links; duplicate mentions collapse (the last one wins).
        k_values: Cutoffs to evaluate; defaults to [1, 5, 10].

    Returns:
        {"hits@k": rate} for each requested k.
    """

    # Fix: the previous default `k_values=[1, 5, 10]` was a mutable default
    # argument shared across calls; build a fresh list per call instead.
    if k_values is None:
        k_values = [1, 5, 10]

    results = {f"hits@{k}": 0.0 for k in k_values}

    # Build gold lookup by mention text
    gold_lookup = {g["mention"]: g["gold_kb_id"] for g in gold}

    for pred in predictions:
        gold_id = gold_lookup.get(pred["mention"])
        if gold_id is None:
            # Mention has no gold link: it cannot contribute to the numerator.
            continue

        candidate_ids = [c["kb_id"] for c in pred.get("candidates", [])]

        for k in k_values:
            if gold_id in candidate_ids[:k]:
                results[f"hits@{k}"] += 1

    # Normalize by the number of gold entries (unchanged: hit counts stay
    # raw zeros when there is no gold data).
    n = len(gold)
    if n > 0:
        for k in k_values:
            results[f"hits@{k}"] /= n

    return results

Mean Reciprocal Rank (MRR)

def mean_reciprocal_rank(
    predictions: List[dict],
    gold: List[dict],
) -> float:
    """Mean Reciprocal Rank (MRR) for entity linking.

    Each prediction whose mention has a gold link contributes 1/rank of
    the gold KB id within its candidate list (0 when the id was not
    retrieved); predictions without a gold link are skipped entirely.
    """

    gold_by_mention = {entry["mention"]: entry["gold_kb_id"] for entry in gold}

    total = 0.0
    scored = 0

    for pred in predictions:
        target = gold_by_mention.get(pred["mention"])
        if target is None:
            continue

        scored += 1
        ranked_ids = [cand["kb_id"] for cand in pred.get("candidates", [])]
        try:
            total += 1.0 / (ranked_ids.index(target) + 1)
        except ValueError:
            pass  # gold id absent from candidates -> contributes 0

    return total / scored if scored else 0.0

NIL Detection Metrics

def nil_detection_metrics(
    predictions: List[dict],  # [{mention, is_nil_pred}]
    gold: List[dict],  # [{mention, is_nil_gold}]
) -> dict:
    """Precision/recall/F1 for NIL (unlinkable) entity detection.

    NOTE(review): predictions and gold are paired positionally via zip —
    this assumes both lists cover the same mentions in the same order;
    confirm the caller guarantees alignment.
    """

    pairs = list(zip(
        (g["is_nil_gold"] for g in gold),
        (p["is_nil_pred"] for p in predictions),
    ))

    tp = sum(1 for actual, predicted in pairs if actual and predicted)
    fp = sum(1 for actual, predicted in pairs if predicted and not actual)
    fn = sum(1 for actual, predicted in pairs if actual and not predicted)
    tn = sum(1 for actual, predicted in pairs if not actual and not predicted)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "nil_precision": precision,
        "nil_recall": recall,
        "nil_f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
        "true_negatives": tn,
    }

Cross-KB Linking Consistency

def cross_kb_consistency(
    linked_entities: List[dict],  # [{wikidata_id, viaf_id, isil_code}]
) -> dict:
    """Check agreement of cross-knowledge-base identifier links.

    An entity carrying both a Wikidata id and a VIAF id is "consistent"
    when the VIAF claim on the Wikidata item matches the linked VIAF id,
    "inconsistent" otherwise; entities with only some identifiers count
    as "partial".

    NOTE(review): depends on `get_wikidata_viaf`, which is not defined in
    this module — confirm it is provided by the surrounding codebase.
    """

    tallies = {"consistent": 0, "inconsistent": 0, "partial": 0}

    for entity in linked_entities:
        wikidata_id = entity.get("wikidata_id")
        viaf_id = entity.get("viaf_id")
        isil_code = entity.get("isil_code")

        if wikidata_id and viaf_id:
            # Cross-check against the VIAF id recorded on the Wikidata item.
            if get_wikidata_viaf(wikidata_id) == viaf_id:
                tallies["consistent"] += 1
            else:
                tallies["inconsistent"] += 1
        elif wikidata_id or viaf_id or isil_code:
            tallies["partial"] += 1

    checked = tallies["consistent"] + tallies["inconsistent"]
    total = checked + tallies["partial"]

    return {
        "consistent": tallies["consistent"],
        "inconsistent": tallies["inconsistent"],
        "partial": tallies["partial"],
        "consistency_rate": tallies["consistent"] / checked if checked else 1.0,
        "coverage_rate": (tallies["consistent"] + tallies["partial"]) / total if total else 0.0,
    }

4. Retrieval Metrics

Normalized Discounted Cumulative Gain (NDCG)

import numpy as np

def ndcg_at_k(
    relevance_scores: List[float],  # Graded relevance (0-3)
    k: int = 10,
) -> float:
    """NDCG@K with exponential gain (2^rel - 1) and log2 rank discounting."""

    top = np.asarray(relevance_scores[:k])
    positions = np.log2(np.arange(2, top.size + 2))  # discount per rank

    # Realized DCG of the given ranking.
    dcg = np.sum((2 ** top - 1) / positions)

    # Ideal DCG: the same scores reordered best-first.
    best = np.sort(relevance_scores)[::-1][:k]
    idcg = np.sum((2 ** np.asarray(best) - 1) / positions[: len(best)])

    return dcg / idcg if idcg > 0 else 0.0


def compute_retrieval_metrics(
    queries: List[dict],  # [{query, results: [{doc_id, score}], gold_relevant: [doc_ids]}]
    k_values: List[int] = None,
) -> dict:
    """Compute comprehensive retrieval metrics over a query set.

    Reports NDCG@K, precision@K and recall@K for each requested cutoff,
    plus MRR, each averaged across queries. Relevance is binary: a result
    counts as relevant iff its doc_id appears in the query's gold set.

    Args:
        queries: Per-query ranked results and gold relevant doc ids.
        k_values: Cutoffs to evaluate; defaults to [5, 10, 20].

    Returns:
        {metric_name: mean value across queries}.
    """

    # Fix: the previous default `k_values=[5, 10, 20]` was a mutable default
    # argument shared across calls; build a fresh list per call instead.
    if k_values is None:
        k_values = [5, 10, 20]

    metrics = {f"ndcg@{k}": [] for k in k_values}
    metrics.update({f"precision@{k}": [] for k in k_values})
    metrics.update({f"recall@{k}": [] for k in k_values})
    metrics["mrr"] = []

    for query in queries:
        result_ids = [r["doc_id"] for r in query["results"]]
        gold_set = set(query["gold_relevant"])

        # Binary relevance judgment for each ranked result
        relevance = [1 if rid in gold_set else 0 for rid in result_ids]

        # MRR: reciprocal rank of the first relevant result (0 if none)
        for i, rel in enumerate(relevance):
            if rel == 1:
                metrics["mrr"].append(1.0 / (i + 1))
                break
        else:
            metrics["mrr"].append(0.0)

        # Metrics at each K
        for k in k_values:
            # NDCG over the binary relevance vector
            metrics[f"ndcg@{k}"].append(ndcg_at_k(relevance, k))

            # Precision
            retrieved_relevant = sum(relevance[:k])
            metrics[f"precision@{k}"].append(retrieved_relevant / k)

            # Recall is defined as 1.0 when there are no gold-relevant docs
            if gold_set:
                metrics[f"recall@{k}"].append(retrieved_relevant / len(gold_set))
            else:
                metrics[f"recall@{k}"].append(1.0)

    # Average each metric across queries
    return {
        metric: np.mean(values) for metric, values in metrics.items()
    }

Retrieval Source Analysis

def retrieval_source_analysis(
    query_results: List[dict],  # [{results: [{doc_id, source, score}]}]
) -> dict:
    """Analyze how much each retrieval source contributes to top results.

    Counts source occurrences within the top 10 of each query, and how
    often each source supplies the rank-1 hit or appears in the top 5.
    """

    overall = defaultdict(int)
    rank1 = defaultdict(int)
    top5 = defaultdict(int)

    for query in query_results:
        for rank, hit in enumerate(query["results"][:10]):
            src = hit["source"]
            overall[src] += 1
            if rank == 0:
                rank1[src] += 1
            if rank < 5:
                top5[src] += 1

    n_queries = len(query_results)

    return {
        "source_distribution": dict(overall),
        "top1_by_source": {s: c / n_queries for s, c in rank1.items()},
        # Normalized by the 5 available top-5 slots per query.
        "top5_by_source": {s: c / (n_queries * 5) for s, c in top5.items()},
    }

5. End-to-End RAG Metrics

Answer Quality Metrics

class RAGEvaluator:
    """End-to-end RAG evaluation.

    Scores a generated answer on lexical overlap (ROUGE), semantic
    similarity, faithfulness to the retrieved context, relevance to the
    question, and (optionally) an LLM-as-judge score.

    NOTE(review): `_default_judge` and `_llm_judge_score` are referenced
    below but not defined in this class — presumably supplied by a
    subclass or a later edit; as written, constructing the class without
    an explicit `llm_judge` raises AttributeError. Confirm before use.
    """
    
    def __init__(self, llm_judge=None):
        # Falls back to self._default_judge() when no judge is supplied.
        self.llm_judge = llm_judge or self._default_judge()
    
    def evaluate_answer(
        self,
        question: str,
        generated_answer: str,
        gold_answer: str,
        retrieved_context: str,
    ) -> dict:
        """Evaluate a single RAG response.

        Args:
            question: The user question.
            generated_answer: The model-produced answer.
            gold_answer: Reference answer for overlap/similarity metrics.
            retrieved_context: Context the answer should be grounded in.

        Returns:
            Dict of metric name -> score; "rouge" is a nested dict.
        """
        
        metrics = {}
        
        # 1. Lexical overlap
        metrics["rouge"] = self._compute_rouge(generated_answer, gold_answer)
        
        # 2. Semantic similarity
        metrics["semantic_similarity"] = self._semantic_similarity(
            generated_answer, gold_answer
        )
        
        # 3. Faithfulness (answer grounded in context)
        metrics["faithfulness"] = self._compute_faithfulness(
            generated_answer, retrieved_context
        )
        
        # 4. Relevance (answer addresses question)
        metrics["relevance"] = self._compute_relevance(
            question, generated_answer
        )
        
        # 5. LLM-as-judge (if available)
        if self.llm_judge:
            metrics["llm_judge"] = self._llm_judge_score(
                question, generated_answer, gold_answer
            )
        
        return metrics
    
    def _compute_rouge(self, generated: str, reference: str) -> dict:
        """Return ROUGE-1/2/L F-measures of generated text vs. reference."""
        from rouge_score import rouge_scorer
        
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
        scores = scorer.score(reference, generated)
        
        return {
            "rouge1": scores["rouge1"].fmeasure,
            "rouge2": scores["rouge2"].fmeasure,
            "rougeL": scores["rougeL"].fmeasure,
        }
    
    def _semantic_similarity(self, text1: str, text2: str) -> float:
        """Cosine similarity of sentence-transformer embeddings.

        NOTE(review): the embedding model is re-loaded on every call —
        hoist it to __init__ or a module-level cache before using this in
        a batch evaluation loop.
        """
        from sentence_transformers import SentenceTransformer, util
        
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        emb1 = model.encode(text1)
        emb2 = model.encode(text2)
        
        return float(util.cos_sim(emb1, emb2)[0][0])
    
    def _compute_faithfulness(self, answer: str, context: str) -> float:
        """Check if answer claims are supported by context.

        Heuristic only: a sentence counts as "supported" when any of its
        words longer than 4 characters appears in the lowercased context.
        """
        # Simplified: check sentence overlap
        answer_sents = answer.split(". ")
        
        supported = 0
        for sent in answer_sents:
            if any(word in context.lower() for word in sent.lower().split() if len(word) > 4):
                supported += 1
        
        return supported / len(answer_sents) if answer_sents else 0.0
    
    def _compute_relevance(self, question: str, answer: str) -> float:
        """Check if answer addresses the question (embedding similarity)."""
        return self._semantic_similarity(question, answer)

Factual Accuracy

def factual_accuracy(
    generated_claims: List[dict],  # [{claim, verifiable}]
    gold_facts: List[dict],  # [{fact, source}]
) -> dict:
    """Score the verifiable generated claims against gold facts.

    A verifiable claim counts as correct when its text appears
    (case-insensitively) as a substring of any gold fact; claims flagged
    as unverifiable are excluded from the accuracy denominator.
    """

    checkable = [c for c in generated_claims if c["verifiable"]]
    fact_texts = [fact["fact"].lower() for fact in gold_facts]

    correct = sum(
        1
        for claim in checkable
        if any(claim["claim"].lower() in text for text in fact_texts)
    )
    incorrect = len(checkable) - correct
    checked = correct + incorrect

    return {
        "factual_accuracy": correct / checked if checked else 0.0,
        "correct_claims": correct,
        "incorrect_claims": incorrect,
        "unverifiable_claims": len(generated_claims) - len(checkable),
        "total_claims": len(generated_claims),
    }

6. Gold Standard Datasets

Dataset Requirements

# Gold Standard Dataset Schema
gold_standard_dataset:
  metadata:
    name: "Heritage Custodian Evaluation v1.0"
    created_date: "2025-12-01"
    languages: ["nl", "en", "de", "fr"]
    domains: ["museums", "archives", "libraries"]
    
  ner_annotations:
    format: "BRAT/IOB2"
    entity_types:
      - "GRP.HER.MUS"
      - "GRP.HER.ARC"
      - "GRP.HER.LIB"
      - "GRP.HER.SOC"
      - "TOP"
      - "TMP"
      - "IDENTIFIER"
    samples_per_type: 100
    inter_annotator_agreement: ">0.85 Cohen's Kappa"
    
  type_classification:
    format: "TSV"
    columns: ["text", "gold_type", "secondary_types"]
    samples_per_type: 50
    balanced: true
    
  entity_linking:
    format: "JSON"
    fields: ["mention", "context", "gold_wikidata", "gold_viaf", "gold_isil"]
    nil_ratio: 0.15
    samples: 500
    
  retrieval:
    format: "TREC"
    queries: 200
    judgments_per_query: 50
    graded_relevance: [0, 1, 2, 3]
    
  qa:
    format: "JSON"
    fields: ["question", "gold_answer", "supporting_facts", "answer_type"]
    question_types:
      - "factual"
      - "comparative"
      - "relationship"
      - "temporal"
    samples: 300

Dataset Splits

# Fractions for train/dev/test partitioning of the gold standard; the
# three values sum to 1.0.
DATASET_SPLITS = {
    "train": 0.70,
    "dev": 0.15,
    "test": 0.15,
}

# Dimensions along which the splits are stratified, so each split mirrors
# the overall corpus distribution.
STRATIFICATION_FACTORS = [
    "custodian_type",      # GLAMORCUBESFIXPHDNT distribution
    "country",             # Geographic coverage
    "data_tier",           # Quality tier (1-4)
    "source_type",         # Conversation, web, CSV
]

Sample Gold Standard Entry

{
  "id": "eval_001",
  "text": "Het Rijksmuseum Amsterdam (ISIL: NL-AmRM) werd opgericht in 1800 en beheert de grootste collectie Nederlandse kunst.",
  
  "ner_annotations": [
    {"text": "Rijksmuseum Amsterdam", "type": "GRP.HER.MUS", "start": 4, "end": 25},
    {"text": "NL-AmRM", "type": "IDENTIFIER", "start": 33, "end": 40},
    {"text": "1800", "type": "TMP", "start": 58, "end": 62},
    {"text": "Nederlandse", "type": "TOP", "start": 93, "end": 104}
  ],
  
  "type_classification": {
    "primary_type": "M",
    "secondary_types": [],
    "rationale": "Art museum with cultural heritage collections"
  },
  
  "entity_linking": {
    "Rijksmuseum Amsterdam": {
      "wikidata_id": "Q190804",
      "viaf_id": "148691498",
      "isil_code": "NL-AmRM",
      "ghcid": "NL-NH-AMS-M-RM"
    }
  },
  
  "qa_pairs": [
    {
      "question": "Wanneer is het Rijksmuseum opgericht?",
      "answer": "Het Rijksmuseum werd opgericht in 1800.",
      "answer_type": "factual",
      "supporting_facts": ["opgericht in 1800"]
    }
  ]
}

7. Evaluation Runner

Batch Evaluation Pipeline

class EvaluationRunner:
    """Run comprehensive evaluation across all tasks.

    Orchestrates NER, type classification, entity linking, retrieval and
    end-to-end QA evaluation over one gold dataset, then rolls the
    headline numbers into a summary dict.

    NOTE(review): `_evaluate_type_classification`, `_evaluate_entity_linking`,
    `_evaluate_retrieval` and `_evaluate_qa` are called below but not
    defined in this class — presumably implemented elsewhere; confirm
    before running.
    """
    
    def __init__(self, config: dict):
        # Evaluation configuration (datasets, cutoffs, output paths, ...).
        self.config = config
        # Scratch space for per-task metrics.
        self.metrics = {}
    
    def run_full_evaluation(
        self,
        model,
        gold_dataset: List[dict],
    ) -> dict:
        """Run all evaluation tasks.

        Args:
            model: Pipeline under test; must expose `extract_entities`
                plus whatever the task-specific evaluators require.
            gold_dataset: Gold standard samples (see dataset schema).

        Returns:
            Per-task result dicts plus a "summary" entry.
        """
        
        results = {}
        
        # 1. NER Evaluation
        print("Evaluating NER...")
        ner_results = self._evaluate_ner(model, gold_dataset)
        results["ner"] = ner_results
        
        # 2. Type Classification Evaluation
        print("Evaluating Type Classification...")
        type_results = self._evaluate_type_classification(model, gold_dataset)
        results["type_classification"] = type_results
        
        # 3. Entity Linking Evaluation
        print("Evaluating Entity Linking...")
        linking_results = self._evaluate_entity_linking(model, gold_dataset)
        results["entity_linking"] = linking_results
        
        # 4. Retrieval Evaluation
        print("Evaluating Retrieval...")
        retrieval_results = self._evaluate_retrieval(model, gold_dataset)
        results["retrieval"] = retrieval_results
        
        # 5. End-to-End QA Evaluation
        print("Evaluating End-to-End QA...")
        qa_results = self._evaluate_qa(model, gold_dataset)
        results["qa"] = qa_results
        
        # Summary
        results["summary"] = self._compute_summary(results)
        
        return results
    
    def _evaluate_ner(self, model, dataset) -> dict:
        """Pool entities over the whole dataset and score strict/partial NER."""
        all_pred_entities = []
        all_gold_entities = []
        
        # Micro-style pooling: entities from all samples are scored together.
        for sample in dataset:
            pred = model.extract_entities(sample["text"])
            all_pred_entities.extend(pred)
            all_gold_entities.extend(sample["ner_annotations"])
        
        strict = entity_level_metrics(all_pred_entities, all_gold_entities, "strict")
        partial = entity_level_metrics(all_pred_entities, all_gold_entities, "partial")
        
        return {
            "strict": strict,
            "partial": partial,
            "type_distribution": type_distribution_analysis(
                all_pred_entities, all_gold_entities
            ),
        }
    
    def _compute_summary(self, results: dict) -> dict:
        """Compute overall summary metrics.

        Assumes each per-task result dict exposes the keys accessed below
        (e.g. "mrr", "hits@1", "ndcg@10", "avg_faithfulness") — these are
        the shapes produced by the metric helpers defined in this module.
        """
        
        return {
            "ner_f1_strict": results["ner"]["strict"]["f1"],
            "ner_f1_partial": results["ner"]["partial"]["f1"],
            "type_classification_accuracy": results["type_classification"]["accuracy"],
            "entity_linking_mrr": results["entity_linking"]["mrr"],
            "entity_linking_hits@1": results["entity_linking"]["hits@1"],
            "retrieval_ndcg@10": results["retrieval"]["ndcg@10"],
            "qa_faithfulness": results["qa"]["avg_faithfulness"],
            "qa_relevance": results["qa"]["avg_relevance"],
        }

Statistical Significance Testing

from scipy import stats

def significance_test(
    scores_a: List[float],
    scores_b: List[float],
    test_type: str = "paired_t",
    alpha: float = 0.05,
) -> dict:
    """Test statistical significance between two models' paired scores.

    Args:
        scores_a: Per-example scores for model A.
        scores_b: Per-example scores for model B (same examples, same order).
        test_type: "paired_t", "wilcoxon", or "bootstrap".
        alpha: Significance level for the parametric/rank tests.

    Returns:
        For "paired_t"/"wilcoxon": statistic, p-value and significance flag.
        For "bootstrap": mean difference with a 95% percentile CI.

    Raises:
        ValueError: If test_type is not one of the supported tests.
    """
    
    if test_type == "paired_t":
        statistic, p_value = stats.ttest_rel(scores_a, scores_b)
    elif test_type == "wilcoxon":
        statistic, p_value = stats.wilcoxon(scores_a, scores_b)
    elif test_type == "bootstrap":
        # Percentile bootstrap on the per-example score differences.
        diff = np.array(scores_a) - np.array(scores_b)
        bootstrap_diffs = []
        for _ in range(10000):
            sample = np.random.choice(diff, size=len(diff), replace=True)
            bootstrap_diffs.append(np.mean(sample))
        
        ci_lower = np.percentile(bootstrap_diffs, 2.5)
        ci_upper = np.percentile(bootstrap_diffs, 97.5)
        
        return {
            "mean_difference": np.mean(diff),
            "ci_95_lower": ci_lower,
            "ci_95_upper": ci_upper,
            # Significant when the 95% CI excludes zero.
            "significant": not (ci_lower <= 0 <= ci_upper),
        }
    else:
        # Fix: an unknown test_type previously fell through to the return
        # below and raised a confusing NameError on `statistic`.
        raise ValueError(f"Unknown test_type: {test_type!r}")
    
    return {
        "test": test_type,
        "statistic": statistic,
        "p_value": p_value,
        "significant": p_value < alpha,
        "alpha": alpha,
    }

8. Performance Benchmarks

Target Metrics

Task Metric Target State-of-the-Art
NER (strict) F1 ≥0.85 0.92 (CoNLL)
NER (partial) F1 ≥0.90 0.95
Type Classification Accuracy ≥0.80 0.85
Type Classification Macro-F1 ≥0.75 0.80
Entity Linking Hits@1 ≥0.70 0.75
Entity Linking MRR ≥0.75 0.82
NIL Detection F1 ≥0.65 0.72
Retrieval NDCG@10 ≥0.60 0.68
Retrieval MRR ≥0.55 0.62
QA Faithfulness Score ≥0.80 0.85
QA Relevance Score ≥0.75 0.82

Baseline Models

# Reference baselines each pipeline stage is compared against,
# keyed by task -> {baseline name: implementation identifier}.
BASELINE_MODELS = {
    "ner": {
        "spacy_nl": "nl_core_news_lg",
        "spacy_en": "en_core_web_trf",
        "regex_only": "pattern_based",
    },
    "type_classification": {
        "keyword_based": "rule_based_classifier",
        "zero_shot": "bart-large-mnli",
    },
    "entity_linking": {
        "exact_match": "string_matching",
        "fuzzy": "rapidfuzz_based",
        "wikidata_api": "wikidata_search",
    },
    "retrieval": {
        "bm25": "elasticsearch_bm25",
        "dense": "all-MiniLM-L6-v2",
    },
}

See Also