glam/scripts/validate_dutch_extraction.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

399 lines
13 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Validate Dutch Institution Extraction Quality
Cross-links extracted NL institutions with authoritative ISIL registry to measure:
- Precision: % of extracted institutions that are real
- Recall: % of known institutions that were extracted
- F1 Score: Harmonic mean of precision and recall
Outputs detailed analysis to help identify false positives and false negatives.
"""
import csv
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
from dataclasses import dataclass
from rapidfuzz import fuzz
import re
@dataclass
class Institution:
    """A single heritage institution, from either the extraction output or the ISIL registry."""
    name: str
    city: str
    isil_code: str
    source: str  # 'extracted' or 'registry'
    raw_record: Dict

    def __str__(self):
        # Fall back to a readable placeholder when no ISIL code is known.
        code_label = self.isil_code or 'no ISIL'
        return f"{self.name} ({self.city}) [{code_label}]"
def normalize_name(name: str) -> str:
    """Return a lowercased, de-noised form of an institution name for matching.

    Strips common Dutch organisational words (stichting, gemeente, museum,
    archief, bibliotheek), replaces punctuation with spaces, and collapses
    runs of whitespace, so near-identical names compare equal.
    """
    if not name:
        return ""
    cleaned = name.lower()
    # Generic organisational terms carry no identity — drop them.
    cleaned = re.sub(r'\b(stichting|gemeente|museum|archief|bibliotheek)\b', '', cleaned)
    # Punctuation becomes whitespace, then whitespace runs collapse to one space.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()
def extract_isil_codes(identifiers_str: str) -> List[str]:
    """Pull Dutch ISIL codes out of a semicolon-separated identifier string.

    Only codes marked with an 'ISIL:' prefix and starting with 'NL-' are kept.
    Returns an empty list for empty/None input.
    """
    if not identifiers_str:
        return []
    codes = []
    for fragment in identifiers_str.split(';'):
        if 'ISIL:' not in fragment:
            continue
        candidate = fragment.split('ISIL:')[1].strip()
        # Restrict to Dutch ISIL codes (NL- prefix).
        if candidate and candidate.startswith('NL-'):
            codes.append(candidate)
    return codes
def load_extracted_institutions(csv_path: Path) -> List[Institution]:
    """Read the batch-extraction CSV and return only the Dutch (country=NL) rows.

    Each qualifying row becomes an Institution with source='extracted';
    the first NL ISIL code found in the 'identifiers' column (if any) is kept.
    """
    results = []
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            # Keep Netherlands records only.
            if row.get('country') != 'NL':
                continue
            # First NL ISIL code from the identifiers field, or '' if none.
            codes = extract_isil_codes(row.get('identifiers', ''))
            results.append(Institution(
                name=row['name'],
                city=row.get('city', ''),
                isil_code=codes[0] if codes else '',
                source='extracted',
                raw_record=row,
            ))
    return results
def load_isil_registry(csv_path: Path) -> List[Institution]:
    """Load institutions from the ISIL registry CSV.

    Each data line is parsed by extracting its double-quoted fields with a
    regex; rows without a non-empty name and a valid 'NL-' ISIL code are
    skipped.

    Expected columns: Volgnr, Plaats, Instelling, ISIL code, Toegekend op,
    Opmerking.

    Args:
        csv_path: Path to the registry CSV (may begin with a UTF-8 BOM).

    Returns:
        Registry institutions with source='registry'.
    """
    institutions = []
    # utf-8-sig transparently strips a leading BOM if the export has one.
    with open(csv_path, 'r', encoding='utf-8-sig') as handle:
        lines = handle.readlines()
    # Skip header row.
    for line in lines[1:]:
        # Extract quoted fields using regex (module-level `re`; the former
        # function-local re-import was redundant and has been removed).
        fields = re.findall(r'"([^"]*)"', line)
        # Filter out separators (commas, semicolons, empty strings).
        # NOTE(review): dropping empty quoted fields shifts the positional
        # indices below whenever an early column (e.g. Plaats) is empty —
        # confirm the registry export never leaves leading columns blank.
        fields = [value for value in fields if value and value not in (',', ';')]
        if len(fields) < 4:
            continue
        # Positional layout after filtering: [1]=Plaats, [2]=Instelling, [3]=ISIL code.
        plaats = fields[1].strip()
        instelling = fields[2].strip()
        isil_code = fields[3].strip()
        # Skip rows missing a name or lacking a Dutch ISIL code.
        if not instelling or not isil_code or not isil_code.startswith('NL-'):
            continue
        institutions.append(Institution(
            name=instelling,
            city=plaats,
            isil_code=isil_code,
            source='registry',
            raw_record={'plaats': plaats, 'instelling': instelling, 'isil_code': isil_code}
        ))
    return institutions
def cross_link_by_isil(
    extracted: List[Institution],
    registry: List[Institution]
) -> Tuple[List[Tuple[Institution, Institution]], List[Institution], List[Institution]]:
    """Pair extracted and registry institutions that share an ISIL code.

    Returns:
        - matched: list of (extracted, registry) pairs with the same ISIL code
        - unmatched_extracted: extracted institutions with no ISIL match
          (including those with no ISIL code at all)
        - unmatched_registry: registry institutions not matched by ISIL
    """
    # Index each side by ISIL code (entries without a code are excluded).
    registry_index = {inst.isil_code: inst for inst in registry if inst.isil_code}
    extracted_index = {inst.isil_code: inst for inst in extracted if inst.isil_code}

    # Pairs that agree on ISIL code.
    matched = [
        (ext_inst, registry_index[code])
        for code, ext_inst in extracted_index.items()
        if code in registry_index
    ]

    # Everything else on each side.
    hit_codes = {pair[0].isil_code for pair in matched}
    unmatched_extracted = [
        inst for inst in extracted
        if not inst.isil_code or inst.isil_code not in hit_codes
    ]
    unmatched_registry = [
        inst for inst in registry
        if inst.isil_code not in hit_codes
    ]
    return matched, unmatched_extracted, unmatched_registry
def fuzzy_match_by_name(
    unmatched_extracted: List[Institution],
    unmatched_registry: List[Institution],
    threshold: float = 85.0
) -> Tuple[List[Tuple[Institution, Institution, float]], List[Institution], List[Institution]]:
    """Greedily match remaining institutions by fuzzy name similarity.

    For each extracted institution (in order), picks the not-yet-used registry
    entry with the highest similarity score, provided it meets `threshold`.
    The score is the max of ratio, partial_ratio, and token_sort_ratio over
    the normalized names.

    Args:
        unmatched_extracted: Extracted institutions without an ISIL match.
        unmatched_registry: Registry institutions without an ISIL match.
        threshold: Minimum similarity (0-100) to accept a match.

    Returns:
        - matched: list of (extracted, registry, score) triples
        - remaining_extracted: still unmatched extracted institutions
        - remaining_registry: still unmatched registry institutions
    """
    matched = []
    used_registry = set()
    used_extracted = set()
    # Perf fix: normalize each registry name ONCE up front instead of
    # re-running three regex substitutions per (extracted x registry) pair.
    registry_names = [normalize_name(inst.name) for inst in unmatched_registry]
    for ext_inst in unmatched_extracted:
        ext_name = normalize_name(ext_inst.name)
        best_match = None
        best_score = 0.0
        for i, reg_inst in enumerate(unmatched_registry):
            if i in used_registry:
                continue  # one-to-one matching: registry entry already claimed
            reg_name = registry_names[i]
            # Best of three strategies so word order and substring
            # containment both count toward a match.
            score = max(
                fuzz.ratio(ext_name, reg_name),
                fuzz.partial_ratio(ext_name, reg_name),
                fuzz.token_sort_ratio(ext_name, reg_name),
            )
            if score > best_score and score >= threshold:
                best_score = score
                best_match = (reg_inst, i)
        if best_match:
            matched.append((ext_inst, best_match[0], best_score))
            # id() keys tolerate unhashable/duplicate Institution objects.
            used_extracted.add(id(ext_inst))
            used_registry.add(best_match[1])
    # Whatever neither loop claimed stays unmatched.
    remaining_extracted = [
        inst for inst in unmatched_extracted
        if id(inst) not in used_extracted
    ]
    remaining_registry = [
        inst for i, inst in enumerate(unmatched_registry)
        if i not in used_registry
    ]
    return matched, remaining_extracted, remaining_registry
def calculate_metrics(
    isil_matched: int,
    fuzzy_matched: int,
    total_extracted_nl: int,
    total_registry: int
) -> Dict[str, float]:
    """Compute precision, recall, and F1 for the matching run.

    Precision = matched / total extracted NL institutions.
    Recall = matched / total registry institutions.
    F1 = harmonic mean of precision and recall.
    Zero denominators yield 0.0 rather than raising.
    """
    total_matched = isil_matched + fuzzy_matched
    # Of everything we extracted, how much is confirmed real?
    precision = 0.0 if total_extracted_nl <= 0 else total_matched / total_extracted_nl
    # Of everything known to exist, how much did we find?
    recall = 0.0 if total_registry <= 0 else total_matched / total_registry
    denom = precision + recall
    f1 = (2 * precision * recall) / denom if denom > 0 else 0.0
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'total_matched': total_matched,
        'isil_matched': isil_matched,
        'fuzzy_matched': fuzzy_matched,
    }
def print_report(
    extracted: List[Institution],
    registry: List[Institution],
    isil_matches: List[Tuple[Institution, Institution]],
    fuzzy_matches: List[Tuple[Institution, Institution, float]],
    false_positives: List[Institution],
    false_negatives: List[Institution],
    metrics: Dict[str, float]
):
    """Print comprehensive validation report to stdout.

    Sections: dataset summary, matching results, quality metrics, false
    positives (top 15), false negatives (top 15), fuzzy matches needing
    verification (top 10 by score), and a final summary.

    Args:
        extracted: All extracted NL institutions.
        registry: All ISIL registry institutions.
        isil_matches: (extracted, registry) pairs matched by ISIL code.
        fuzzy_matches: (extracted, registry, score) triples from name matching.
        false_positives: Extracted institutions with no registry match.
        false_negatives: Registry institutions never extracted.
        metrics: Dict produced by calculate_metrics().
    """
    print("=" * 80)
    print("DUTCH INSTITUTION EXTRACTION VALIDATION REPORT")
    print("=" * 80)
    print()
    # --- Dataset sizes -----------------------------------------------------
    print("📊 DATASET SUMMARY")
    print("-" * 80)
    print(f"Total extracted institutions (country=NL): {len(extracted)}")
    print(f"Total ISIL registry institutions: {len(registry)}")
    print()
    # --- Match counts ------------------------------------------------------
    print("🎯 MATCHING RESULTS")
    print("-" * 80)
    print(f"ISIL code matches: {len(isil_matches):3d} institutions")
    print(f"Fuzzy name matches: {len(fuzzy_matches):3d} institutions (threshold: 85%)")
    print(f"Total matched: {metrics['total_matched']:3d} institutions")
    print()
    # --- Precision / recall / F1 ------------------------------------------
    print("📈 QUALITY METRICS")
    print("-" * 80)
    print(f"Precision: {metrics['precision']:6.1%} (What % of extracted are real?)")
    print(f"Recall: {metrics['recall']:6.1%} (What % of known institutions found?)")
    print(f"F1 Score: {metrics['f1_score']:6.1%} (Harmonic mean of precision/recall)")
    print()
    # --- False positives: extracted but unverified -------------------------
    print("❌ FALSE POSITIVES (extracted but not in registry)")
    print("-" * 80)
    print(f"Count: {len(false_positives)}")
    if false_positives:
        print("\nTop 15 false positives:")
        for i, inst in enumerate(false_positives[:15], 1):
            # Extraction confidence score, if the CSV row carried one.
            confidence = inst.raw_record.get('confidence_score', 'N/A')
            print(f"  {i:2d}. {inst.name[:60]:60s} (confidence: {confidence})")
    print()
    # --- False negatives: known institutions we missed ---------------------
    print("❌ FALSE NEGATIVES (in registry but not extracted)")
    print("-" * 80)
    print(f"Count: {len(false_negatives)}")
    if false_negatives:
        print("\nTop 15 false negatives:")
        for i, inst in enumerate(false_negatives[:15], 1):
            print(f"  {i:2d}. {inst.name[:60]:60s} [{inst.isil_code}] ({inst.city})")
    print()
    # --- Fuzzy matches: medium confidence, listed best-first ---------------
    if fuzzy_matches:
        print("🔍 FUZZY NAME MATCHES (require verification)")
        print("-" * 80)
        print(f"Count: {len(fuzzy_matches)}")
        print("\nTop 10 fuzzy matches:")
        # Sort descending by score (third tuple element).
        for i, (ext, reg, score) in enumerate(sorted(fuzzy_matches, key=lambda x: -x[2])[:10], 1):
            print(f"  {i:2d}. Score: {score:5.1f}%")
            print(f"      Extracted: {ext.name}")
            print(f"      Registry: {reg.name} [{reg.isil_code}]")
        print()
    # --- Final roll-up ------------------------------------------------------
    print("=" * 80)
    print("📋 SUMMARY")
    print("-" * 80)
    print(f"✅ High-confidence matches (ISIL): {len(isil_matches):3d}")
    print(f"⚠️ Medium-confidence (fuzzy name): {len(fuzzy_matches):3d}")
    print(f"❌ False positives: {len(false_positives):3d}")
    print(f"❌ False negatives: {len(false_negatives):3d}")
    print()
    print(f"Overall quality: Precision={metrics['precision']:.1%}, Recall={metrics['recall']:.1%}, F1={metrics['f1_score']:.1%}")
    print("=" * 80)
def main():
    """Run the full validation workflow: load, cross-link, fuzzy match, report.

    Returns a process exit code: 0 on success, 1 when an input file is missing.
    """
    # Expected input locations, relative to the working directory.
    extracted_csv = Path('output/institutions.csv')
    registry_csv = Path('data/ISIL-codes_2025-08-01.csv')

    # Fail fast if either input is absent (first missing file wins).
    for required_path in (extracted_csv, registry_csv):
        if not required_path.exists():
            print(f"Error: {required_path} not found", file=sys.stderr)
            return 1

    # Stage 1: load both datasets.
    print("Loading extracted institutions...")
    extracted = load_extracted_institutions(extracted_csv)
    print(f"  Found {len(extracted)} NL institutions")
    print("Loading ISIL registry...")
    registry = load_isil_registry(registry_csv)
    print(f"  Found {len(registry)} registry institutions")
    print()

    # Stage 2: exact matching on ISIL codes.
    print("Cross-linking by ISIL code...")
    isil_matches, unmatched_extracted, unmatched_registry = cross_link_by_isil(extracted, registry)
    print(f"  {len(isil_matches)} ISIL matches")
    print()

    # Stage 3: fuzzy name matching on the leftovers.
    print("Fuzzy matching by institution name...")
    fuzzy_matches, false_positives, false_negatives = fuzzy_match_by_name(
        unmatched_extracted,
        unmatched_registry,
        threshold=85.0
    )
    print(f"  {len(fuzzy_matches)} fuzzy matches (≥85% similarity)")
    print()

    # Stage 4: metrics and report.
    metrics = calculate_metrics(
        isil_matched=len(isil_matches),
        fuzzy_matched=len(fuzzy_matches),
        total_extracted_nl=len(extracted),
        total_registry=len(registry)
    )
    print_report(
        extracted=extracted,
        registry=registry,
        isil_matches=isil_matches,
        fuzzy_matches=fuzzy_matches,
        false_positives=false_positives,
        false_negatives=false_negatives,
        metrics=metrics
    )
    return 0
# Script entry point: the process exit status mirrors main()'s return code.
if __name__ == '__main__':
    sys.exit(main())