# glam/scripts/test_v5_extraction.py

#!/usr/bin/env python3
"""
Test V5 extraction implementation with known V4 errors.

This script tests the V5 validation methods by extracting from sample text
containing known V4 false positives and valid institutions.
"""

import sys
from pathlib import Path

# Make "<project_root>/src" importable. The project root is taken to be the
# parent of the directory containing this script; inserting at index 0 puts
# that directory ahead of every other sys.path entry.
# NOTE(review): presumably this lets the glam_extractor import that follows
# resolve without installing the package - confirm against the repo layout.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "src"))

from glam_extractor.extractors.nlp_extractor import InstitutionExtractor


# Sample text fed to the extractor. It mixes three Dutch institutions with
# ISIL codes (expected to be extracted) with six known V4 false positives:
# organizations/networks, generic descriptors, and non-Dutch institutions
# (all expected to be filtered out by V5).
SAMPLE_TEXT = """
# Dutch Heritage Institutions Discussion

## Valid Institutions (should be extracted)

The Van Abbemuseum in Eindhoven is a renowned modern art museum with an
ISIL code NL-EhdVAM. Founded in 1936, it holds one of the finest collections
of modern and contemporary art in Europe.

The Zeeuws Archief in Middelburg preserves historical records from the
province of Zeeland. ISIL: NL-MdlZA. It maintains extensive archival
collections dating back to the 13th century.

Historisch Centrum Overijssel (ISIL: NL-ZwHCO) in Zwolle serves as the
regional archive for the province of Overijssel.

## Organizations/Networks (should be filtered by V5)

The IFLA Library is an international organization that coordinates library
networks worldwide.

Archive Net is a global network connecting archives across multiple continents.

## Generic Descriptors (should be filtered by V5)

The Library FabLab is a makerspace facility open to the public.

University Library is part of the academic infrastructure.

## Geographic Errors (should be filtered by V5 country validation)

The National Museum of Malaysia in Kuala Lumpur houses extensive Southeast
Asian collections.

University Malaysia specializes in research and education.
"""


def test_v5_extraction():
    """Run the extractor over SAMPLE_TEXT and report V5 precision.

    Extracts institutions from the module-level ``SAMPLE_TEXT``, prints every
    extracted record (name, type, country of the first location, confidence,
    identifiers), and compares the result with two hard-coded expectation
    lists: names that should be extracted and known V4 false positives that
    should have been filtered out.

    Precision is the number of expected-valid names found among institutions
    whose first location has country ``"NL"``, divided by the total number of
    extracted institutions (0 when nothing was extracted).

    Returns:
        bool: True when precision reaches the 75% target; False otherwise,
        including when the extractor reports failure.

    NOTE(review): the name starts with ``test_`` but the outcome is reported
    via the return value rather than an assertion, so a run collected by
    pytest would not fail on low precision. The boolean return is kept
    because the ``__main__`` entry point turns it into the exit status.
    """
    # Thresholds (in percent) used both for the verdict and in the messages.
    target_precision = 75.0
    v4_baseline_precision = 50.0

    def name_found(expected_name, candidates):
        """Case-insensitive substring match against the candidates' names."""
        needle = expected_name.lower()
        return any(needle in candidate.name.lower() for candidate in candidates)

    banner = "=" * 70

    print(banner)
    print("Testing V5 Extraction Implementation")
    print(banner)
    print()

    # Initialize extractor
    extractor = InstitutionExtractor()

    # Extract institutions
    result = extractor.extract_from_text(
        SAMPLE_TEXT,
        conversation_id="test-v5-sample",
        conversation_name="Netherlands_GLAM_test"
    )

    if not result.success:
        print(f"✗ Extraction failed: {result.error}")
        return False

    institutions = result.value

    print(f"Extracted {len(institutions)} institutions:")
    print()

    # Categorize results by the country of the first recorded location.
    valid_dutch = []
    false_positives = []

    for inst in institutions:
        # A missing location list and a missing country both map to UNKNOWN.
        country = "UNKNOWN"
        if inst.locations:
            country = inst.locations[0].country or "UNKNOWN"

        # A missing confidence score is displayed as 0.00.
        confidence = inst.provenance.confidence_score or 0.0

        # Display
        print(f"  {inst.name}")
        print(f"    Type: {inst.institution_type}")
        print(f"    Country: {country}")
        print(f"    Confidence: {confidence:.2f}")

        if inst.identifiers:
            ids = [f"{i.identifier_scheme}:{i.identifier_value}" for i in inst.identifiers]
            print(f"    Identifiers: {', '.join(ids)}")

        print()

        if country == "NL":
            valid_dutch.append(inst)
        else:
            false_positives.append(inst)

    # Summary
    print(banner)
    print("V5 Test Results:")
    print(banner)
    print()

    print(f"Total extracted: {len(institutions)}")
    print(f"Dutch institutions (NL): {len(valid_dutch)}")
    print(f"Other/unknown country: {len(false_positives)}")
    print()

    # Expected results
    expected_valid = [
        "Van Abbemuseum",
        "Zeeuws Archief",
        "Historisch Centrum Overijssel"
    ]

    expected_filtered = [
        "IFLA Library",  # Organization
        "Archive Net",  # Network
        "Library FabLab",  # Generic descriptor
        "University Library",  # Generic
        "National Museum of Malaysia",  # Wrong country
        "University Malaysia"  # Wrong country
    ]

    # Evaluate each expectation once; the same results drive both the
    # per-name report and the counts used for precision. Valid names only
    # count when found among the NL institutions, whereas a false positive
    # counts if it was extracted at all, regardless of country.
    valid_hits = [(name, name_found(name, valid_dutch)) for name in expected_valid]
    filtered_hits = [(name, name_found(name, institutions)) for name in expected_filtered]

    print(f"Expected to extract ({len(expected_valid)}):")
    for name, found in valid_hits:
        status = "✓" if found else "✗"
        print(f"  {status} {name}")
    print()

    print(f"Expected to filter ({len(expected_filtered)}):")
    for name, found in filtered_hits:
        if found:
            print(f"  ✗ {name} (EXTRACTED - SHOULD BE FILTERED!)")
        else:
            print(f"  ✓ {name} (filtered)")
    print()

    # Calculate precision
    valid_count = sum(1 for _, found in valid_hits if found)
    false_pos_count = sum(1 for _, found in filtered_hits if found)

    total_extracted = len(institutions)
    precision = (valid_count / total_extracted * 100) if total_extracted > 0 else 0
    recall_pct = valid_count / len(expected_valid) * 100

    print(f"Valid institutions extracted: {valid_count}/{len(expected_valid)} ({recall_pct:.0f}%)")
    print(f"False positives extracted: {false_pos_count}/{len(expected_filtered)}")
    print(f"Precision: {precision:.1f}% ({valid_count}/{total_extracted})")
    print()

    # V4 baseline for comparison
    print(f"V4 Baseline: {v4_baseline_precision:.1f}% precision (6/12 valid, 6 false positives)")
    print(f"V5 Target: ≥{target_precision:.0f}% precision")
    print()

    passed = precision >= target_precision
    if passed:
        print("✓ V5 PASSES precision target!")
    elif precision > v4_baseline_precision:
        print(f"⚠ V5 improved over V4 but below {target_precision:.0f}% target")
    else:
        print("✗ V5 did not improve over V4")

    return passed


if __name__ == "__main__":
    # Script entry point: map the boolean verdict onto a process exit status
    # (0 when the precision target is met, 1 otherwise).
    exit_status = 0 if test_v5_extraction() else 1
    sys.exit(exit_status)