#!/usr/bin/env python3
"""
Test V5 extraction implementation with known V4 errors.

This script tests the V5 validation methods by extracting from sample text
containing known V4 false positives and valid institutions.

Run it directly: the exit status is 0 when the precision target is met and
1 otherwise.
"""

import sys
from pathlib import Path

# Add project root to path. This must run before the glam_extractor import
# below, which is resolved through the "src" directory inserted here.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "src"))

from glam_extractor.extractors.nlp_extractor import InstitutionExtractor

# Sample text with known V4 errors + valid institutions.
# NOTE(review): line breaks were reconstructed from the Markdown headings and
# paragraph boundaries - confirm against the original fixture.
SAMPLE_TEXT = """
# Dutch Heritage Institutions Discussion

## Valid Institutions (should be extracted)

The Van Abbemuseum in Eindhoven is a renowned modern art museum with an ISIL code NL-EhdVAM. Founded in 1936, it holds one of the finest collections of modern and contemporary art in Europe.

The Zeeuws Archief in Middelburg preserves historical records from the province of Zeeland. ISIL: NL-MdlZA. It maintains extensive archival collections dating back to the 13th century.

Historisch Centrum Overijssel (ISIL: NL-ZwHCO) in Zwolle serves as the regional archive for the province of Overijssel.

## Organizations/Networks (should be filtered by V5)

The IFLA Library is an international organization that coordinates library networks worldwide.

Archive Net is a global network connecting archives across multiple continents.

## Generic Descriptors (should be filtered by V5)

The Library FabLab is a makerspace facility open to the public.

University Library is part of the academic infrastructure.

## Geographic Errors (should be filtered by V5 country validation)

The National Museum of Malaysia in Kuala Lumpur houses extensive Southeast Asian collections.

University Malaysia specializes in research and education.
"""

# Institutions that must survive V5 validation.
EXPECTED_VALID = (
    "Van Abbemuseum",
    "Zeeuws Archief",
    "Historisch Centrum Overijssel",
)

# Known V4 false positives that V5 is expected to drop.
EXPECTED_FILTERED = (
    "IFLA Library",  # Organization
    "Archive Net",  # Network
    "Library FabLab",  # Generic descriptor
    "University Library",  # Generic
    "National Museum of Malaysia",  # Wrong country
    "University Malaysia",  # Wrong country
)

# Precision (in percent) reported for V4 and required from V5.
V4_BASELINE_PRECISION = 50.0
V5_TARGET_PRECISION = 75.0

_BANNER = "=" * 70


def _primary_country(inst) -> str:
    """Return the country of the first location of *inst*, or "UNKNOWN"."""
    if inst.locations:
        return inst.locations[0].country or "UNKNOWN"
    return "UNKNOWN"


def _is_extracted(name: str, institutions) -> bool:
    """Return True if *name* occurs (case-insensitively) in any institution name."""
    needle = name.lower()
    return any(needle in inst.name.lower() for inst in institutions)


def _print_institution(inst, country: str) -> None:
    """Print name, type, country, confidence and identifiers of *inst*."""
    confidence = inst.provenance.confidence_score or 0.0

    print(f" {inst.name}")
    print(f" Type: {inst.institution_type}")
    print(f" Country: {country}")
    print(f" Confidence: {confidence:.2f}")

    if inst.identifiers:
        ids = [
            f"{i.identifier_scheme}:{i.identifier_value}"
            for i in inst.identifiers
        ]
        print(f" Identifiers: {', '.join(ids)}")
    print()


def test_v5_extraction():
    """Test V5 extraction with sample text.

    Runs ``InstitutionExtractor.extract_from_text`` on ``SAMPLE_TEXT``, prints
    every extracted institution, and compares the result against the expected
    valid names and the expected filtered names.

    Precision is the number of expected valid names found among the
    institutions located in "NL", divided by the total number of extracted
    institutions, expressed in percent.

    Returns:
        True when precision is at least 75%, False otherwise (including when
        the extraction itself reports failure).

    NOTE(review): the outcome is returned rather than asserted, so a pytest
    run would not fail on low precision; the script entry point turns the
    return value into the exit status.
    """
    print(_BANNER)
    print("Testing V5 Extraction Implementation")
    print(_BANNER)
    print()

    # Initialize extractor
    extractor = InstitutionExtractor()

    # Extract institutions
    result = extractor.extract_from_text(
        SAMPLE_TEXT,
        conversation_id="test-v5-sample",
        conversation_name="Netherlands_GLAM_test",
    )

    if not result.success:
        print(f"✗ Extraction failed: {result.error}")
        return False

    institutions = result.value

    print(f"Extracted {len(institutions)} institutions:")
    print()

    # Display each institution and split the results by country.
    valid_dutch = []
    false_positives = []
    for inst in institutions:
        country = _primary_country(inst)
        _print_institution(inst, country)
        if country == "NL":
            valid_dutch.append(inst)
        else:
            false_positives.append(inst)

    # Summary
    print(_BANNER)
    print("V5 Test Results:")
    print(_BANNER)
    print()
    print(f"Total extracted: {len(institutions)}")
    print(f"Dutch institutions (NL): {len(valid_dutch)}")
    print(f"Other/unknown country: {len(false_positives)}")
    print()

    # Match each expected name once; reuse the result for display and counts.
    # Valid names only count when found among the Dutch institutions, while
    # filtered names count as leaked when found anywhere in the output.
    valid_found = {
        name: _is_extracted(name, valid_dutch) for name in EXPECTED_VALID
    }
    filtered_found = {
        name: _is_extracted(name, institutions) for name in EXPECTED_FILTERED
    }

    print(f"Expected to extract ({len(EXPECTED_VALID)}):")
    for name, found in valid_found.items():
        status = "✓" if found else "✗"
        print(f" {status} {name}")
    print()

    print(f"Expected to filter ({len(EXPECTED_FILTERED)}):")
    for name, found in filtered_found.items():
        status = "✗" if found else "✓"
        note = "(EXTRACTED - SHOULD BE FILTERED!)" if found else "(filtered)"
        print(f" {status} {name} {note}")
    print()

    # Calculate precision
    valid_count = sum(valid_found.values())
    false_pos_count = sum(filtered_found.values())
    total_extracted = len(institutions)

    precision = (
        (valid_count / total_extracted * 100) if total_extracted > 0 else 0
    )
    recall_pct = valid_count / len(EXPECTED_VALID) * 100

    print(
        f"Valid institutions extracted: "
        f"{valid_count}/{len(EXPECTED_VALID)} ({recall_pct:.0f}%)"
    )
    print(
        f"False positives extracted: "
        f"{false_pos_count}/{len(EXPECTED_FILTERED)}"
    )
    print(f"Precision: {precision:.1f}% ({valid_count}/{total_extracted})")
    print()

    # V4 baseline for comparison
    print("V4 Baseline: 50.0% precision (6/12 valid, 6 false positives)")
    print("V5 Target: ≥75% precision")
    print()

    if precision >= V5_TARGET_PRECISION:
        print("✓ V5 PASSES precision target!")
    elif precision > V4_BASELINE_PRECISION:
        print("⚠ V5 improved over V4 but below 75% target")
    else:
        print("✗ V5 did not improve over V4")

    return precision >= V5_TARGET_PRECISION


if __name__ == "__main__":
    success = test_v5_extraction()
    sys.exit(0 if success else 1)