glam/scripts/test_v5_extraction.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

188 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Test V5 extraction implementation with known V4 errors.
This script tests the V5 validation methods by extracting from sample text
containing known V4 false positives and valid institutions.
"""
import sys
from pathlib import Path
# Prepend <project_root>/src to the module search path so the
# `glam_extractor` import resolves against the sources next to this script.
project_root = Path(__file__).parent.parent
sys.path[:0] = [str(project_root / "src")]
from glam_extractor.extractors.nlp_extractor import InstitutionExtractor
# Fixture text passed to the extractor by test_v5_extraction().
# Its sections contain three Dutch institutions that should be extracted,
# followed by six names (organizations/networks, generic descriptors and
# non-Dutch institutions) that the V5 validation is expected to filter out.
# The names listed in test_v5_extraction() must stay in sync with this text.
SAMPLE_TEXT = """
# Dutch Heritage Institutions Discussion
## Valid Institutions (should be extracted)
The Van Abbemuseum in Eindhoven is a renowned modern art museum with an
ISIL code NL-EhdVAM. Founded in 1936, it holds one of the finest collections
of modern and contemporary art in Europe.
The Zeeuws Archief in Middelburg preserves historical records from the
province of Zeeland. ISIL: NL-MdlZA. It maintains extensive archival
collections dating back to the 13th century.
Historisch Centrum Overijssel (ISIL: NL-ZwHCO) in Zwolle serves as the
regional archive for the province of Overijssel.
## Organizations/Networks (should be filtered by V5)
The IFLA Library is an international organization that coordinates library
networks worldwide.
Archive Net is a global network connecting archives across multiple continents.
## Generic Descriptors (should be filtered by V5)
The Library FabLab is a makerspace facility open to the public.
University Library is part of the academic infrastructure.
## Geographic Errors (should be filtered by V5 country validation)
The National Museum of Malaysia in Kuala Lumpur houses extensive Southeast
Asian collections.
University Malaysia specializes in research and education.
"""
def _is_extracted(name, institutions):
    """Return True if *name* occurs, case-insensitively, in any institution's name."""
    needle = name.lower()
    return any(needle in inst.name.lower() for inst in institutions)


def test_v5_extraction():
    """Run the extractor on SAMPLE_TEXT and report precision against known cases.

    Prints every extracted institution, splits the results into those whose
    first location has country "NL" and everything else, then checks which of
    the expected-valid and expected-filtered names were extracted.

    Returns:
        bool: True when precision (expected-valid names found among the NL
        results, divided by the total number of extracted institutions) is at
        least 75%; False otherwise, including when extraction itself fails.
    """
    rule = "=" * 70
    print(rule)
    print("Testing V5 Extraction Implementation")
    print(rule)
    print()

    # Initialize extractor and run it over the fixture text
    extractor = InstitutionExtractor()
    result = extractor.extract_from_text(
        SAMPLE_TEXT,
        conversation_id="test-v5-sample",
        conversation_name="Netherlands_GLAM_test",
    )
    if not result.success:
        print(f"✗ Extraction failed: {result.error}")
        return False

    institutions = result.value
    print(f"Extracted {len(institutions)} institutions:")
    print()

    # Categorize results by the country of the first location
    valid_dutch = []
    false_positives = []
    for inst in institutions:
        # An empty/missing location list or a missing country both count as UNKNOWN
        country = "UNKNOWN"
        if inst.locations:
            country = inst.locations[0].country or "UNKNOWN"
        confidence = inst.provenance.confidence_score or 0.0

        print(f" {inst.name}")
        print(f" Type: {inst.institution_type}")
        print(f" Country: {country}")
        print(f" Confidence: {confidence:.2f}")
        if inst.identifiers:
            ids = [f"{i.identifier_scheme}:{i.identifier_value}" for i in inst.identifiers]
            print(f" Identifiers: {', '.join(ids)}")
        print()

        if country == "NL":
            valid_dutch.append(inst)
        else:
            false_positives.append(inst)

    # Summary
    print(rule)
    print("V5 Test Results:")
    print(rule)
    print()
    print(f"Total extracted: {len(institutions)}")
    print(f"Dutch institutions (NL): {len(valid_dutch)}")
    print(f"Other/unknown country: {len(false_positives)}")
    print()

    # Expected results
    expected_valid = [
        "Van Abbemuseum",
        "Zeeuws Archief",
        "Historisch Centrum Overijssel",
    ]
    expected_filtered = [
        "IFLA Library",  # Organization
        "Archive Net",  # Network
        "Library FabLab",  # Generic descriptor
        "University Library",  # Generic
        "National Museum of Malaysia",  # Wrong country
        "University Malaysia",  # Wrong country
    ]

    # Evaluate each expected name once; the same flags drive both the
    # per-name report and the aggregate counts below.
    # Valid names only count when they appear among the NL results.
    valid_hits = [(name, _is_extracted(name, valid_dutch)) for name in expected_valid]
    # Names that should be filtered count as leaked if extracted at all,
    # regardless of the country assigned to them.
    leaked_hits = [(name, _is_extracted(name, institutions)) for name in expected_filtered]

    print(f"Expected to extract ({len(expected_valid)}):")
    for name, found in valid_hits:
        status = "✓" if found else "✗"
        print(f" {status} {name}")
    print()

    print(f"Expected to filter ({len(expected_filtered)}):")
    for name, found in leaked_hits:
        # Polarity is inverted here: being found is the failure case.
        status = "✗" if found else "✓"
        print(f" {status} {name} {'(EXTRACTED - SHOULD BE FILTERED!)' if found else '(filtered)'}")
    print()

    # Calculate precision
    valid_count = sum(found for _, found in valid_hits)
    false_pos_count = sum(found for _, found in leaked_hits)
    total_extracted = len(institutions)
    precision = (valid_count / total_extracted * 100) if total_extracted > 0 else 0
    recall_pct = valid_count / len(expected_valid) * 100

    print(f"Valid institutions extracted: {valid_count}/{len(expected_valid)} ({recall_pct:.0f}%)")
    print(f"False positives extracted: {false_pos_count}/{len(expected_filtered)}")
    print(f"Precision: {precision:.1f}% ({valid_count}/{total_extracted})")
    print()

    # V4 baseline for comparison
    print("V4 Baseline: 50.0% precision (6/12 valid, 6 false positives)")
    print("V5 Target: ≥75% precision")
    print()

    if precision >= 75.0:
        print("✓ V5 PASSES precision target!")
    elif precision > 50.0:
        print("⚠ V5 improved over V4 but below 75% target")
    else:
        print("✗ V5 did not improve over V4")
    return precision >= 75.0
if __name__ == "__main__":
    # Script entry point: exit status 0 when the V5 precision target is met,
    # 1 otherwise (including a failed extraction).
    passed = test_v5_extraction()
    exit_code = 0 if passed else 1
    sys.exit(exit_code)