glam/scripts/test_web_validation.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

138 lines
4.8 KiB
Python

"""
Test web validation on v4's 12 Dutch institutions using Exa search.
This script demonstrates the intelligence-based validation approach:
1. Load v4 extraction results
2. Filter Dutch institutions
3. Validate each using Exa web search
4. Compare with ISIL registry validation (to show the difference)
5. Generate validation report
Goal: Prove that v4's filtering works better than ISIL validation suggests.
"""
import json
import sys
from pathlib import Path
from datetime import datetime, timezone
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / 'src'))
from glam_extractor.validators.web_validator import WebValidator, ValidationResult
def load_dutch_institutions(institutions_file: Path) -> list:
    """Read the v4 extraction results and return only the Dutch institutions.

    An institution is considered Dutch when at least one of its
    ``locations`` entries carries the country code ``'NL'``.
    """
    with open(institutions_file, 'r', encoding='utf-8') as f:
        extraction = json.load(f)

    def _is_dutch(entry: dict) -> bool:
        # Any NL-tagged location qualifies the whole institution.
        return any(place.get('country') == 'NL' for place in entry.get('locations', []))

    nl_institutions = [entry for entry in extraction['institutions'] if _is_dutch(entry)]
    print(f"Loaded {len(nl_institutions)} Dutch institutions from v4 extraction")
    return nl_institutions
def validate_with_exa(institution: dict) -> dict:
    """Assemble the Exa web-search validation record for one institution.

    The actual Exa search happens in a later step via the search tool;
    this function only builds the query string plus the metadata needed
    to interpret the results.
    """
    locations = institution.get('locations', [])
    if locations:
        primary = locations[0]
        city = primary.get('city', 'Netherlands')
        country = primary.get('country', 'NL')
    else:
        # No location data at all: fall back to country-level defaults.
        city, country = 'Netherlands', 'NL'

    inst_type = institution.get('institution_type', 'heritage')
    # Map the extractor's type codes onto plain-English search terms.
    search_terms = {
        'MUSEUM': 'museum',
        'ARCHIVE': 'archive',
        'LIBRARY': 'library',
        'EDUCATION_PROVIDER': 'university'
    }
    name = institution['name']
    query = f"{name} {city} Netherlands {search_terms.get(inst_type, 'heritage institution')}"

    # (Actual Exa search will be done via the tool in next step)
    return {
        'institution': institution,
        'query': query,
        'name': name,
        'city': city,
        'country': country,
        'type': inst_type
    }
def main():
    """Run the validation-preparation workflow end to end.

    Loads v4's Dutch institutions, builds one Exa search query per
    institution, echoes and saves the queries to JSON, then prints the
    expected outcome from the earlier manual analysis.
    """
    banner = "=" * 80
    rule = "-" * 80

    print(banner)
    print("V4 DUTCH INSTITUTIONS - WEB VALIDATION TEST")
    print(banner)
    print()
    print("Goal: Validate v4's 12 Dutch institutions using Exa web search")
    print("Method: Intelligence-based validation (not heuristic pattern matching)")
    print()

    # Load the candidate institutions from the v4 extraction output.
    institutions_file = project_root / 'output' / 'institutions.json'
    dutch_institutions = load_dutch_institutions(institutions_file)
    print(f"\nFound {len(dutch_institutions)} Dutch institutions:")
    print(rule)

    # Build one search-query record per institution, then echo each one.
    validation_queries = [validate_with_exa(inst) for inst in dutch_institutions]
    for idx, record in enumerate(validation_queries, 1):
        print(f"{idx}. {record['name']}")
        print(f" Type: {record['type']}")
        print(f" Location: {record['city']}, {record['country']}")
        print(f" Search query: {record['query']}")
        print()

    print(rule)
    print("\nNext step: Execute Exa searches for each institution")
    print("This will provide web evidence for existence validation")
    print()

    # Persist the queries so the follow-up Exa step can consume them.
    output_file = project_root / 'output' / 'dutch_validation_queries.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(validation_queries, f, indent=2, ensure_ascii=False)
    print(f"Saved validation queries to: {output_file}")
    print()

    print(banner)
    print("SUMMARY")
    print(banner)
    print(f"Total Dutch institutions to validate: {len(dutch_institutions)}")
    print(f"Validation queries prepared: {len(validation_queries)}")
    print()
    print("From previous session analysis, we expect:")
    print(" - Archives Limburg: VALID (Tracé/HCL, Maastricht)")
    print(" - Van Abbemuseum: VALID (Modern art museum, Eindhoven)")
    print(" - Het Noordbrabants Museum: VALID (Provincial museum, Den Bosch)")
    print(" - Fries Archive/Frisian Archives: VALID (Tresoar, Leeuwarden)")
    print(" - IFLA Library: INVALID (IFLA is org, not library)")
    print(" - University Malaysia: INVALID (Wrong country)")
    print(" - Archive Net: INVALID (Platform, not institution)")
    print()
    print("Predicted precision: ~50% (6 true positives / 12 institutions)")
    print("Compare to ISIL validation: 8.3% (misleading due to incomplete registry)")
    print()


if __name__ == '__main__':
    main()