- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
138 lines
4.8 KiB
Python
"""
|
|
Test web validation on v4's 12 Dutch institutions using Exa search.
|
|
|
|
This script demonstrates the intelligence-based validation approach:
|
|
1. Load v4 extraction results
|
|
2. Filter Dutch institutions
|
|
3. Validate each using Exa web search
|
|
4. Compare with ISIL registry validation (to show the difference)
|
|
5. Generate validation report
|
|
|
|
Goal: Prove that v4's filtering works better than ISIL validation suggests.
|
|
"""
|
|
|
|
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

# Make the in-repo package importable when this script is run directly
# (the package lives under <repo>/src, one level above this script's folder).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / 'src'))

from glam_extractor.validators.web_validator import WebValidator, ValidationResult
|
def load_dutch_institutions(institutions_file: Path) -> list:
    """Return the institutions from *institutions_file* located in the Netherlands.

    An institution qualifies when any entry in its ``locations`` list reports
    ``country == 'NL'``. The file is expected to be the v4 extraction output:
    a JSON object with a top-level ``institutions`` array.
    """
    with open(institutions_file, 'r', encoding='utf-8') as f:
        payload = json.load(f)

    dutch = []
    for inst in payload['institutions']:
        locations = inst.get('locations', [])
        if any(loc.get('country') == 'NL' for loc in locations):
            dutch.append(inst)

    print(f"Loaded {len(dutch)} Dutch institutions from v4 extraction")
    return dutch
|
def validate_with_exa(institution: dict) -> dict:
    """Prepare the Exa web-search payload for one institution.

    Builds a human-readable search query from the institution's name, first
    listed location, and mapped type, and returns it together with the fields
    it was built from. The actual Exa search is executed in a later step via
    the exa_web_search_exa tool.
    """
    name = institution['name']
    locations = institution.get('locations', [])
    if locations:
        city = locations[0].get('city', 'Netherlands')
        country = locations[0].get('country', 'NL')
    else:
        # No location on record: fall back to country-level defaults.
        city, country = 'Netherlands', 'NL'
    inst_type = institution.get('institution_type', 'heritage')

    # Translate extractor type codes into plain-English search terms;
    # anything unmapped falls back to the generic "heritage institution".
    type_map = {
        'MUSEUM': 'museum',
        'ARCHIVE': 'archive',
        'LIBRARY': 'library',
        'EDUCATION_PROVIDER': 'university',
    }
    type_str = type_map.get(inst_type, 'heritage institution')

    query = f"{name} {city} Netherlands {type_str}"

    # Return validation structure
    # (Actual Exa search will be done via the tool in next step)
    return {
        'institution': institution,
        'query': query,
        'name': name,
        'city': city,
        'country': country,
        'type': inst_type,
    }
|
def main():
    """Main validation workflow: load, query-prep, report, and save."""
    banner = "=" * 80
    divider = "-" * 80

    print(banner)
    print("V4 DUTCH INSTITUTIONS - WEB VALIDATION TEST")
    print(banner)
    print()
    print("Goal: Validate v4's 12 Dutch institutions using Exa web search")
    print("Method: Intelligence-based validation (not heuristic pattern matching)")
    print()

    # Load the v4 extraction output and keep only Dutch institutions.
    institutions_file = project_root / 'output' / 'institutions.json'
    dutch_institutions = load_dutch_institutions(institutions_file)

    print(f"\nFound {len(dutch_institutions)} Dutch institutions:")
    print(divider)

    # Build one Exa search payload per institution and echo it for review.
    validation_queries = []
    for idx, inst in enumerate(dutch_institutions, 1):
        entry = validate_with_exa(inst)
        validation_queries.append(entry)

        print(f"{idx}. {entry['name']}")
        print(f" Type: {entry['type']}")
        print(f" Location: {entry['city']}, {entry['country']}")
        print(f" Search query: {entry['query']}")
        print()

    print(divider)
    print("\nNext step: Execute Exa searches for each institution")
    print("This will provide web evidence for existence validation")
    print()

    # Persist the prepared queries so the search step can consume them.
    output_file = project_root / 'output' / 'dutch_validation_queries.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(validation_queries, f, indent=2, ensure_ascii=False)

    print(f"Saved validation queries to: {output_file}")
    print()
    print(banner)
    print("SUMMARY")
    print(banner)
    print(f"Total Dutch institutions to validate: {len(dutch_institutions)}")
    print(f"Validation queries prepared: {len(validation_queries)}")
    print()
    print("From previous session analysis, we expect:")
    expectations = (
        " - Archives Limburg: VALID (Tracé/HCL, Maastricht)",
        " - Van Abbemuseum: VALID (Modern art museum, Eindhoven)",
        " - Het Noordbrabants Museum: VALID (Provincial museum, Den Bosch)",
        " - Fries Archive/Frisian Archives: VALID (Tresoar, Leeuwarden)",
        " - IFLA Library: INVALID (IFLA is org, not library)",
        " - University Malaysia: INVALID (Wrong country)",
        " - Archive Net: INVALID (Platform, not institution)",
    )
    for line in expectations:
        print(line)
    print()
    print("Predicted precision: ~50% (6 true positives / 12 institutions)")
    print("Compare to ISIL validation: 8.3% (misleading due to incomplete registry)")
    print()


if __name__ == '__main__':
    main()