"""
Test web validation on v4's 12 Dutch institutions using Exa search.

This script demonstrates the intelligence-based validation approach:
1. Load v4 extraction results
2. Filter Dutch institutions
3. Validate each using Exa web search
4. Compare with ISIL registry validation (to show the difference)
5. Generate validation report

Goal: Prove that v4's filtering works better than ISIL validation suggests.

NOTE: in its current form the script performs steps 1-2 and prepares the
search queries for step 3 (saved as JSON); the searches themselves are not
executed here.
"""

import json
import sys
from pathlib import Path
from datetime import datetime, timezone

# Make the project's src/ directory importable when run as a script.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / 'src'))

from glam_extractor.validators.web_validator import WebValidator, ValidationResult


def load_dutch_institutions(institutions_file: Path) -> list:
    """Load Dutch institutions from v4 extraction results.

    Args:
        institutions_file: Path to a JSON file with a top-level
            'institutions' list.

    Returns:
        The institution dicts that have at least one location whose
        'country' equals 'NL'.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON has no 'institutions' key.
    """
    with open(institutions_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    nl_institutions = [
        inst for inst in data['institutions']
        # `or []` tolerates an explicit `"locations": null` in the JSON,
        # and `loc and ...` skips null entries inside the list.
        if any(
            loc and loc.get('country') == 'NL'
            for loc in (inst.get('locations') or [])
        )
    ]

    print(f"Loaded {len(nl_institutions)} Dutch institutions from v4 extraction")
    return nl_institutions


def validate_with_exa(institution: dict) -> dict:
    """Prepare the Exa web-search query used to validate an institution.

    No search is executed here; the function only builds the query string
    and returns it together with the fields needed for reporting. The
    search itself is meant to be run in a later step with the
    exa_web_search_exa tool.

    Args:
        institution: Institution dict. 'name' is required; 'locations'
            (list of dicts with 'city' / 'country') and
            'institution_type' are optional.

    Returns:
        A dict with keys 'institution', 'query', 'name', 'city',
        'country' and 'type'.

    Raises:
        KeyError: If the institution has no 'name'.
    """
    default_city = 'Netherlands'
    name = institution['name']

    # Only the first location is used; missing or null values fall back to
    # the Dutch defaults instead of leaking the text "None" into the query.
    locations = institution.get('locations') or []
    first_location = (locations[0] if locations else None) or {}
    city = first_location.get('city') or default_city
    country = first_location.get('country') or 'NL'
    inst_type = institution.get('institution_type', 'heritage')

    # Build search query
    type_map = {
        'MUSEUM': 'museum',
        'ARCHIVE': 'archive',
        'LIBRARY': 'library',
        'EDUCATION_PROVIDER': 'university',
    }
    type_str = type_map.get(inst_type, 'heritage institution')

    # When the city is unknown it falls back to the country name, which the
    # query already contains; leave it out rather than repeating it.
    query_parts = [name]
    if city != default_city:
        query_parts.append(city)
    query_parts.extend(['Netherlands', type_str])
    query = ' '.join(query_parts)

    # Return validation structure
    # (Actual Exa search will be done via the tool in next step)
    return {
        'institution': institution,
        'query': query,
        'name': name,
        'city': city,
        'country': country,
        'type': inst_type,
    }


def main():
    """Main validation workflow.

    Loads the Dutch institutions from output/institutions.json, prints the
    search query prepared for each one, writes all queries to
    output/dutch_validation_queries.json and prints a summary with the
    outcomes expected from earlier analysis.
    """
    print("=" * 80)
    print("V4 DUTCH INSTITUTIONS - WEB VALIDATION TEST")
    print("=" * 80)
    print()
    print("Goal: Validate v4's 12 Dutch institutions using Exa web search")
    print("Method: Intelligence-based validation (not heuristic pattern matching)")
    print()

    # Load institutions
    institutions_file = project_root / 'output' / 'institutions.json'
    dutch_institutions = load_dutch_institutions(institutions_file)

    print(f"\nFound {len(dutch_institutions)} Dutch institutions:")
    print("-" * 80)

    # Prepare validation queries
    validation_queries = []
    for i, inst in enumerate(dutch_institutions, 1):
        validation_data = validate_with_exa(inst)
        validation_queries.append(validation_data)

        print(f"{i}. {validation_data['name']}")
        print(f" Type: {validation_data['type']}")
        print(f" Location: {validation_data['city']}, {validation_data['country']}")
        print(f" Search query: {validation_data['query']}")
        print()

    print("-" * 80)
    print("\nNext step: Execute Exa searches for each institution")
    print("This will provide web evidence for existence validation")
    print()

    # Save validation queries for reference
    output_file = project_root / 'output' / 'dutch_validation_queries.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(validation_queries, f, indent=2, ensure_ascii=False)

    print(f"Saved validation queries to: {output_file}")
    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total Dutch institutions to validate: {len(dutch_institutions)}")
    print(f"Validation queries prepared: {len(validation_queries)}")
    print()
    # The expectations below are fixed text from an earlier analysis; they
    # are not computed from the data loaded above.
    print("From previous session analysis, we expect:")
    print(" - Archives Limburg: VALID (Tracé/HCL, Maastricht)")
    print(" - Van Abbemuseum: VALID (Modern art museum, Eindhoven)")
    print(" - Het Noordbrabants Museum: VALID (Provincial museum, Den Bosch)")
    print(" - Fries Archive/Frisian Archives: VALID (Tresoar, Leeuwarden)")
    print(" - IFLA Library: INVALID (IFLA is org, not library)")
    print(" - University Malaysia: INVALID (Wrong country)")
    print(" - Archive Net: INVALID (Platform, not institution)")
    print()
    print("Predicted precision: ~50% (6 true positives / 12 institutions)")
    print("Compare to ISIL validation: 8.3% (misleading due to incomplete registry)")
    print()


if __name__ == '__main__':
    main()