"""Web-based validation for heritage institutions using Exa search and Wikidata.

This module implements intelligence-based validation to replace heuristic
pattern matching. Instead of using regex patterns to determine if an
institution is valid, we:

1. Search the web for evidence the institution exists
2. Query Wikidata for knowledge graph presence
3. Calculate confidence based on evidence quality

Key principle: "Stop using heuristics, use language understanding + web validation"
"""

from dataclasses import dataclass
from typing import List, Optional, Dict, Any
from datetime import datetime, timezone
from urllib.parse import urlparse
import re

# Names consisting only of digits, whitespace and basic punctuation.
# Compiled once at import time so validate calls don't recompile per name.
_NUMERIC_NAME_RE = re.compile(r'^[\d\s\-\.,]+$')


@dataclass
class ValidationEvidence:
    """One piece of evidence supporting an institution's existence."""

    source: str  # 'exa_search', 'wikidata', 'website', 'registry'
    url: Optional[str]
    title: Optional[str]
    confidence: float  # 0.0-1.0 confidence contributed by this evidence item
    details: Dict[str, Any]  # source-specific payload (e.g. {'wikidata_id': ...})


@dataclass
class ValidationResult:
    """Result of validating an institution's existence."""

    exists: bool  # verdict: confidence >= existence threshold
    confidence: float  # 0.0-1.0 aggregate confidence
    evidence: List[ValidationEvidence]
    wikidata_id: Optional[str]  # Q-identifier when Wikidata evidence was found
    validated_at: datetime  # timezone-aware (UTC) timestamp of validation
    validation_method: str  # which sources were actually consulted
    notes: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary for JSON serialization."""
        return {
            'exists': self.exists,
            'confidence': self.confidence,
            'evidence': [
                {
                    'source': e.source,
                    'url': e.url,
                    'title': e.title,
                    'confidence': e.confidence,
                    'details': e.details
                }
                for e in self.evidence
            ],
            'wikidata_id': self.wikidata_id,
            'validated_at': self.validated_at.isoformat(),
            'validation_method': self.validation_method,
            'notes': self.notes
        }


class WebValidator:
    """
    Validates heritage institutions using web search and knowledge graphs.

    This validator uses multiple sources of truth:
    - Exa web search for institutional websites/mentions
    - Wikidata SPARQL for knowledge graph presence
    - Cross-referencing with known registries (future)

    Example:
        >>> validator = WebValidator()
        >>> result = validator.validate_institution(
        ...     name="Van Abbemuseum",
        ...     city="Eindhoven",
        ...     country="NL",
        ...     institution_type="MUSEUM"
        ... )
        >>> print(f"Exists: {result.exists}, Confidence: {result.confidence}")
    """

    def __init__(self, exa_client=None, wikidata_client=None):
        """
        Initialize validator with optional clients.

        Args:
            exa_client: Exa search client (if None, will be created on first use)
            wikidata_client: Wikidata SPARQL client (if None, will be created
                on first use)
        """
        self.exa_client = exa_client
        self.wikidata_client = wikidata_client

    def validate_institution(
        self,
        name: str,
        city: Optional[str] = None,
        country: Optional[str] = None,
        institution_type: Optional[str] = None,
        alternative_names: Optional[List[str]] = None,
        homepage: Optional[str] = None
    ) -> ValidationResult:
        """
        Validate an institution's existence using web sources.

        Args:
            name: Institution name
            city: City/location
            country: Country code (ISO 3166-1 alpha-2)
            institution_type: Type (MUSEUM, ARCHIVE, LIBRARY, etc.)
            alternative_names: Alternative names to include in web searches
            homepage: Known homepage URL (adds confidence if valid)

        Returns:
            ValidationResult with existence verdict, confidence, and evidence
        """
        evidence: List[ValidationEvidence] = []
        validated_at = datetime.now(timezone.utc)

        # Immediate rejection patterns (obvious false positives): skip the
        # expensive web/knowledge-graph lookups entirely.
        if self._is_obvious_false_positive(name, city, country):
            return ValidationResult(
                exists=False,
                confidence=0.0,
                evidence=[],
                wikidata_id=None,
                validated_at=validated_at,
                validation_method='pattern_rejection',
                notes=f"Rejected by pattern matching: {name}"
            )

        # Track which sources we actually consult so validation_method
        # reports the truth (previously it was hard-coded regardless of
        # which clients were configured).
        methods_used: List[str] = []

        # 1. Web search validation (using Exa if available)
        if self.exa_client:
            web_evidence = self._validate_via_exa(
                name, city, country, institution_type,
                alternative_names=alternative_names
            )
            evidence.extend(web_evidence)
            methods_used.append('web_search')

        # 2. Wikidata validation (future implementation)
        wikidata_id = None
        if self.wikidata_client:
            wikidata_evidence = self._validate_via_wikidata(name, city, country)
            evidence.extend(wikidata_evidence)
            methods_used.append('wikidata')

            # Extract Wikidata ID from evidence
            for e in wikidata_evidence:
                if e.source == 'wikidata' and 'wikidata_id' in e.details:
                    wikidata_id = e.details['wikidata_id']

        # 3. Homepage validation (if provided)
        if homepage:
            homepage_evidence = self._validate_homepage(homepage, name)
            evidence.extend(homepage_evidence)
            methods_used.append('homepage')

        # Calculate overall confidence
        confidence = self._calculate_confidence(evidence)
        exists = confidence >= 0.5  # Threshold for existence verdict

        return ValidationResult(
            exists=exists,
            confidence=confidence,
            evidence=evidence,
            wikidata_id=wikidata_id,
            validated_at=validated_at,
            validation_method=' + '.join(methods_used) if methods_used else 'no_sources',
            notes=f"Validated using {len(evidence)} evidence sources"
        )

    def _is_obvious_false_positive(
        self,
        name: str,
        city: Optional[str],
        country: Optional[str]
    ) -> bool:
        """
        Detect obvious false positives using minimal pattern matching.

        This is a safety net for clearly invalid extractions, not a
        replacement for web validation. We only reject obvious mistakes here.

        Returns:
            True if the name is clearly not a real institution.
        """
        name_lower = name.lower()

        # Single character or empty
        if len(name.strip()) <= 1:
            return True

        # Pure numbers or symbols
        if _NUMERIC_NAME_RE.match(name):
            return True

        # Common markdown/artifact patterns left over from LLM extraction.
        # NOTE(review): '[', ']', '(', ')' also reject legitimate names like
        # "Museum (Annex)" — kept for now since extraction artifacts dominate.
        markdown_artifacts = [
            '```', '---', '===', '***',
            'http://', 'https://',
            '[', ']', '(', ')',
        ]
        if any(artifact in name for artifact in markdown_artifacts):
            return True

        # Generic placeholder text (exact match on the whole name)
        placeholders = [
            'example', 'test', 'placeholder', 'lorem ipsum',
            'unnamed', 'unknown', 'various', 'multiple'
        ]
        if name_lower in placeholders:
            return True

        # Country mismatch (if we have city info)
        # Example: "University Malaysia" with country="NL" is wrong
        country_keywords = {
            'malaysia': ['MY'],
            'indonesia': ['ID'],
            'thailand': ['TH'],
            'vietnam': ['VN'],
            'singapore': ['SG'],
            'philippines': ['PH']
        }
        if country:
            for keyword, expected_countries in country_keywords.items():
                if keyword in name_lower and country not in expected_countries:
                    return True

        return False

    def _validate_via_exa(
        self,
        name: str,
        city: Optional[str],
        country: Optional[str],
        institution_type: Optional[str],
        alternative_names: Optional[List[str]] = None
    ) -> List[ValidationEvidence]:
        """
        Validate institution via Exa web search.

        Args:
            name: Primary institution name to search for.
            city: City/location used to narrow the search.
            country: ISO 3166-1 alpha-2 country code.
            institution_type: Type (MUSEUM, ARCHIVE, LIBRARY, etc.)
            alternative_names: Additional names to search (forwarded from
                validate_institution so they are no longer dropped).

        Returns:
            List of ValidationEvidence from web search results
        """
        # Placeholder for Exa integration
        # Will be implemented when we integrate with Exa API
        return []

    def _validate_via_wikidata(
        self,
        name: str,
        city: Optional[str],
        country: Optional[str]
    ) -> List[ValidationEvidence]:
        """
        Validate institution via Wikidata SPARQL query.

        Returns:
            List of ValidationEvidence from Wikidata
        """
        # Placeholder for Wikidata integration
        # Will be implemented in next phase
        return []

    def _validate_homepage(
        self,
        homepage: str,
        institution_name: str
    ) -> List[ValidationEvidence]:
        """
        Validate homepage URL and check if it matches institution.

        The URL is parsed so that only the host's suffix is checked against
        the allow-list (previously a raw substring test let e.g.
        'https://example.com/page.org' pass because '.org' appeared in the
        path).

        Returns:
            List of ValidationEvidence from homepage validation
        """
        evidence: List[ValidationEvidence] = []

        parsed = urlparse(homepage)
        # Basic URL validation: http(s) scheme with a resolvable host part
        if parsed.scheme in ('http', 'https') and parsed.hostname:
            # Check if URL is accessible (future: make HTTP request)
            # For now, just check if the host looks like a valid heritage
            # institution domain.
            valid_tlds = ('.museum', '.nl', '.org', '.edu', '.gov')
            if parsed.hostname.endswith(valid_tlds):
                evidence.append(ValidationEvidence(
                    source='website',
                    url=homepage,
                    title=None,
                    confidence=0.7,
                    details={'validation': 'url_format_valid'}
                ))

        return evidence

    def _calculate_confidence(self, evidence: List[ValidationEvidence]) -> float:
        """
        Calculate overall confidence from multiple evidence sources.

        Uses weighted average where:
        - Wikidata presence: 0.9 weight
        - Official website: 0.8 weight
        - Web search results: 0.6-0.8 weight
        - Registry matches: 1.0 weight (future)

        Returns:
            Confidence score 0.0-1.0
        """
        if not evidence:
            return 0.0

        # Weight different evidence types; unknown sources get a neutral 0.5.
        weights = {
            'wikidata': 0.9,
            'website': 0.8,
            'exa_search': 0.7,
            'registry': 1.0
        }

        weighted_sum = 0.0
        total_weight = 0.0
        for e in evidence:
            weight = weights.get(e.source, 0.5)
            weighted_sum += e.confidence * weight
            total_weight += weight

        if total_weight == 0:
            return 0.0

        # Normalize to 0.0-1.0 range
        confidence = weighted_sum / total_weight
        return min(1.0, confidence)


def batch_validate_institutions(
    institutions: List[Dict[str, Any]],
    validator: Optional[WebValidator] = None
) -> List[ValidationResult]:
    """
    Validate multiple institutions in batch.

    Args:
        institutions: List of institution dictionaries. Each may carry
            'name', 'locations' (list of {'city', 'country'} dicts),
            'institution_type', 'alternative_names' and 'homepage'.
        validator: WebValidator instance (created if None)

    Returns:
        List of ValidationResult objects, one per input, in order.
    """
    if validator is None:
        validator = WebValidator()

    results = []
    for inst in institutions:
        # Use the first location entry when present; fall back to an empty
        # dict so city/country resolve to None without double lookups.
        first_location = (inst.get('locations') or [{}])[0]
        result = validator.validate_institution(
            name=inst.get('name', ''),
            city=first_location.get('city'),
            country=first_location.get('country'),
            institution_type=inst.get('institution_type'),
            alternative_names=inst.get('alternative_names', []),
            homepage=inst.get('homepage')
        )
        results.append(result)

    return results


__all__ = [
    'ValidationEvidence',
    'ValidationResult',
    'WebValidator',
    'batch_validate_institutions'
]