glam/src/glam_extractor/validators/web_validator.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

360 lines
12 KiB
Python

"""
Web-based validation for heritage institutions using Exa search and Wikidata.
This module implements intelligence-based validation to replace heuristic pattern matching.
Instead of using regex patterns to determine if an institution is valid, we:
1. Search the web for evidence the institution exists
2. Query Wikidata for knowledge graph presence
3. Calculate confidence based on evidence quality
Key principle: "Stop using heuristics, use language understanding + web validation"
"""
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
@dataclass
class ValidationEvidence:
    """A single piece of evidence supporting an institution's existence."""

    source: str  # one of: 'exa_search', 'wikidata', 'website', 'registry'
    url: Optional[str]
    title: Optional[str]
    confidence: float  # 0.0-1.0
    details: Dict[str, Any]


@dataclass
class ValidationResult:
    """Outcome of validating whether an institution exists."""

    exists: bool
    confidence: float  # 0.0-1.0
    evidence: List[ValidationEvidence]
    wikidata_id: Optional[str]
    validated_at: datetime
    validation_method: str
    notes: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        def _evidence_entry(item: ValidationEvidence) -> Dict[str, Any]:
            # Flatten one evidence record into plain JSON-safe values.
            return {
                'source': item.source,
                'url': item.url,
                'title': item.title,
                'confidence': item.confidence,
                'details': item.details,
            }

        return {
            'exists': self.exists,
            'confidence': self.confidence,
            'evidence': [_evidence_entry(item) for item in self.evidence],
            'wikidata_id': self.wikidata_id,
            # datetime is not JSON-serializable; emit ISO-8601 text instead.
            'validated_at': self.validated_at.isoformat(),
            'validation_method': self.validation_method,
            'notes': self.notes,
        }
class WebValidator:
"""
Validates heritage institutions using web search and knowledge graphs.
This validator uses multiple sources of truth:
- Exa web search for institutional websites/mentions
- Wikidata SPARQL for knowledge graph presence
- Cross-referencing with known registries (future)
Example:
>>> validator = WebValidator()
>>> result = validator.validate_institution(
... name="Van Abbemuseum",
... city="Eindhoven",
... country="NL",
... institution_type="MUSEUM"
... )
>>> print(f"Exists: {result.exists}, Confidence: {result.confidence}")
"""
def __init__(self, exa_client=None, wikidata_client=None):
"""
Initialize validator with optional clients.
Args:
exa_client: Exa search client (if None, will be created on first use)
wikidata_client: Wikidata SPARQL client (if None, will be created on first use)
"""
self.exa_client = exa_client
self.wikidata_client = wikidata_client
def validate_institution(
self,
name: str,
city: Optional[str] = None,
country: Optional[str] = None,
institution_type: Optional[str] = None,
alternative_names: Optional[List[str]] = None,
homepage: Optional[str] = None
) -> ValidationResult:
"""
Validate an institution's existence using web sources.
Args:
name: Institution name
city: City/location
country: Country code (ISO 3166-1 alpha-2)
institution_type: Type (MUSEUM, ARCHIVE, LIBRARY, etc.)
alternative_names: Alternative names to search
homepage: Known homepage URL (adds confidence if valid)
Returns:
ValidationResult with existence verdict, confidence, and evidence
"""
evidence = []
validated_at = datetime.now(timezone.utc)
# Immediate rejection patterns (obvious false positives)
if self._is_obvious_false_positive(name, city, country):
return ValidationResult(
exists=False,
confidence=0.0,
evidence=[],
wikidata_id=None,
validated_at=validated_at,
validation_method='pattern_rejection',
notes=f"Rejected by pattern matching: {name}"
)
# 1. Web search validation (using Exa if available)
if self.exa_client:
web_evidence = self._validate_via_exa(name, city, country, institution_type)
evidence.extend(web_evidence)
# 2. Wikidata validation (future implementation)
wikidata_id = None
if self.wikidata_client:
wikidata_evidence = self._validate_via_wikidata(name, city, country)
evidence.extend(wikidata_evidence)
# Extract Wikidata ID from evidence
for e in wikidata_evidence:
if e.source == 'wikidata' and 'wikidata_id' in e.details:
wikidata_id = e.details['wikidata_id']
# 3. Homepage validation (if provided)
if homepage:
homepage_evidence = self._validate_homepage(homepage, name)
evidence.extend(homepage_evidence)
# Calculate overall confidence
confidence = self._calculate_confidence(evidence)
exists = confidence >= 0.5 # Threshold for existence verdict
return ValidationResult(
exists=exists,
confidence=confidence,
evidence=evidence,
wikidata_id=wikidata_id,
validated_at=validated_at,
validation_method='web_search + wikidata',
notes=f"Validated using {len(evidence)} evidence sources"
)
def _is_obvious_false_positive(
self,
name: str,
city: Optional[str],
country: Optional[str]
) -> bool:
"""
Detect obvious false positives using minimal pattern matching.
This is a safety net for clearly invalid extractions, not a replacement
for web validation. We only reject obvious mistakes here.
"""
name_lower = name.lower()
# Single character or empty
if len(name.strip()) <= 1:
return True
# Pure numbers or symbols
if re.match(r'^[\d\s\-\.,]+$', name):
return True
# Common markdown/artifact patterns
markdown_artifacts = [
'```', '---', '===', '***',
'http://', 'https://',
'[', ']', '(', ')',
]
if any(artifact in name for artifact in markdown_artifacts):
return True
# Generic placeholder text
placeholders = [
'example', 'test', 'placeholder', 'lorem ipsum',
'unnamed', 'unknown', 'various', 'multiple'
]
if name_lower in placeholders:
return True
# Country mismatch (if we have city info)
# Example: "University Malaysia" with country="NL" is wrong
country_keywords = {
'malaysia': ['MY'],
'indonesia': ['ID'],
'thailand': ['TH'],
'vietnam': ['VN'],
'singapore': ['SG'],
'philippines': ['PH']
}
if country:
for keyword, expected_countries in country_keywords.items():
if keyword in name_lower and country not in expected_countries:
return True
return False
def _validate_via_exa(
self,
name: str,
city: Optional[str],
country: Optional[str],
institution_type: Optional[str]
) -> List[ValidationEvidence]:
"""
Validate institution via Exa web search.
Returns:
List of ValidationEvidence from web search results
"""
# Placeholder for Exa integration
# Will be implemented when we integrate with Exa API
return []
def _validate_via_wikidata(
self,
name: str,
city: Optional[str],
country: Optional[str]
) -> List[ValidationEvidence]:
"""
Validate institution via Wikidata SPARQL query.
Returns:
List of ValidationEvidence from Wikidata
"""
# Placeholder for Wikidata integration
# Will be implemented in next phase
return []
def _validate_homepage(
self,
homepage: str,
institution_name: str
) -> List[ValidationEvidence]:
"""
Validate homepage URL and check if it matches institution.
Returns:
List of ValidationEvidence from homepage validation
"""
evidence = []
# Basic URL validation
if homepage.startswith('http://') or homepage.startswith('https://'):
# Check if URL is accessible (future: make HTTP request)
# For now, just check if it looks like a valid heritage institution URL
valid_tlds = ['.museum', '.nl', '.org', '.edu', '.gov']
if any(tld in homepage for tld in valid_tlds):
evidence.append(ValidationEvidence(
source='website',
url=homepage,
title=None,
confidence=0.7,
details={'validation': 'url_format_valid'}
))
return evidence
def _calculate_confidence(self, evidence: List[ValidationEvidence]) -> float:
"""
Calculate overall confidence from multiple evidence sources.
Uses weighted average where:
- Wikidata presence: 0.9 weight
- Official website: 0.8 weight
- Web search results: 0.6-0.8 weight
- Registry matches: 1.0 weight (future)
Returns:
Confidence score 0.0-1.0
"""
if not evidence:
return 0.0
# Weight different evidence types
weights = {
'wikidata': 0.9,
'website': 0.8,
'exa_search': 0.7,
'registry': 1.0
}
weighted_sum = 0.0
total_weight = 0.0
for e in evidence:
weight = weights.get(e.source, 0.5)
weighted_sum += e.confidence * weight
total_weight += weight
if total_weight == 0:
return 0.0
# Normalize to 0.0-1.0 range
confidence = weighted_sum / total_weight
return min(1.0, confidence)
def batch_validate_institutions(
    institutions: List[Dict[str, Any]],
    validator: Optional[WebValidator] = None
) -> List[ValidationResult]:
    """
    Validate multiple institutions in batch.

    Args:
        institutions: List of institution dictionaries
        validator: WebValidator instance (created if None)

    Returns:
        List of ValidationResult objects
    """
    active_validator = validator if validator is not None else WebValidator()

    def _primary_location(record: Dict[str, Any]) -> Dict[str, Any]:
        # Institutions carry a list of location dicts; fall back to an
        # empty dict when the list is absent or empty.
        locations = record.get('locations')
        return locations[0] if locations else {}

    outcomes: List[ValidationResult] = []
    for record in institutions:
        location = _primary_location(record)
        outcome = active_validator.validate_institution(
            name=record.get('name', ''),
            city=location.get('city'),
            country=location.get('country'),
            institution_type=record.get('institution_type'),
            alternative_names=record.get('alternative_names', []),
            homepage=record.get('homepage')
        )
        outcomes.append(outcome)
    return outcomes
# Public API of this module.
__all__ = [
'ValidationEvidence',
'ValidationResult',
'WebValidator',
'batch_validate_institutions'
]