glam/src/glam_extractor/validators/web_validator.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

360 lines
12 KiB
Python

"""
Web-based validation for heritage institutions using Exa search and Wikidata.
This module implements intelligence-based validation to replace heuristic pattern matching.
Instead of using regex patterns to determine if an institution is valid, we:
1. Search the web for evidence the institution exists
2. Query Wikidata for knowledge graph presence
3. Calculate confidence based on evidence quality
Key principle: "Stop using heuristics, use language understanding + web validation"
"""
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
@dataclass
class ValidationEvidence:
    """A single piece of evidence supporting an institution's existence."""

    source: str  # one of: 'exa_search', 'wikidata', 'website', 'registry'
    url: Optional[str]
    title: Optional[str]
    confidence: float  # 0.0-1.0
    details: Dict[str, Any]


@dataclass
class ValidationResult:
    """Outcome of validating whether an institution exists."""

    exists: bool
    confidence: float  # 0.0-1.0
    evidence: List[ValidationEvidence]
    wikidata_id: Optional[str]
    validated_at: datetime
    validation_method: str
    notes: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        def _evidence_entry(item: ValidationEvidence) -> Dict[str, Any]:
            # Flatten one evidence record into plain JSON-safe values.
            return {
                'source': item.source,
                'url': item.url,
                'title': item.title,
                'confidence': item.confidence,
                'details': item.details,
            }

        return {
            'exists': self.exists,
            'confidence': self.confidence,
            'evidence': [_evidence_entry(item) for item in self.evidence],
            'wikidata_id': self.wikidata_id,
            # datetime is not JSON-serializable; emit ISO-8601 text instead.
            'validated_at': self.validated_at.isoformat(),
            'validation_method': self.validation_method,
            'notes': self.notes,
        }
class WebValidator:
"""
Validates heritage institutions using web search and knowledge graphs.
This validator uses multiple sources of truth:
- Exa web search for institutional websites/mentions
- Wikidata SPARQL for knowledge graph presence
- Cross-referencing with known registries (future)
Example:
>>> validator = WebValidator()
>>> result = validator.validate_institution(
... name="Van Abbemuseum",
... city="Eindhoven",
... country="NL",
... institution_type="MUSEUM"
... )
>>> print(f"Exists: {result.exists}, Confidence: {result.confidence}")
"""
def __init__(self, exa_client=None, wikidata_client=None):
"""
Initialize validator with optional clients.
Args:
exa_client: Exa search client (if None, will be created on first use)
wikidata_client: Wikidata SPARQL client (if None, will be created on first use)
"""
self.exa_client = exa_client
self.wikidata_client = wikidata_client
def validate_institution(
self,
name: str,
city: Optional[str] = None,
country: Optional[str] = None,
institution_type: Optional[str] = None,
alternative_names: Optional[List[str]] = None,
homepage: Optional[str] = None
) -> ValidationResult:
"""
Validate an institution's existence using web sources.
Args:
name: Institution name
city: City/location
country: Country code (ISO 3166-1 alpha-2)
institution_type: Type (MUSEUM, ARCHIVE, LIBRARY, etc.)
alternative_names: Alternative names to search
homepage: Known homepage URL (adds confidence if valid)
Returns:
ValidationResult with existence verdict, confidence, and evidence
"""
evidence = []
validated_at = datetime.now(timezone.utc)
# Immediate rejection patterns (obvious false positives)
if self._is_obvious_false_positive(name, city, country):
return ValidationResult(
exists=False,
confidence=0.0,
evidence=[],
wikidata_id=None,
validated_at=validated_at,
validation_method='pattern_rejection',
notes=f"Rejected by pattern matching: {name}"
)
# 1. Web search validation (using Exa if available)
if self.exa_client:
web_evidence = self._validate_via_exa(name, city, country, institution_type)
evidence.extend(web_evidence)
# 2. Wikidata validation (future implementation)
wikidata_id = None
if self.wikidata_client:
wikidata_evidence = self._validate_via_wikidata(name, city, country)
evidence.extend(wikidata_evidence)
# Extract Wikidata ID from evidence
for e in wikidata_evidence:
if e.source == 'wikidata' and 'wikidata_id' in e.details:
wikidata_id = e.details['wikidata_id']
# 3. Homepage validation (if provided)
if homepage:
homepage_evidence = self._validate_homepage(homepage, name)
evidence.extend(homepage_evidence)
# Calculate overall confidence
confidence = self._calculate_confidence(evidence)
exists = confidence >= 0.5 # Threshold for existence verdict
return ValidationResult(
exists=exists,
confidence=confidence,
evidence=evidence,
wikidata_id=wikidata_id,
validated_at=validated_at,
validation_method='web_search + wikidata',
notes=f"Validated using {len(evidence)} evidence sources"
)
def _is_obvious_false_positive(
self,
name: str,
city: Optional[str],
country: Optional[str]
) -> bool:
"""
Detect obvious false positives using minimal pattern matching.
This is a safety net for clearly invalid extractions, not a replacement
for web validation. We only reject obvious mistakes here.
"""
name_lower = name.lower()
# Single character or empty
if len(name.strip()) <= 1:
return True
# Pure numbers or symbols
if re.match(r'^[\d\s\-\.,]+$', name):
return True
# Common markdown/artifact patterns
markdown_artifacts = [
'```', '---', '===', '***',
'http://', 'https://',
'[', ']', '(', ')',
]
if any(artifact in name for artifact in markdown_artifacts):
return True
# Generic placeholder text
placeholders = [
'example', 'test', 'placeholder', 'lorem ipsum',
'unnamed', 'unknown', 'various', 'multiple'
]
if name_lower in placeholders:
return True
# Country mismatch (if we have city info)
# Example: "University Malaysia" with country="NL" is wrong
country_keywords = {
'malaysia': ['MY'],
'indonesia': ['ID'],
'thailand': ['TH'],
'vietnam': ['VN'],
'singapore': ['SG'],
'philippines': ['PH']
}
if country:
for keyword, expected_countries in country_keywords.items():
if keyword in name_lower and country not in expected_countries:
return True
return False
def _validate_via_exa(
self,
name: str,
city: Optional[str],
country: Optional[str],
institution_type: Optional[str]
) -> List[ValidationEvidence]:
"""
Validate institution via Exa web search.
Returns:
List of ValidationEvidence from web search results
"""
# Placeholder for Exa integration
# Will be implemented when we integrate with Exa API
return []
def _validate_via_wikidata(
self,
name: str,
city: Optional[str],
country: Optional[str]
) -> List[ValidationEvidence]:
"""
Validate institution via Wikidata SPARQL query.
Returns:
List of ValidationEvidence from Wikidata
"""
# Placeholder for Wikidata integration
# Will be implemented in next phase
return []
def _validate_homepage(
self,
homepage: str,
institution_name: str
) -> List[ValidationEvidence]:
"""
Validate homepage URL and check if it matches institution.
Returns:
List of ValidationEvidence from homepage validation
"""
evidence = []
# Basic URL validation
if homepage.startswith('http://') or homepage.startswith('https://'):
# Check if URL is accessible (future: make HTTP request)
# For now, just check if it looks like a valid heritage institution URL
valid_tlds = ['.museum', '.nl', '.org', '.edu', '.gov']
if any(tld in homepage for tld in valid_tlds):
evidence.append(ValidationEvidence(
source='website',
url=homepage,
title=None,
confidence=0.7,
details={'validation': 'url_format_valid'}
))
return evidence
def _calculate_confidence(self, evidence: List[ValidationEvidence]) -> float:
"""
Calculate overall confidence from multiple evidence sources.
Uses weighted average where:
- Wikidata presence: 0.9 weight
- Official website: 0.8 weight
- Web search results: 0.6-0.8 weight
- Registry matches: 1.0 weight (future)
Returns:
Confidence score 0.0-1.0
"""
if not evidence:
return 0.0
# Weight different evidence types
weights = {
'wikidata': 0.9,
'website': 0.8,
'exa_search': 0.7,
'registry': 1.0
}
weighted_sum = 0.0
total_weight = 0.0
for e in evidence:
weight = weights.get(e.source, 0.5)
weighted_sum += e.confidence * weight
total_weight += weight
if total_weight == 0:
return 0.0
# Normalize to 0.0-1.0 range
confidence = weighted_sum / total_weight
return min(1.0, confidence)
def batch_validate_institutions(
    institutions: List[Dict[str, Any]],
    validator: Optional[WebValidator] = None
) -> List[ValidationResult]:
    """
    Validate multiple institutions in batch.

    Args:
        institutions: List of institution dictionaries
        validator: WebValidator instance (created if None)

    Returns:
        List of ValidationResult objects
    """
    active_validator = validator if validator is not None else WebValidator()

    def _primary_location(record: Dict[str, Any]) -> Dict[str, Any]:
        # Institutions carry a list of location dicts; fall back to an
        # empty dict when the list is absent or empty.
        locations = record.get('locations')
        return locations[0] if locations else {}

    outcomes: List[ValidationResult] = []
    for record in institutions:
        location = _primary_location(record)
        outcome = active_validator.validate_institution(
            name=record.get('name', ''),
            city=location.get('city'),
            country=location.get('country'),
            institution_type=record.get('institution_type'),
            alternative_names=record.get('alternative_names', []),
            homepage=record.get('homepage')
        )
        outcomes.append(outcome)
    return outcomes
# Public API of this module.
__all__ = [
'ValidationEvidence',
'ValidationResult',
'WebValidator',
'batch_validate_institutions'
]