Test suite additions:

- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.

360 lines · 12 KiB · Python
"""
|
|
Web-based validation for heritage institutions using Exa search and Wikidata.
|
|
|
|
This module implements intelligence-based validation to replace heuristic pattern matching.
|
|
Instead of using regex patterns to determine if an institution is valid, we:
|
|
1. Search the web for evidence the institution exists
|
|
2. Query Wikidata for knowledge graph presence
|
|
3. Calculate confidence based on evidence quality
|
|
|
|
Key principle: "Stop using heuristics, use language understanding + web validation"
|
|
"""
|
|
|
|
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
|
|
|
|
|
|
@dataclass
class ValidationEvidence:
    """A single piece of evidence supporting an institution's existence.

    Instances are aggregated by ``WebValidator`` and weighted per ``source``
    when computing an overall confidence score.
    """

    # Where the evidence came from: 'exa_search', 'wikidata', 'website', 'registry'.
    source: str
    # URL of the supporting page, when available.
    url: Optional[str]
    # Title of the supporting page/entity, when available.
    title: Optional[str]
    # Per-evidence confidence in the range 0.0-1.0.
    confidence: float
    # Source-specific extras (e.g. a 'wikidata_id' key for Wikidata hits).
    details: Dict[str, Any]
|
|
|
|
|
|
@dataclass
class ValidationResult:
    """Outcome of validating a single institution's existence.

    Carries the boolean verdict, the aggregate confidence score, the raw
    evidence that produced it, and bookkeeping about how/when validation ran.
    """

    exists: bool
    confidence: float  # 0.0-1.0
    evidence: List[ValidationEvidence]
    wikidata_id: Optional[str]
    validated_at: datetime
    validation_method: str
    notes: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        Evidence items are flattened to plain dicts and the timestamp is
        rendered as an ISO-8601 string so the result is json.dumps-safe.
        """
        serialized_evidence = [
            {
                'source': item.source,
                'url': item.url,
                'title': item.title,
                'confidence': item.confidence,
                'details': item.details,
            }
            for item in self.evidence
        ]
        return {
            'exists': self.exists,
            'confidence': self.confidence,
            'evidence': serialized_evidence,
            'wikidata_id': self.wikidata_id,
            'validated_at': self.validated_at.isoformat(),
            'validation_method': self.validation_method,
            'notes': self.notes,
        }
|
|
|
|
|
|
class WebValidator:
    """
    Validates heritage institutions using web search and knowledge graphs.

    This validator uses multiple sources of truth:
    - Exa web search for institutional websites/mentions
    - Wikidata SPARQL for knowledge graph presence
    - Cross-referencing with known registries (future)

    Example:
        >>> validator = WebValidator()
        >>> result = validator.validate_institution(
        ...     name="Van Abbemuseum",
        ...     city="Eindhoven",
        ...     country="NL",
        ...     institution_type="MUSEUM"
        ... )
        >>> print(f"Exists: {result.exists}, Confidence: {result.confidence}")
    """

    def __init__(self, exa_client=None, wikidata_client=None):
        """
        Initialize validator with optional clients.

        Args:
            exa_client: Exa search client (if None, will be created on first use)
            wikidata_client: Wikidata SPARQL client (if None, will be created on first use)
        """
        self.exa_client = exa_client
        self.wikidata_client = wikidata_client

    def validate_institution(
        self,
        name: str,
        city: Optional[str] = None,
        country: Optional[str] = None,
        institution_type: Optional[str] = None,
        alternative_names: Optional[List[str]] = None,
        homepage: Optional[str] = None
    ) -> ValidationResult:
        """
        Validate an institution's existence using web sources.

        Args:
            name: Institution name
            city: City/location
            country: Country code (ISO 3166-1 alpha-2)
            institution_type: Type (MUSEUM, ARCHIVE, LIBRARY, etc.)
            alternative_names: Alternative names to search (currently unused;
                reserved for the Exa/Wikidata integrations)
            homepage: Known homepage URL (adds confidence if valid)

        Returns:
            ValidationResult with existence verdict, confidence, and evidence
        """
        evidence: List[ValidationEvidence] = []
        validated_at = datetime.now(timezone.utc)

        # Immediate rejection patterns (obvious false positives)
        if self._is_obvious_false_positive(name, city, country):
            return ValidationResult(
                exists=False,
                confidence=0.0,
                evidence=[],
                wikidata_id=None,
                validated_at=validated_at,
                validation_method='pattern_rejection',
                notes=f"Rejected by pattern matching: {name}"
            )

        # 1. Web search validation (using Exa if available)
        if self.exa_client:
            evidence.extend(
                self._validate_via_exa(name, city, country, institution_type)
            )

        # 2. Wikidata validation (future implementation)
        wikidata_id = None
        if self.wikidata_client:
            wikidata_evidence = self._validate_via_wikidata(name, city, country)
            evidence.extend(wikidata_evidence)
            # Extract the Wikidata ID from evidence; first match wins (the
            # original kept overwriting, silently taking the last hit).
            for item in wikidata_evidence:
                if item.source == 'wikidata' and 'wikidata_id' in item.details:
                    wikidata_id = item.details['wikidata_id']
                    break

        # 3. Homepage validation (if provided)
        if homepage:
            evidence.extend(self._validate_homepage(homepage, name))

        # Calculate overall confidence
        confidence = self._calculate_confidence(evidence)
        exists = confidence >= 0.5  # Threshold for existence verdict

        return ValidationResult(
            exists=exists,
            confidence=confidence,
            evidence=evidence,
            wikidata_id=wikidata_id,
            validated_at=validated_at,
            validation_method='web_search + wikidata',
            notes=f"Validated using {len(evidence)} evidence sources"
        )

    def _is_obvious_false_positive(
        self,
        name: str,
        city: Optional[str],
        country: Optional[str]
    ) -> bool:
        """
        Detect obvious false positives using minimal pattern matching.

        This is a safety net for clearly invalid extractions, not a replacement
        for web validation. We only reject obvious mistakes here.
        (``city`` is currently unused; accepted for future cross-checks.)
        """
        name_lower = name.lower()

        # Single character or empty
        if len(name.strip()) <= 1:
            return True

        # Pure numbers or symbols
        if re.match(r'^[\d\s\-\.,]+$', name):
            return True

        # Common markdown/artifact patterns that indicate extraction noise
        # rather than a real institution name
        markdown_artifacts = (
            '```', '---', '===', '***',
            'http://', 'https://',
            '[', ']', '(', ')',
        )
        if any(artifact in name for artifact in markdown_artifacts):
            return True

        # Generic placeholder text (exact match only, so e.g. "Test Museum"
        # is not rejected)
        placeholders = {
            'example', 'test', 'placeholder', 'lorem ipsum',
            'unnamed', 'unknown', 'various', 'multiple',
        }
        if name_lower in placeholders:
            return True

        # Country mismatch (if we have city info)
        # Example: "University Malaysia" with country="NL" is wrong
        country_keywords = {
            'malaysia': ['MY'],
            'indonesia': ['ID'],
            'thailand': ['TH'],
            'vietnam': ['VN'],
            'singapore': ['SG'],
            'philippines': ['PH'],
        }
        if country:
            for keyword, expected_countries in country_keywords.items():
                if keyword in name_lower and country not in expected_countries:
                    return True

        return False

    def _validate_via_exa(
        self,
        name: str,
        city: Optional[str],
        country: Optional[str],
        institution_type: Optional[str]
    ) -> List[ValidationEvidence]:
        """
        Validate institution via Exa web search.

        Returns:
            List of ValidationEvidence from web search results
        """
        # Placeholder for Exa integration
        # Will be implemented when we integrate with Exa API
        return []

    def _validate_via_wikidata(
        self,
        name: str,
        city: Optional[str],
        country: Optional[str]
    ) -> List[ValidationEvidence]:
        """
        Validate institution via Wikidata SPARQL query.

        Returns:
            List of ValidationEvidence from Wikidata
        """
        # Placeholder for Wikidata integration
        # Will be implemented in next phase
        return []

    def _validate_homepage(
        self,
        homepage: str,
        institution_name: str
    ) -> List[ValidationEvidence]:
        """
        Validate homepage URL and check if it matches institution.

        Only the URL's scheme and hostname are inspected for now;
        ``institution_name`` is accepted so a future version can fetch the
        page and compare its content against the name.

        Returns:
            List of ValidationEvidence from homepage validation
        """
        evidence: List[ValidationEvidence] = []

        # Basic URL validation
        if homepage.startswith(('http://', 'https://')):
            # Check if it looks like a valid heritage institution URL.
            # Match the TLD against the parsed hostname suffix only, so a TLD
            # string appearing in the path or query (e.g.
            # "https://evil.example/x.org") does not count as evidence.
            hostname = urlparse(homepage).hostname or ''
            valid_tlds = ('.museum', '.nl', '.org', '.edu', '.gov')
            if hostname.endswith(valid_tlds):
                evidence.append(ValidationEvidence(
                    source='website',
                    url=homepage,
                    title=None,
                    confidence=0.7,
                    details={'validation': 'url_format_valid'}
                ))

        return evidence

    def _calculate_confidence(self, evidence: List[ValidationEvidence]) -> float:
        """
        Calculate overall confidence from multiple evidence sources.

        Uses weighted average where:
        - Wikidata presence: 0.9 weight
        - Official website: 0.8 weight
        - Web search results: 0.6-0.8 weight
        - Registry matches: 1.0 weight (future)

        Returns:
            Confidence score 0.0-1.0
        """
        if not evidence:
            return 0.0

        # Weight different evidence types; unknown sources get 0.5
        weights = {
            'wikidata': 0.9,
            'website': 0.8,
            'exa_search': 0.7,
            'registry': 1.0,
        }

        weighted_sum = 0.0
        total_weight = 0.0

        for e in evidence:
            weight = weights.get(e.source, 0.5)
            weighted_sum += e.confidence * weight
            total_weight += weight

        if total_weight == 0:
            return 0.0

        # Normalize to 0.0-1.0 range
        confidence = weighted_sum / total_weight
        return min(1.0, confidence)
|
|
|
|
|
|
def batch_validate_institutions(
    institutions: List[Dict[str, Any]],
    validator: Optional[WebValidator] = None
) -> List[ValidationResult]:
    """
    Validate multiple institutions in batch.

    Args:
        institutions: List of institution dictionaries; each may carry
            'name', 'locations' (list of dicts with 'city'/'country'),
            'institution_type', 'alternative_names', and 'homepage' keys
        validator: WebValidator instance (created if None)

    Returns:
        List of ValidationResult objects, one per input, in order
    """
    if validator is None:
        validator = WebValidator()

    results = []
    for inst in institutions:
        # Pull the primary (first) location once instead of re-indexing
        # the list for every field; missing/empty 'locations' yields None
        # for both city and country, matching the previous behavior.
        locations = inst.get('locations') or [{}]
        primary_location = locations[0]

        result = validator.validate_institution(
            name=inst.get('name', ''),
            city=primary_location.get('city'),
            country=primary_location.get('country'),
            institution_type=inst.get('institution_type'),
            alternative_names=inst.get('alternative_names', []),
            homepage=inst.get('homepage')
        )
        results.append(result)

    return results
|
|
|
|
|
|
# Explicit public API for `from <module> import *`.
__all__ = [
    'ValidationEvidence',
    'ValidationResult',
    'WebValidator',
    'batch_validate_institutions'
]
|