- Fix scope_note → finding_aid_scope_note in FindingAid.yaml
- Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
- Remove duplicate rico_record_set_type from class_metadata_slots.yaml
- Fix range types for equals_string compatibility (uriorcurie → string)
- Move class names from close_mappings to see_also in 10 RecordSetTypes files
- Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
- Sync schemas to frontend/public/schemas/

Files: 1,151 changed (includes prior CustodianType migration)
527 lines
23 KiB
Python
527 lines
23 KiB
Python
"""
|
|
Tests for Temporal Intent Extraction Module
|
|
|
|
Tests the TemporalConstraintExtractor and TemporalIntentExtractorModule classes
|
|
which enable fast LLM-free extraction of temporal constraints from queries.
|
|
"""
|
|
|
|
import pytest
|
|
from .temporal_intent import (
|
|
TemporalConstraint,
|
|
TemporalConstraintExtractor,
|
|
TemporalIntentExtractorModule,
|
|
get_temporal_extractor,
|
|
)
|
|
|
|
|
|
class TestTemporalConstraintExtractor:
    """Behavioural checks for ``TemporalConstraintExtractor.extract``.

    Each test feeds one Dutch or English query to the extractor and checks
    the fields of the returned constraint (type, dates, reference event,
    recommended template, confidence).
    """

    @pytest.fixture
    def extractor(self):
        """Provide a newly constructed extractor to each test."""
        return TemporalConstraintExtractor()

    # ----- Timeline / history queries -----

    def test_timeline_dutch_geschiedenis(self, extractor):
        """The Dutch keyword 'geschiedenis' yields a timeline constraint."""
        result = extractor.extract("Wat is de geschiedenis van het Rijksmuseum?")
        assert result.constraint_type == "timeline"
        assert result.recommended_template == "institution_timeline"
        assert result.confidence >= 0.9

    def test_timeline_english_history(self, extractor):
        """The English keyword 'history' yields a timeline constraint."""
        result = extractor.extract("Tell me the history of the British Museum")
        assert result.constraint_type == "timeline"
        assert result.recommended_template == "institution_timeline"

    def test_timeline_tijdlijn(self, extractor):
        """The Dutch keyword 'tijdlijn' yields a timeline constraint."""
        result = extractor.extract("Geef me een tijdlijn van het Noord-Hollands Archief")
        assert result.constraint_type == "timeline"

    def test_timeline_evolution(self, extractor):
        """The English keyword 'evolution' yields a timeline constraint."""
        result = extractor.extract("What was the evolution of this archive?")
        assert result.constraint_type == "timeline"

    # ----- Superlative queries (oldest / newest) -----

    def test_oldest_dutch_oudste(self, extractor):
        """The Dutch keyword 'oudste' yields an oldest constraint."""
        result = extractor.extract("Wat is het oudste museum in Nederland?")
        assert result.constraint_type == "oldest"
        assert result.recommended_template == "find_by_founding"
        assert result.confidence >= 0.9

    def test_oldest_english(self, extractor):
        """The English keyword 'oldest' yields an oldest constraint."""
        result = extractor.extract("What is the oldest library in Amsterdam?")
        assert result.constraint_type == "oldest"

    def test_oldest_eerste(self, extractor):
        """The Dutch keyword 'eerste' (first) yields an oldest constraint."""
        result = extractor.extract("Welke was de eerste openbare bibliotheek?")
        assert result.constraint_type == "oldest"

    def test_oldest_earliest(self, extractor):
        """The English keyword 'earliest' yields an oldest constraint."""
        result = extractor.extract("What is the earliest archive in the region?")
        assert result.constraint_type == "oldest"

    def test_newest_dutch_nieuwste(self, extractor):
        """The Dutch keyword 'nieuwste' yields a newest constraint."""
        result = extractor.extract("Wat is het nieuwste museum?")
        assert result.constraint_type == "newest"
        assert result.recommended_template == "find_by_founding"

    def test_newest_english_latest(self, extractor):
        """The English keyword 'latest' yields a newest constraint."""
        result = extractor.extract("What is the latest museum to open?")
        assert result.constraint_type == "newest"

    def test_newest_most_recent(self, extractor):
        """The English phrase 'most recent' yields a newest constraint."""
        result = extractor.extract("What is the most recent archive established?")
        assert result.constraint_type == "newest"

    # ----- Change-event keywords -----

    def test_merger_dutch_fusie(self, extractor):
        """The Dutch keyword 'fusie' yields a change_event tagged as a merger."""
        result = extractor.extract("Wanneer was de fusie van het archief?")
        assert result.constraint_type == "change_event"
        assert result.reference_event == "merger"
        assert result.recommended_template == "events_in_period"

    def test_merger_english(self, extractor):
        """The English keyword 'merger' yields a change_event tagged as a merger."""
        result = extractor.extract("When did the merger happen?")
        assert result.constraint_type == "change_event"
        assert result.reference_event == "merger"

    def test_merger_merged(self, extractor):
        """The English keyword 'merged' yields a change_event constraint."""
        result = extractor.extract("Which archives merged in 2001?")
        assert result.constraint_type == "change_event"

    def test_founding_dutch_opgericht(self, extractor):
        """The Dutch keyword 'opgericht' yields a founding constraint."""
        result = extractor.extract("Wanneer is het Rijksmuseum opgericht?")
        assert result.constraint_type == "founding"
        assert result.recommended_template == "institution_timeline"

    def test_founding_english_founded(self, extractor):
        """The English keyword 'founded' yields a founding constraint."""
        result = extractor.extract("When was the library founded?")
        assert result.constraint_type == "founding"

    def test_founding_established(self, extractor):
        """The English keyword 'established' yields a founding constraint."""
        result = extractor.extract("When was this archive established?")
        assert result.constraint_type == "founding"

    def test_closure_dutch_gesloten(self, extractor):
        """The Dutch keyword 'gesloten' yields a closure constraint."""
        result = extractor.extract("Wanneer is het museum gesloten?")
        assert result.constraint_type == "closure"
        assert result.recommended_template == "institution_timeline"

    def test_closure_english_closed(self, extractor):
        """The English keyword 'closed' yields a closure constraint."""
        # The query deliberately uses the past participle "closed" rather than
        # the verb "close": only "closed" is listed in CLOSURE_KEYWORDS.
        result = extractor.extract("When was the archive closed?")
        assert result.constraint_type == "closure"

    def test_closure_dissolved(self, extractor):
        """The English keyword 'dissolved' yields a closure constraint."""
        result = extractor.extract("When was the organization dissolved?")
        assert result.constraint_type == "closure"

    # ----- Year extraction -----

    def test_single_year_point_in_time(self, extractor):
        """One year in the query yields point_in_time spanning that whole year."""
        result = extractor.extract("Wat was de status van het museum in 1990?")
        assert result.constraint_type == "point_in_time"
        assert (result.date_start, result.date_end) == ("1990-01-01", "1990-12-31")
        assert result.recommended_template == "point_in_time_state"

    def test_two_years_between(self, extractor):
        """Two years in the query yield a between constraint over that range."""
        result = extractor.extract("Welke veranderingen waren er tussen 1990 en 2000?")
        assert result.constraint_type == "between"
        assert (result.date_start, result.date_end) == ("1990-01-01", "2000-12-31")
        assert result.recommended_template == "events_in_period"

    def test_three_years_uses_first_and_last(self, extractor):
        """With three years, the range runs from the first to the last one."""
        result = extractor.extract("Musea in 1950, 1975 en 2000")
        assert result.constraint_type == "between"
        assert (result.date_start, result.date_end) == ("1950-01-01", "2000-12-31")

    def test_year_with_before_dutch(self, extractor):
        """A year preceded by Dutch 'voor' yields a before constraint."""
        result = extractor.extract("Welke archieven bestonden voor 1950?")
        assert result.constraint_type == "before"
        assert result.date_end == "1950-01-01"
        assert result.recommended_template == "point_in_time_state"

    def test_year_with_before_english(self, extractor):
        """A year preceded by English 'before' yields a before constraint."""
        result = extractor.extract("Which museums existed before 1900?")
        assert result.constraint_type == "before"
        assert result.date_end == "1900-01-01"

    def test_year_with_after_dutch(self, extractor):
        """A year preceded by Dutch 'na' yields an after constraint.

        More specific keywords (such as 'opgericht') take precedence, so the
        query is kept neutral: it has no founding or closure keywords.
        """
        result = extractor.extract("Welke veranderingen waren er na 1980?")
        assert result.constraint_type == "after"
        assert result.date_start == "1980-12-31"
        assert result.recommended_template == "point_in_time_state"

    def test_year_with_after_english(self, extractor):
        """A year preceded by English 'after' yields an after constraint."""
        result = extractor.extract("What happened after 2010?")
        assert result.constraint_type == "after"
        assert result.date_start == "2010-12-31"

    def test_year_with_since(self, extractor):
        """The keyword 'since' is treated as an after constraint."""
        result = extractor.extract("Museums opened since 2000")
        assert result.constraint_type == "after"
        assert result.date_start == "2000-12-31"

    # ----- Year extraction edge cases -----

    def test_year_1800s(self, extractor):
        """Years in the 1800s are recognised."""
        result = extractor.extract("Archieven uit 1856")
        assert result.constraint_type == "point_in_time"
        assert "1856" in result.date_start

    def test_year_2020s(self, extractor):
        """Years in the 2020s are recognised."""
        result = extractor.extract("Nieuwe musea in 2023")
        assert result.constraint_type == "point_in_time"
        assert "2023" in result.date_start

    def test_ignore_numbers_that_are_not_years(self, extractor):
        """Numbers that are not years must not produce a constraint."""
        # A count such as 500 (or 50) is not a year.
        result = extractor.extract("Het museum heeft 500 werken in de collectie")
        assert result.constraint_type == "none"

    # ----- No temporal constraint -----

    def test_no_constraint_simple_query(self, extractor):
        """A query without temporal indicators yields 'none' and no template."""
        result = extractor.extract("Welke musea zijn er in Amsterdam?")
        assert result.constraint_type == "none"
        assert result.recommended_template is None

    def test_no_constraint_descriptive_query(self, extractor):
        """A purely descriptive query yields 'none'."""
        result = extractor.extract("Vertel me over de collectie van het Rijksmuseum")
        assert result.constraint_type == "none"

    # ----- Word-boundary matching -----

    def test_na_in_nationaal_not_matched(self, extractor):
        """'na' embedded in 'nationaal' must NOT be read as an after keyword."""
        # "Nationaal" contains the letters "na", but not as a separate word.
        result = extractor.extract("Nationaal Archief in Den Haag")
        assert result.constraint_type == "none"

    def test_na_as_word_is_matched(self, extractor):
        """'na' as a standalone word IS read as an after keyword."""
        result = extractor.extract("Na de renovatie in 1995 werd het museum heropend")
        assert result.constraint_type == "after"
        assert "1995" in result.date_start

    def test_voor_in_voorwerpen_not_matched(self, extractor):
        """'voor' embedded in 'voorwerpen' must NOT be read as a before keyword."""
        # The query contains no explicit year, so nothing should be extracted.
        result = extractor.extract("De collectie bevat voorwerpen uit de 18e eeuw")
        assert result.constraint_type == "none"

    def test_voor_as_word_is_matched(self, extractor):
        """'voor' as a standalone word IS read as a before keyword."""
        result = extractor.extract("Archieven van voor 1900")
        assert result.constraint_type == "before"
        assert "1900" in result.date_end

    # ----- Template mapping -----

    def test_template_mapping_point_in_time(self, extractor):
        """A point_in_time constraint maps to 'point_in_time_state'."""
        result = extractor.extract("Status in 1990")
        assert extractor.get_template_for_constraint(result) == "point_in_time_state"

    def test_template_mapping_between(self, extractor):
        """A between constraint maps to 'events_in_period'."""
        result = extractor.extract("Veranderingen tussen 1990 en 2000")
        assert extractor.get_template_for_constraint(result) == "events_in_period"

    def test_template_mapping_oldest(self, extractor):
        """An oldest constraint maps to 'find_by_founding'."""
        result = extractor.extract("Het oudste museum")
        assert extractor.get_template_for_constraint(result) == "find_by_founding"

    def test_template_mapping_timeline(self, extractor):
        """A timeline constraint maps to 'institution_timeline'."""
        result = extractor.extract("Geschiedenis van het archief")
        assert extractor.get_template_for_constraint(result) == "institution_timeline"

    def test_template_mapping_none(self, extractor):
        """A 'none' constraint maps to no template at all."""
        result = extractor.extract("Welke musea zijn er?")
        assert extractor.get_template_for_constraint(result) is None

    # ----- Confidence scoring -----

    def test_high_confidence_timeline(self, extractor):
        """Timeline queries report a confidence of at least 0.9."""
        result = extractor.extract("Geschiedenis van het Rijksmuseum")
        assert result.confidence >= 0.9

    def test_high_confidence_superlative(self, extractor):
        """Superlative queries report a confidence of at least 0.9."""
        result = extractor.extract("Het oudste archief")
        assert result.confidence >= 0.9

    def test_moderate_confidence_year_only(self, extractor):
        """Year-only queries report a confidence between 0.7 and 0.9."""
        result = extractor.extract("Musea in 1990")
        assert 0.7 <= result.confidence <= 0.9

    def test_lower_confidence_no_constraint(self, extractor):
        """Queries without a constraint report a confidence of at most 0.75."""
        result = extractor.extract("Algemene informatie over erfgoed")
        assert result.confidence <= 0.75
|
|
|
|
|
|
class TestTemporalConstraintDataclass:
    """Checks on the ``TemporalConstraint`` value object itself."""

    def test_default_values(self):
        """Only ``constraint_type`` is given; the other fields take defaults."""
        bare = TemporalConstraint(constraint_type="none")
        # The three optional descriptive fields and the template default to None.
        assert bare.date_start is None
        assert bare.date_end is None
        assert bare.reference_event is None
        assert bare.confidence == 0.8
        assert bare.recommended_template is None

    def test_full_constraint(self):
        """Every field passed to the constructor is stored unchanged."""
        populated = TemporalConstraint(
            constraint_type="between",
            date_start="1990-01-01",
            date_end="2000-12-31",
            reference_event=None,
            confidence=0.95,
            recommended_template="events_in_period",
        )
        assert populated.constraint_type == "between"
        assert (populated.date_start, populated.date_end) == ("1990-01-01", "2000-12-31")
        assert populated.confidence == 0.95
        assert populated.recommended_template == "events_in_period"
|
|
|
|
|
|
class TestTemporalIntentExtractorModule:
    """Checks on the DSPy module wrapper that avoid any real LLM call."""

    def test_module_initialization(self):
        """The constructor stores the threshold and sets up a fast extractor."""
        threshold = 0.75
        intent_module = TemporalIntentExtractorModule(confidence_threshold=threshold)
        assert intent_module.confidence_threshold == threshold
        assert intent_module.fast_extractor is not None

    def test_high_confidence_uses_fast_extraction(self):
        """A high-confidence query is answered by fast extraction, not the LLM."""
        threshold = 0.75
        intent_module = TemporalIntentExtractorModule(confidence_threshold=threshold)

        # A timeline keyword gives a confidence above the threshold.
        outcome = intent_module.forward("Geschiedenis van het Rijksmuseum")

        # The returned constraint should be the fast-extraction result.
        assert outcome.constraint_type == "timeline"
        assert outcome.confidence >= threshold
|
|
|
|
|
|
class TestSingletonInstance:
    """Checks that ``get_temporal_extractor`` hands out a shared instance."""

    def test_get_temporal_extractor_singleton(self):
        """Two consecutive calls return the very same object."""
        first = get_temporal_extractor()
        second = get_temporal_extractor()
        assert first is second

    def test_singleton_is_temporal_constraint_extractor(self):
        """The shared object is a ``TemporalConstraintExtractor``."""
        assert isinstance(get_temporal_extractor(), TemporalConstraintExtractor)
|
|
|
|
|
|
class TestIntegration:
    """Integration tests for the full temporal extraction flow.

    Every test goes through the shared extractor returned by
    ``get_temporal_extractor`` rather than constructing its own instance.
    """

    def test_dutch_point_in_time_full_flow(self):
        """A Dutch single-year query yields a complete point_in_time constraint."""
        extractor = get_temporal_extractor()

        constraint = extractor.extract(
            "Wat was de status van het Rijksmuseum in 1990?"
        )

        assert constraint.constraint_type == "point_in_time"
        assert constraint.date_start == "1990-01-01"
        assert constraint.date_end == "1990-12-31"
        assert constraint.recommended_template == "point_in_time_state"

    def test_english_timeline_full_flow(self):
        """An English 'history' query yields a timeline constraint and template."""
        extractor = get_temporal_extractor()

        constraint = extractor.extract(
            "What is the history of the British Museum?"
        )

        assert constraint.constraint_type == "timeline"
        assert constraint.recommended_template == "institution_timeline"

    def test_date_range_full_flow(self):
        """A merger keyword wins over a two-year range in the same query."""
        extractor = get_temporal_extractor()

        constraint = extractor.extract(
            "Welke fusies vonden plaats tussen 1990 en 2010?"
        )

        # Two years alone would give "between", but the merger keyword
        # ("fusie") takes precedence, so a change_event is expected.
        assert constraint.constraint_type == "change_event"
        assert constraint.reference_event == "merger"

    def test_superlative_with_location(self):
        """A superlative query that also names a location is still 'oldest'."""
        extractor = get_temporal_extractor()

        constraint = extractor.extract(
            "Wat is het oudste archief in Noord-Holland?"
        )

        assert constraint.constraint_type == "oldest"
        assert constraint.recommended_template == "find_by_founding"

    def test_complex_query_multiple_indicators(self):
        """With both timeline and superlative keywords, timeline wins."""
        extractor = get_temporal_extractor()

        # "geschiedenis" (timeline) and "oudste" (oldest) both occur;
        # timeline is checked first.
        constraint = extractor.extract(
            "Vertel me de geschiedenis van de oudste bibliotheek"
        )

        assert constraint.constraint_type == "timeline"

    def test_query_templates_for_sparql(self):
        """Every representative query maps to its expected query template.

        A query whose extraction comes back as ``"none"`` is reported as a
        failure instead of being skipped, so a keyword regression cannot make
        this test pass vacuously. All queries are evaluated before asserting,
        so one run lists every mismatch rather than only the first.
        """
        extractor = get_temporal_extractor()

        test_cases = [
            ("Geschiedenis van het archief", "institution_timeline"),
            ("Het oudste museum", "find_by_founding"),
            ("Het nieuwste archief", "find_by_founding"),
            ("Status in 1990", "point_in_time_state"),
            ("Voor 1950", "point_in_time_state"),  # Year + before
            ("Na 2000", "point_in_time_state"),  # Year + after
            ("Fusies in de regio", "events_in_period"),
            ("Wanneer opgericht", "institution_timeline"),
            ("Wanneer gesloten", "institution_timeline"),
        ]

        mismatches = []
        for query, expected_template in test_cases:
            constraint = extractor.extract(query)
            # No special case for "none": such a constraint is expected to carry
            # no template, so it is recorded as a mismatch like any wrong one.
            if constraint.recommended_template != expected_template:
                mismatches.append(
                    f"Query '{query}' expected template '{expected_template}', "
                    f"got '{constraint.recommended_template}' "
                    f"(constraint_type: {constraint.constraint_type})"
                )

        assert not mismatches, "\n".join(mismatches)
|
|
|
|
|
|
class TestRealWorldQueries:
    """Realistic heritage-domain questions run through the shared extractor."""

    @pytest.fixture
    def extractor(self):
        """Provide the shared extractor from ``get_temporal_extractor``."""
        return get_temporal_extractor()

    def test_noord_hollands_archief_history(self, extractor):
        """History question that also mentions a merger and a year."""
        question = (
            "Wat is de geschiedenis van het Noord-Hollands Archief sinds de fusie in 2001?"
        )
        outcome = extractor.extract(question)
        # "geschiedenis" (timeline) is checked before the merger keyword and the year.
        assert outcome.constraint_type == "timeline"

    def test_museum_founding_date(self, extractor):
        """Question about when a museum was founded."""
        question = "Wanneer is het Rijksmuseum in Amsterdam opgericht?"
        outcome = extractor.extract(question)
        assert outcome.constraint_type == "founding"

    def test_archives_before_ww2(self, extractor):
        """Question about archives that existed before WWII."""
        question = "Welke gemeentearchieven bestonden voor 1940?"
        outcome = extractor.extract(question)
        assert outcome.constraint_type == "before"
        assert "1940" in outcome.date_end

    def test_oldest_university_library(self, extractor):
        """Question about the oldest university library."""
        question = "Wat is de oudste universiteitsbibliotheek van Nederland?"
        outcome = extractor.extract(question)
        assert outcome.constraint_type == "oldest"

    def test_museum_closures_pandemic(self, extractor):
        """Question about museum closures during the pandemic."""
        question = "Welke musea zijn gesloten tijdens de pandemie in 2020?"
        outcome = extractor.extract(question)
        # The closure keyword "gesloten" determines the constraint type.
        assert outcome.constraint_type == "closure"

    def test_digital_archives_recent(self, extractor):
        """Question about digital archives launched after a given year."""
        question = "Welke digitale archieven zijn na 2015 gelanceerd?"
        outcome = extractor.extract(question)
        assert outcome.constraint_type == "after"
        assert "2015" in outcome.date_start
|
|
|
|
|
|
# Run with: pytest backend/rag/test_temporal_intent.py -v
|