glam/backend/rag/test_temporal_intent.py
kempersc 98c42bf272 Fix LinkML URI conflicts and generate RDF outputs
- Fix scope_note → finding_aid_scope_note in FindingAid.yaml
- Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
- Remove duplicate rico_record_set_type from class_metadata_slots.yaml
- Fix range types for equals_string compatibility (uriorcurie → string)
- Move class names from close_mappings to see_also in 10 RecordSetTypes files
- Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
- Sync schemas to frontend/public/schemas/

Files: 1,151 changed (includes prior CustodianType migration)
2026-01-07 12:32:59 +01:00

527 lines
23 KiB
Python

"""
Tests for Temporal Intent Extraction Module
Tests the TemporalConstraintExtractor and TemporalIntentExtractorModule classes
which enable fast LLM-free extraction of temporal constraints from queries.
"""
import pytest
from .temporal_intent import (
TemporalConstraint,
TemporalConstraintExtractor,
TemporalIntentExtractorModule,
get_temporal_extractor,
)
class TestTemporalConstraintExtractor:
    """Unit tests for ``TemporalConstraintExtractor.extract`` and its template lookup.

    Each test feeds one Dutch or English query to the extractor and checks the
    fields of the returned constraint (``constraint_type``, date bounds,
    ``reference_event``, ``recommended_template``, ``confidence``).
    """

    @pytest.fixture
    def extractor(self):
        """Build a new ``TemporalConstraintExtractor`` for the requesting test."""
        fresh_extractor = TemporalConstraintExtractor()
        return fresh_extractor

    # ===== Timeline/History Queries =====

    def test_timeline_dutch_geschiedenis(self, extractor):
        """The Dutch keyword 'geschiedenis' yields a high-confidence timeline."""
        query = "Wat is de geschiedenis van het Rijksmuseum?"
        result = extractor.extract(query)
        assert result.constraint_type == "timeline"
        assert result.recommended_template == "institution_timeline"
        assert result.confidence >= 0.9

    def test_timeline_english_history(self, extractor):
        """The English keyword 'history' yields a timeline constraint."""
        query = "Tell me the history of the British Museum"
        result = extractor.extract(query)
        assert result.constraint_type == "timeline"
        assert result.recommended_template == "institution_timeline"

    def test_timeline_tijdlijn(self, extractor):
        """The Dutch keyword 'tijdlijn' yields a timeline constraint."""
        query = "Geef me een tijdlijn van het Noord-Hollands Archief"
        result = extractor.extract(query)
        assert result.constraint_type == "timeline"

    def test_timeline_evolution(self, extractor):
        """The English keyword 'evolution' yields a timeline constraint."""
        query = "What was the evolution of this archive?"
        result = extractor.extract(query)
        assert result.constraint_type == "timeline"

    # ===== Superlative Queries (Oldest/Newest) =====

    def test_oldest_dutch_oudste(self, extractor):
        """The Dutch superlative 'oudste' yields a high-confidence oldest constraint."""
        query = "Wat is het oudste museum in Nederland?"
        result = extractor.extract(query)
        assert result.constraint_type == "oldest"
        assert result.recommended_template == "find_by_founding"
        assert result.confidence >= 0.9

    def test_oldest_english(self, extractor):
        """The English superlative 'oldest' yields an oldest constraint."""
        query = "What is the oldest library in Amsterdam?"
        result = extractor.extract(query)
        assert result.constraint_type == "oldest"

    def test_oldest_eerste(self, extractor):
        """The Dutch word 'eerste' (first) is treated as an oldest constraint."""
        query = "Welke was de eerste openbare bibliotheek?"
        result = extractor.extract(query)
        assert result.constraint_type == "oldest"

    def test_oldest_earliest(self, extractor):
        """The English word 'earliest' is treated as an oldest constraint."""
        query = "What is the earliest archive in the region?"
        result = extractor.extract(query)
        assert result.constraint_type == "oldest"

    def test_newest_dutch_nieuwste(self, extractor):
        """The Dutch superlative 'nieuwste' yields a newest constraint."""
        query = "Wat is het nieuwste museum?"
        result = extractor.extract(query)
        assert result.constraint_type == "newest"
        assert result.recommended_template == "find_by_founding"

    def test_newest_english_latest(self, extractor):
        """The English word 'latest' yields a newest constraint."""
        query = "What is the latest museum to open?"
        result = extractor.extract(query)
        assert result.constraint_type == "newest"

    def test_newest_most_recent(self, extractor):
        """The English phrase 'most recent' yields a newest constraint."""
        query = "What is the most recent archive established?"
        result = extractor.extract(query)
        assert result.constraint_type == "newest"

    # ===== Change Event Keywords =====

    def test_merger_dutch_fusie(self, extractor):
        """The Dutch keyword 'fusie' yields a change_event with a merger reference."""
        query = "Wanneer was de fusie van het archief?"
        result = extractor.extract(query)
        assert result.constraint_type == "change_event"
        assert result.reference_event == "merger"
        assert result.recommended_template == "events_in_period"

    def test_merger_english(self, extractor):
        """The English keyword 'merger' yields a change_event with a merger reference."""
        query = "When did the merger happen?"
        result = extractor.extract(query)
        assert result.constraint_type == "change_event"
        assert result.reference_event == "merger"

    def test_merger_merged(self, extractor):
        """The English verb form 'merged' yields a change_event constraint."""
        query = "Which archives merged in 2001?"
        result = extractor.extract(query)
        assert result.constraint_type == "change_event"

    def test_founding_dutch_opgericht(self, extractor):
        """The Dutch keyword 'opgericht' yields a founding constraint."""
        query = "Wanneer is het Rijksmuseum opgericht?"
        result = extractor.extract(query)
        assert result.constraint_type == "founding"
        assert result.recommended_template == "institution_timeline"

    def test_founding_english_founded(self, extractor):
        """The English keyword 'founded' yields a founding constraint."""
        query = "When was the library founded?"
        result = extractor.extract(query)
        assert result.constraint_type == "founding"

    def test_founding_established(self, extractor):
        """The English keyword 'established' yields a founding constraint."""
        query = "When was this archive established?"
        result = extractor.extract(query)
        assert result.constraint_type == "founding"

    def test_closure_dutch_gesloten(self, extractor):
        """The Dutch keyword 'gesloten' yields a closure constraint."""
        query = "Wanneer is het museum gesloten?"
        result = extractor.extract(query)
        assert result.constraint_type == "closure"
        assert result.recommended_template == "institution_timeline"

    def test_closure_english_closed(self, extractor):
        """The English past participle 'closed' yields a closure constraint."""
        # The query deliberately uses "closed" rather than the verb "close".
        # NOTE(review): per the original author's note, CLOSURE_KEYWORDS holds
        # only "closed" -- confirm against temporal_intent.
        query = "When was the archive closed?"
        result = extractor.extract(query)
        assert result.constraint_type == "closure"

    def test_closure_dissolved(self, extractor):
        """The English keyword 'dissolved' yields a closure constraint."""
        query = "When was the organization dissolved?"
        result = extractor.extract(query)
        assert result.constraint_type == "closure"

    # ===== Year Extraction =====

    def test_single_year_point_in_time(self, extractor):
        """One year in the query yields point_in_time spanning that whole year."""
        query = "Wat was de status van het museum in 1990?"
        result = extractor.extract(query)
        assert result.constraint_type == "point_in_time"
        assert result.date_start == "1990-01-01"
        assert result.date_end == "1990-12-31"
        assert result.recommended_template == "point_in_time_state"

    def test_two_years_between(self, extractor):
        """Two years yield a between constraint from Jan 1 of the first to Dec 31 of the second."""
        query = "Welke veranderingen waren er tussen 1990 en 2000?"
        result = extractor.extract(query)
        assert result.constraint_type == "between"
        assert result.date_start == "1990-01-01"
        assert result.date_end == "2000-12-31"
        assert result.recommended_template == "events_in_period"

    def test_three_years_uses_first_and_last(self, extractor):
        """With three years, the range is bounded by the first and the last."""
        query = "Musea in 1950, 1975 en 2000"
        result = extractor.extract(query)
        assert result.constraint_type == "between"
        assert result.date_start == "1950-01-01"
        assert result.date_end == "2000-12-31"

    def test_year_with_before_dutch(self, extractor):
        """Dutch 'voor' plus a year yields a before constraint ending on Jan 1."""
        query = "Welke archieven bestonden voor 1950?"
        result = extractor.extract(query)
        assert result.constraint_type == "before"
        assert result.date_end == "1950-01-01"
        assert result.recommended_template == "point_in_time_state"

    def test_year_with_before_english(self, extractor):
        """English 'before' plus a year yields a before constraint ending on Jan 1."""
        query = "Which museums existed before 1900?"
        result = extractor.extract(query)
        assert result.constraint_type == "before"
        assert result.date_end == "1900-01-01"

    def test_year_with_after_dutch(self, extractor):
        """Dutch 'na' plus a year yields an after constraint starting on Dec 31.

        The query avoids founding/closure words such as 'opgericht', because
        those more specific keywords would win over the plain 'na' match.
        """
        query = "Welke veranderingen waren er na 1980?"
        result = extractor.extract(query)
        assert result.constraint_type == "after"
        assert result.date_start == "1980-12-31"
        assert result.recommended_template == "point_in_time_state"

    def test_year_with_after_english(self, extractor):
        """English 'after' plus a year yields an after constraint starting on Dec 31."""
        query = "What happened after 2010?"
        result = extractor.extract(query)
        assert result.constraint_type == "after"
        assert result.date_start == "2010-12-31"

    def test_year_with_since(self, extractor):
        """English 'since' is handled as an after constraint."""
        query = "Museums opened since 2000"
        result = extractor.extract(query)
        assert result.constraint_type == "after"
        assert result.date_start == "2000-12-31"

    # ===== Year Extraction Edge Cases =====

    def test_year_1800s(self, extractor):
        """A nineteenth-century year is recognised as a year."""
        query = "Archieven uit 1856"
        result = extractor.extract(query)
        assert result.constraint_type == "point_in_time"
        assert "1856" in result.date_start

    def test_year_2020s(self, extractor):
        """A year in the 2020s is recognised as a year."""
        query = "Nieuwe musea in 2023"
        result = extractor.extract(query)
        assert result.constraint_type == "point_in_time"
        assert "2023" in result.date_start

    def test_ignore_numbers_that_are_not_years(self, extractor):
        """A plain count such as 500 must not be mistaken for a year."""
        # Short numbers (e.g. 500 or 50) are quantities here, not dates.
        query = "Het museum heeft 500 werken in de collectie"
        result = extractor.extract(query)
        assert result.constraint_type == "none"

    # ===== No Temporal Constraint =====

    def test_no_constraint_simple_query(self, extractor):
        """A query with no temporal wording yields 'none' and no template."""
        query = "Welke musea zijn er in Amsterdam?"
        result = extractor.extract(query)
        assert result.constraint_type == "none"
        assert result.recommended_template is None

    def test_no_constraint_descriptive_query(self, extractor):
        """A purely descriptive query yields 'none'."""
        query = "Vertel me over de collectie van het Rijksmuseum"
        result = extractor.extract(query)
        assert result.constraint_type == "none"

    # ===== Word Boundary Matching =====

    def test_na_in_nationaal_not_matched(self, extractor):
        """The substring 'na' inside 'Nationaal' must not count as 'after'."""
        query = "Nationaal Archief in Den Haag"
        result = extractor.extract(query)
        # 'na' occurs only as part of a longer word, never as its own token.
        assert result.constraint_type == "none"

    def test_na_as_word_is_matched(self, extractor):
        """A standalone 'Na' followed by a year yields an after constraint."""
        query = "Na de renovatie in 1995 werd het museum heropend"
        result = extractor.extract(query)
        assert result.constraint_type == "after"
        assert "1995" in result.date_start

    def test_voor_in_voorwerpen_not_matched(self, extractor):
        """The substring 'voor' inside 'voorwerpen' must not count as 'before'."""
        query = "De collectie bevat voorwerpen uit de 18e eeuw"
        result = extractor.extract(query)
        # The query also carries no explicit four-digit year.
        assert result.constraint_type == "none"

    def test_voor_as_word_is_matched(self, extractor):
        """A standalone 'voor' followed by a year yields a before constraint."""
        query = "Archieven van voor 1900"
        result = extractor.extract(query)
        assert result.constraint_type == "before"
        assert "1900" in result.date_end

    # ===== Template Mapping =====

    def test_template_mapping_point_in_time(self, extractor):
        """get_template_for_constraint maps point_in_time to point_in_time_state."""
        extracted = extractor.extract("Status in 1990")
        assert extractor.get_template_for_constraint(extracted) == "point_in_time_state"

    def test_template_mapping_between(self, extractor):
        """get_template_for_constraint maps between to events_in_period."""
        extracted = extractor.extract("Veranderingen tussen 1990 en 2000")
        assert extractor.get_template_for_constraint(extracted) == "events_in_period"

    def test_template_mapping_oldest(self, extractor):
        """get_template_for_constraint maps oldest to find_by_founding."""
        extracted = extractor.extract("Het oudste museum")
        assert extractor.get_template_for_constraint(extracted) == "find_by_founding"

    def test_template_mapping_timeline(self, extractor):
        """get_template_for_constraint maps timeline to institution_timeline."""
        extracted = extractor.extract("Geschiedenis van het archief")
        assert extractor.get_template_for_constraint(extracted) == "institution_timeline"

    def test_template_mapping_none(self, extractor):
        """get_template_for_constraint returns None for a 'none' constraint."""
        extracted = extractor.extract("Welke musea zijn er?")
        assert extractor.get_template_for_constraint(extracted) is None

    # ===== Confidence Scoring =====

    def test_high_confidence_timeline(self, extractor):
        """Timeline queries report a confidence of at least 0.9."""
        query = "Geschiedenis van het Rijksmuseum"
        result = extractor.extract(query)
        assert result.confidence >= 0.9

    def test_high_confidence_superlative(self, extractor):
        """Superlative queries report a confidence of at least 0.9."""
        query = "Het oudste archief"
        result = extractor.extract(query)
        assert result.confidence >= 0.9

    def test_moderate_confidence_year_only(self, extractor):
        """Year-only queries report a confidence within [0.7, 0.9]."""
        query = "Musea in 1990"
        result = extractor.extract(query)
        assert 0.7 <= result.confidence <= 0.9

    def test_lower_confidence_no_constraint(self, extractor):
        """Queries with no constraint report a confidence of at most 0.75."""
        query = "Algemene informatie over erfgoed"
        result = extractor.extract(query)
        assert result.confidence <= 0.75
class TestTemporalConstraintDataclass:
    """Checks on constructing ``TemporalConstraint`` directly."""

    def test_default_values(self):
        """Only ``constraint_type`` is supplied; every other field keeps its default."""
        bare = TemporalConstraint(constraint_type="none")
        assert bare.date_start is None
        assert bare.date_end is None
        assert bare.reference_event is None
        assert bare.confidence == 0.8
        assert bare.recommended_template is None

    def test_full_constraint(self):
        """Every field is supplied explicitly and read back unchanged."""
        field_values = {
            "constraint_type": "between",
            "date_start": "1990-01-01",
            "date_end": "2000-12-31",
            "reference_event": None,
            "confidence": 0.95,
            "recommended_template": "events_in_period",
        }
        populated = TemporalConstraint(**field_values)
        assert populated.constraint_type == "between"
        assert populated.date_start == "1990-01-01"
        assert populated.date_end == "2000-12-31"
        assert populated.confidence == 0.95
        assert populated.recommended_template == "events_in_period"
class TestTemporalIntentExtractorModule:
    """Tests for ``TemporalIntentExtractorModule`` that are meant to avoid LLM calls."""

    def test_module_initialization(self):
        """The constructor stores the threshold and creates a fast extractor."""
        threshold = 0.75
        module = TemporalIntentExtractorModule(confidence_threshold=threshold)
        assert module.confidence_threshold == threshold
        assert module.fast_extractor is not None

    def test_high_confidence_uses_fast_extraction(self):
        """A timeline query returns a timeline result at or above the threshold."""
        threshold = 0.75
        module = TemporalIntentExtractorModule(confidence_threshold=threshold)
        # The timeline keyword is expected to score above the threshold, so the
        # result should come from the fast extractor.
        # NOTE(review): this test only inspects the returned constraint; it does
        # not itself verify that no LLM call was made.
        outcome = module.forward("Geschiedenis van het Rijksmuseum")
        assert outcome.constraint_type == "timeline"
        assert outcome.confidence >= threshold
class TestSingletonInstance:
    """Checks that ``get_temporal_extractor`` hands out one shared instance."""

    def test_get_temporal_extractor_singleton(self):
        """Two consecutive calls return the very same object."""
        first = get_temporal_extractor()
        second = get_temporal_extractor()
        assert first is second

    def test_singleton_is_temporal_constraint_extractor(self):
        """The shared object is a ``TemporalConstraintExtractor``."""
        shared = get_temporal_extractor()
        assert isinstance(shared, TemporalConstraintExtractor)
class TestIntegration:
    """End-to-end checks that go through the shared extractor from ``get_temporal_extractor``."""

    def test_dutch_point_in_time_full_flow(self):
        """A Dutch single-year query yields a full-year point_in_time constraint."""
        query = "Wat was de status van het Rijksmuseum in 1990?"
        result = get_temporal_extractor().extract(query)
        assert result.constraint_type == "point_in_time"
        assert result.date_start == "1990-01-01"
        assert result.date_end == "1990-12-31"
        assert result.recommended_template == "point_in_time_state"

    def test_english_timeline_full_flow(self):
        """An English 'history' query yields a timeline constraint and template."""
        query = "What is the history of the British Museum?"
        result = get_temporal_extractor().extract(query)
        assert result.constraint_type == "timeline"
        assert result.recommended_template == "institution_timeline"

    def test_date_range_full_flow(self):
        """A merger keyword combined with two years resolves to change_event."""
        query = "Welke fusies vonden plaats tussen 1990 en 2010?"
        result = get_temporal_extractor().extract(query)
        # The query contains both a merger word ("fusies") and a two-year range;
        # the expectation is that the merger keyword takes precedence over
        # the "between" interpretation.
        assert result.constraint_type == "change_event"
        assert result.reference_event == "merger"

    def test_superlative_with_location(self):
        """A superlative query that also names a region still yields oldest."""
        query = "Wat is het oudste archief in Noord-Holland?"
        result = get_temporal_extractor().extract(query)
        assert result.constraint_type == "oldest"
        assert result.recommended_template == "find_by_founding"

    def test_complex_query_multiple_indicators(self):
        """When timeline and oldest keywords both appear, timeline is expected."""
        # "geschiedenis" (timeline) and "oudste" (oldest) are both present;
        # the timeline check is expected to run first and win.
        query = "Vertel me de geschiedenis van de oudste bibliotheek"
        result = get_temporal_extractor().extract(query)
        assert result.constraint_type == "timeline"

    def test_query_templates_for_sparql(self):
        """Each sample query that yields a constraint maps to its expected template."""
        extractor = get_temporal_extractor()
        test_cases = [
            ("Geschiedenis van het archief", "institution_timeline"),
            ("Het oudste museum", "find_by_founding"),
            ("Het nieuwste archief", "find_by_founding"),
            ("Status in 1990", "point_in_time_state"),
            ("Voor 1950", "point_in_time_state"),  # year + before
            ("Na 2000", "point_in_time_state"),  # year + after
            ("Fusies in de regio", "events_in_period"),
            ("Wanneer opgericht", "institution_timeline"),
            ("Wanneer gesloten", "institution_timeline"),
        ]
        for query, expected_template in test_cases:
            constraint = extractor.extract(query)
            # NOTE(review): queries that extract as "none" are skipped rather
            # than failed, so such a case passes without checking anything.
            if constraint.constraint_type == "none":
                continue
            assert constraint.recommended_template == expected_template, (
                f"Query '{query}' expected template '{expected_template}', "
                f"got '{constraint.recommended_template}' "
                f"(constraint_type: {constraint.constraint_type})"
            )
class TestRealWorldQueries:
    """Longer, realistic heritage-domain queries run through the shared extractor."""

    @pytest.fixture
    def extractor(self):
        """Return the extractor obtained from ``get_temporal_extractor``."""
        shared_extractor = get_temporal_extractor()
        return shared_extractor

    def test_noord_hollands_archief_history(self, extractor):
        """A history query that also mentions a merger and a year yields timeline."""
        query = (
            "Wat is de geschiedenis van het Noord-Hollands Archief sinds de fusie in 2001?"
        )
        result = extractor.extract(query)
        # "geschiedenis" is expected to be matched before the merger word or the year.
        assert result.constraint_type == "timeline"

    def test_museum_founding_date(self, extractor):
        """Asking when a museum was 'opgericht' yields a founding constraint."""
        query = "Wanneer is het Rijksmuseum in Amsterdam opgericht?"
        result = extractor.extract(query)
        assert result.constraint_type == "founding"

    def test_archives_before_ww2(self, extractor):
        """'voor 1940' yields a before constraint whose end date mentions 1940."""
        query = "Welke gemeentearchieven bestonden voor 1940?"
        result = extractor.extract(query)
        assert result.constraint_type == "before"
        assert "1940" in result.date_end

    def test_oldest_university_library(self, extractor):
        """'oudste' in a longer query still yields an oldest constraint."""
        query = "Wat is de oudste universiteitsbibliotheek van Nederland?"
        result = extractor.extract(query)
        assert result.constraint_type == "oldest"

    def test_museum_closures_pandemic(self, extractor):
        """'gesloten' together with a year yields a closure constraint."""
        query = "Welke musea zijn gesloten tijdens de pandemie in 2020?"
        result = extractor.extract(query)
        # The closure keyword "gesloten" determines the expected type here.
        assert result.constraint_type == "closure"

    def test_digital_archives_recent(self, extractor):
        """'na 2015' yields an after constraint whose start date mentions 2015."""
        query = "Welke digitale archieven zijn na 2015 gelanceerd?"
        result = extractor.extract(query)
        assert result.constraint_type == "after"
        assert "2015" in result.date_start
# Run with: pytest backend/rag/test_temporal_intent.py -v