glam/backend/rag/temporal_intent.py
kempersc 98c42bf272 Fix LinkML URI conflicts and generate RDF outputs
- Fix scope_note → finding_aid_scope_note in FindingAid.yaml
- Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
- Remove duplicate rico_record_set_type from class_metadata_slots.yaml
- Fix range types for equals_string compatibility (uriorcurie → string)
- Move class names from close_mappings to see_also in 10 RecordSetTypes files
- Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
- Sync schemas to frontend/public/schemas/

Files: 1,151 changed (includes prior CustodianType migration)
2026-01-07 12:32:59 +01:00

311 lines
12 KiB
Python

"""
Temporal Query Intent Extraction for Heritage RAG
Extracts temporal constraints from natural language queries to enable
temporal SPARQL template selection and conflict resolution.
Based on: docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md
"""
import dspy
from dataclasses import dataclass, field
from typing import Optional, Literal
from datetime import datetime
import re
import logging
logger = logging.getLogger(__name__)
@dataclass
class TemporalConstraint:
    """Temporal constraint extracted from a natural language query.

    Attributes:
        constraint_type: Kind of constraint that was detected:
            - "point_in_time": "in 1990", "on January 1, 2000"
            - "before": "before 2000", "vóór de fusie"
            - "after": "after 1995", "na de renovatie"
            - "between": "between 1990 and 2000"
            - "oldest": "oldest museum", "oudste archief"
            - "newest": "newest library", "nieuwste bibliotheek"
            - "founding": "when was X founded", "opgericht"
            - "closure": "when did X close", "gesloten"
            - "change_event": "merger", "split", "relocation"
            - "timeline": "history of", "geschiedenis van"
            - "none": no temporal constraint detected
        date_start: Extracted start date (ISO format or year), if any.
        date_end: Extracted end date (ISO format or year), if any.
        reference_event: Event used as a relative reference,
            e.g. "de fusie", "the merger".
        confidence: Confidence in the extraction (defaults to 0.8).
        recommended_template: Recommended SPARQL template, if any.
    """

    constraint_type: Literal[
        "point_in_time", "before", "after", "between",
        "oldest", "newest", "founding", "closure",
        "change_event", "timeline", "none",
    ]
    date_start: Optional[str] = None
    date_end: Optional[str] = None
    reference_event: Optional[str] = None
    confidence: float = 0.8
    recommended_template: Optional[str] = None
class TemporalConstraintExtractor:
    """
    Fast extraction of temporal constraints without LLM.

    Uses keyword and regular-expression matching for common Dutch and
    English temporal expressions. This class never calls an LLM itself; it
    reports a confidence score so that callers can decide whether a
    low-confidence result should be escalated to a slower extractor.
    """

    # Year patterns
    YEAR_PATTERN = re.compile(r'\b(1[0-9]{3}|20[0-2][0-9])\b')  # 1000-2029
    DATE_PATTERN = re.compile(
        r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2})\b'
    )

    # Dutch temporal keywords
    BEFORE_KEYWORDS_NL = ["voor", "vóór", "voordat", "eerder dan"]
    AFTER_KEYWORDS_NL = ["na", "nadat", "later dan", "sinds"]
    BETWEEN_KEYWORDS_NL = ["tussen", "van", "tot"]
    OLDEST_KEYWORDS_NL = ["oudste", "eerste", "oorspronkelijke"]
    NEWEST_KEYWORDS_NL = ["nieuwste", "laatste", "meest recente"]

    # English temporal keywords
    BEFORE_KEYWORDS_EN = ["before", "prior to", "earlier than"]
    AFTER_KEYWORDS_EN = ["after", "following", "since", "later than"]
    BETWEEN_KEYWORDS_EN = ["between", "from", "to"]
    OLDEST_KEYWORDS_EN = ["oldest", "first", "original", "earliest"]
    NEWEST_KEYWORDS_EN = ["newest", "latest", "most recent"]

    # Event keywords
    FOUNDING_KEYWORDS = ["opgericht", "gesticht", "founded", "established", "created"]
    CLOSURE_KEYWORDS = ["gesloten", "opgeheven", "closed", "dissolved", "terminated"]
    MERGER_KEYWORDS = ["fusie", "samenvoeging", "merger", "merged", "combined"]
    TIMELINE_KEYWORDS = [
        "geschiedenis", "tijdlijn", "history", "timeline", "evolution",
        "door de jaren", "over time", "changes"
    ]

    # Template mapping ("none" is intentionally absent: no template applies)
    TEMPLATE_MAP = {
        "point_in_time": "point_in_time_state",
        "before": "point_in_time_state",
        "after": "point_in_time_state",
        "between": "events_in_period",
        "oldest": "find_by_founding",
        "newest": "find_by_founding",
        "founding": "institution_timeline",
        "closure": "institution_timeline",
        "change_event": "events_in_period",
        "timeline": "institution_timeline",
    }

    def extract(self, query: str) -> TemporalConstraint:
        """
        Extract temporal constraint from query.

        Fast operation using pattern matching. Keyword rules (timeline,
        oldest/newest, merger, founding, closure) take precedence over bare
        years when choosing ``constraint_type``. Any years found in the
        query are still attached as ``date_start`` / ``date_end``, so that
        e.g. "mergers between 1990 and 2000" keeps its period.

        Args:
            query: Natural language query (Dutch or English). ``None`` or
                an empty string yields a "none" constraint.

        Returns:
            TemporalConstraint describing the detected constraint; its
            ``constraint_type`` is "none" when nothing temporal was found.
        """
        if not query:
            # Nothing to analyse; mirror the "no constraint" result instead
            # of failing on ``None.lower()``.
            return TemporalConstraint(constraint_type="none", confidence=0.7)

        query_lower = query.lower()
        relation, date_start, date_end = self._extract_date_bounds(query, query_lower)

        # Keyword rules are checked in priority order; the first hit wins.
        for constraint_type, keywords, confidence, reference_event in self._keyword_rules():
            if any(kw in query_lower for kw in keywords):
                return TemporalConstraint(
                    constraint_type=constraint_type,
                    date_start=date_start,
                    date_end=date_end,
                    reference_event=reference_event,
                    confidence=confidence,
                    recommended_template=self.TEMPLATE_MAP.get(constraint_type),
                )

        # No keyword rule matched: fall back to the relation implied by years.
        if relation is not None:
            return TemporalConstraint(
                constraint_type=relation,
                date_start=date_start,
                date_end=date_end,
                # A bare year without a before/after cue is the weakest signal.
                confidence=0.8 if relation == "point_in_time" else 0.85,
                recommended_template=self.TEMPLATE_MAP.get(relation),
            )

        # No clear temporal constraint
        return TemporalConstraint(constraint_type="none", confidence=0.7)

    def get_template_for_constraint(
        self,
        constraint: TemporalConstraint
    ) -> Optional[str]:
        """Get recommended SPARQL template ID for temporal constraint."""
        return self.TEMPLATE_MAP.get(constraint.constraint_type)

    def _keyword_rules(self) -> tuple:
        """Return (constraint_type, keywords, confidence, reference_event) rules in priority order."""
        return (
            ("timeline", self.TIMELINE_KEYWORDS, 0.9, None),
            ("oldest", self.OLDEST_KEYWORDS_NL + self.OLDEST_KEYWORDS_EN, 0.9, None),
            ("newest", self.NEWEST_KEYWORDS_NL + self.NEWEST_KEYWORDS_EN, 0.9, None),
            ("change_event", self.MERGER_KEYWORDS, 0.85, "merger"),
            ("founding", self.FOUNDING_KEYWORDS, 0.85, None),
            ("closure", self.CLOSURE_KEYWORDS, 0.85, None),
        )

    def _extract_date_bounds(self, query: str, query_lower: str) -> tuple:
        """Return (relation, date_start, date_end) derived from years in the query.

        ``relation`` is "between", "before", "after", "point_in_time", or
        ``None`` when the query contains no year.
        """
        years = self.YEAR_PATTERN.findall(query)
        if not years:
            return None, None, None

        if len(years) >= 2:
            # "between 1990 and 2000": span from the earliest to the latest year.
            ordered = sorted(int(y) for y in years)
            return "between", f"{ordered[0]}-01-01", f"{ordered[-1]}-12-31"

        year = years[0]
        # Before/after cues must match as whole words ("na" must not hit "naam").
        if self._contains_whole_word(
            query_lower, self.BEFORE_KEYWORDS_NL + self.BEFORE_KEYWORDS_EN
        ):
            return "before", None, f"{year}-01-01"
        if self._contains_whole_word(
            query_lower, self.AFTER_KEYWORDS_NL + self.AFTER_KEYWORDS_EN
        ):
            return "after", f"{year}-12-31", None

        # Default: the whole calendar year
        return "point_in_time", f"{year}-01-01", f"{year}-12-31"

    @staticmethod
    def _contains_whole_word(text: str, keywords: list) -> bool:
        """Return True if any keyword occurs in text delimited by word boundaries."""
        if not keywords:
            # An empty alternation would match at every word boundary.
            return False
        # Escape keywords so regex metacharacters in them are taken literally.
        alternation = "|".join(re.escape(kw) for kw in keywords)
        return re.search(rf"\b(?:{alternation})\b", text) is not None
# DSPy Signature for complex temporal extraction
# NOTE(review): DSPy presumably feeds the class docstring and the `desc`
# strings below into the LLM prompt, and field order may matter too — confirm
# against the DSPy docs before rewording or reordering anything here.
class TemporalQueryIntent(dspy.Signature):
    """
    Extract temporal constraints from a heritage institution query.
    Use this for complex queries where pattern matching fails.
    """
    # --- Inputs ---
    query: str = dspy.InputField(desc="Natural language query about heritage institutions")
    language: str = dspy.InputField(desc="Query language: 'nl' or 'en'", default="nl")
    # --- Outputs ---
    # The output names match the TemporalConstraint fields of the same name;
    # the allowed constraint_type values listed here are the same labels as
    # TemporalConstraint.constraint_type.
    constraint_type: str = dspy.OutputField(
        desc="Type of temporal constraint: point_in_time, before, after, between, "
        "oldest, newest, founding, closure, change_event, timeline, none"
    )
    # Dates and the reference event are requested as strings; an empty string
    # stands for "not applicable".
    date_start: str = dspy.OutputField(
        desc="Start date in ISO format (YYYY-MM-DD) or empty string if not applicable"
    )
    date_end: str = dspy.OutputField(
        desc="End date in ISO format (YYYY-MM-DD) or empty string if not applicable"
    )
    reference_event: str = dspy.OutputField(
        desc="Referenced event (e.g., 'fusie', 'merger') or empty string"
    )
    # Requested as a score in the range 0.0-1.0 (see desc).
    confidence: float = dspy.OutputField(
        desc="Confidence score 0.0-1.0"
    )
class TemporalIntentExtractorModule(dspy.Module):
    """
    DSPy module for temporal intent extraction.

    Uses fast pattern matching first and falls back to an LLM
    (``dspy.ChainOfThought`` over ``TemporalQueryIntent``) when the fast
    result's confidence is below ``confidence_threshold``.
    """

    def __init__(self, confidence_threshold: float = 0.75):
        """
        Args:
            confidence_threshold: Minimum confidence of the fast extractor's
                result for it to be returned without consulting the LLM.
        """
        super().__init__()
        self.fast_extractor = TemporalConstraintExtractor()
        self.llm_extractor = dspy.ChainOfThought(TemporalQueryIntent)
        self.confidence_threshold = confidence_threshold

    def forward(self, query: str, language: str = "nl") -> TemporalConstraint:
        """
        Extract temporal constraint from query.

        Args:
            query: Natural language query
            language: Query language ('nl' or 'en')

        Returns:
            TemporalConstraint with extracted information. If the LLM call
            fails, the fast extractor's result is returned instead.
        """
        # Try fast extraction first
        constraint = self.fast_extractor.extract(query)

        # If confidence is high enough, use fast result
        if constraint.confidence >= self.confidence_threshold:
            logger.debug(
                "Fast temporal extraction: %s (conf=%s)",
                constraint.constraint_type,
                constraint.confidence,
            )
            return constraint

        # Fall back to LLM for low confidence cases
        logger.debug("LLM temporal extraction (fast conf=%s)", constraint.confidence)
        try:
            result = self.llm_extractor(query=query, language=language)
            constraint_type = self._normalize_constraint_type(result.constraint_type)
            return TemporalConstraint(
                constraint_type=constraint_type,
                date_start=self._clean_text(result.date_start),
                date_end=self._clean_text(result.date_end),
                reference_event=self._clean_text(result.reference_event),
                confidence=self._parse_confidence(result.confidence),
                # Look the template up with the normalized label so that
                # e.g. "Before" still resolves to a template.
                recommended_template=self.fast_extractor.TEMPLATE_MAP.get(constraint_type),
            )
        except Exception as e:
            # Best effort: any LLM/parsing failure degrades to the fast result.
            logger.warning("LLM temporal extraction failed: %s", e)
            # Return fast extraction result as fallback
            return constraint

    def _normalize_constraint_type(self, value) -> str:
        """Map the LLM's constraint label onto a known type, else "none"."""
        label = str(value).strip().lower() if value else ""
        # TEMPLATE_MAP holds every constraint type except "none".
        if label in self.fast_extractor.TEMPLATE_MAP:
            return label
        if label and label != "none":
            logger.debug("Unrecognized LLM constraint type %r; treating as 'none'", value)
        return "none"

    @staticmethod
    def _clean_text(value) -> Optional[str]:
        """Return the stripped string form of value, or None if missing/blank."""
        if value is None:
            return None
        text = str(value).strip()
        return text or None

    @staticmethod
    def _parse_confidence(value, default: float = 0.7) -> float:
        """Parse an LLM confidence into [0.0, 1.0]; use default if unusable.

        A reported 0.0 is kept as 0.0 (only missing, non-numeric or NaN
        values fall back to ``default``).
        """
        try:
            score = float(value)
        except (TypeError, ValueError):
            return default
        if score != score:  # NaN compares unequal to itself
            return default
        return min(1.0, max(0.0, score))
# Module-level cache for the shared extractor (created lazily on first use)
_temporal_extractor: Optional[TemporalConstraintExtractor] = None


def get_temporal_extractor() -> TemporalConstraintExtractor:
    """Return the shared TemporalConstraintExtractor, creating it on first call."""
    global _temporal_extractor
    if _temporal_extractor is not None:
        return _temporal_extractor
    _temporal_extractor = TemporalConstraintExtractor()
    return _temporal_extractor