"""
Temporal Query Intent Extraction for Heritage RAG

Extracts temporal constraints from natural language queries to enable
temporal SPARQL template selection and conflict resolution.

Based on: docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md
"""

import dspy
from dataclasses import dataclass, field
from typing import Optional, Literal, get_args
from datetime import datetime
import re
import logging

logger = logging.getLogger(__name__)


# Closed set of constraint kinds. Kept as a module-level alias so the same
# list drives both the dataclass annotation and validation of LLM output.
ConstraintType = Literal[
    "point_in_time",  # "in 1990", "on January 1, 2000"
    "before",         # "before 2000", "vóór de fusie"
    "after",          # "after 1995", "na de renovatie"
    "between",        # "between 1990 and 2000"
    "oldest",         # "oldest museum", "oudste archief"
    "newest",         # "newest library", "nieuwste bibliotheek"
    "founding",       # "when was X founded", "opgericht"
    "closure",        # "when did X close", "gesloten"
    "change_event",   # "merger", "split", "relocation"
    "timeline",       # "history of", "geschiedenis van"
    "none"            # No temporal constraint detected
]

_VALID_CONSTRAINT_TYPES = frozenset(get_args(ConstraintType))


@dataclass
class TemporalConstraint:
    """Extracted temporal constraint from a query."""

    constraint_type: ConstraintType

    # Extracted dates (ISO format or year)
    date_start: Optional[str] = None
    date_end: Optional[str] = None

    # For relative references
    reference_event: Optional[str] = None  # e.g., "de fusie", "the merger"

    # Confidence
    confidence: float = 0.8

    # Recommended SPARQL template
    recommended_template: Optional[str] = None


class TemporalConstraintExtractor:
    """
    Fast extraction of temporal constraints without LLM.

    Uses pattern matching for common temporal expressions. This class never
    calls an LLM itself; ``TemporalIntentExtractorModule`` wraps it and adds
    the LLM fallback for low-confidence results.
    """

    # Year patterns
    YEAR_PATTERN = re.compile(r'\b(1[0-9]{3}|20[0-2][0-9])\b')  # 1000-2029
    DATE_PATTERN = re.compile(
        r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2})\b'
    )

    # Dutch temporal keywords
    BEFORE_KEYWORDS_NL = ["voor", "vóór", "voordat", "eerder dan"]
    AFTER_KEYWORDS_NL = ["na", "nadat", "later dan", "sinds"]
    BETWEEN_KEYWORDS_NL = ["tussen", "van", "tot"]
    OLDEST_KEYWORDS_NL = ["oudste", "eerste", "oorspronkelijke"]
    NEWEST_KEYWORDS_NL = ["nieuwste", "laatste", "meest recente"]

    # English temporal keywords
    BEFORE_KEYWORDS_EN = ["before", "prior to", "earlier than"]
    AFTER_KEYWORDS_EN = ["after", "following", "since", "later than"]
    BETWEEN_KEYWORDS_EN = ["between", "from", "to"]
    OLDEST_KEYWORDS_EN = ["oldest", "first", "original", "earliest"]
    NEWEST_KEYWORDS_EN = ["newest", "latest", "most recent"]

    # Event keywords
    FOUNDING_KEYWORDS = ["opgericht", "gesticht", "founded", "established", "created"]
    CLOSURE_KEYWORDS = ["gesloten", "opgeheven", "closed", "dissolved", "terminated"]
    MERGER_KEYWORDS = ["fusie", "samenvoeging", "merger", "merged", "combined"]
    TIMELINE_KEYWORDS = [
        "geschiedenis", "tijdlijn", "history", "timeline", "evolution",
        "door de jaren", "over time", "changes"
    ]

    # Template mapping ("none" is intentionally absent -> no template)
    TEMPLATE_MAP = {
        "point_in_time": "point_in_time_state",
        "before": "point_in_time_state",
        "after": "point_in_time_state",
        "between": "events_in_period",
        "oldest": "find_by_founding",
        "newest": "find_by_founding",
        "founding": "institution_timeline",
        "closure": "institution_timeline",
        "change_event": "events_in_period",
        "timeline": "institution_timeline",
    }

    @staticmethod
    def _contains_any(text: str, keywords) -> bool:
        """Return True if any keyword occurs in *text* as a plain substring."""
        return any(kw in text for kw in keywords)

    @staticmethod
    def _contains_word(text: str, keywords) -> bool:
        """Return True if any keyword occurs in *text* on word boundaries.

        Keywords are escaped so that a keyword containing regex
        metacharacters is matched literally instead of altering the pattern.
        """
        return any(
            re.search(rf'\b{re.escape(kw)}\b', text) for kw in keywords
        )

    def _build(self, constraint_type: str, **fields) -> TemporalConstraint:
        """Create a constraint whose template is looked up in TEMPLATE_MAP.

        Centralising the lookup keeps ``extract`` and
        ``get_template_for_constraint`` in agreement.
        """
        return TemporalConstraint(
            constraint_type=constraint_type,
            recommended_template=self.TEMPLATE_MAP.get(constraint_type),
            **fields,
        )

    def extract(self, query: str) -> TemporalConstraint:
        """
        Extract temporal constraint from query.

        Fast operation using pattern matching. Checks run in priority order
        and the first match wins: timeline keywords, superlatives, change
        events (merger / founding / closure), then explicit years.

        Args:
            query: Natural language query (Dutch or English).

        Returns:
            TemporalConstraint describing the detected constraint, or one with
            ``constraint_type="none"`` (confidence 0.7) when nothing matched.
        """
        query_lower = query.lower()

        # 1. Check for timeline/history queries
        if self._contains_any(query_lower, self.TIMELINE_KEYWORDS):
            return self._build("timeline", confidence=0.9)

        # 2. Check for superlatives (oldest/newest)
        if self._contains_any(
            query_lower, self.OLDEST_KEYWORDS_NL + self.OLDEST_KEYWORDS_EN
        ):
            return self._build("oldest", confidence=0.9)

        if self._contains_any(
            query_lower, self.NEWEST_KEYWORDS_NL + self.NEWEST_KEYWORDS_EN
        ):
            return self._build("newest", confidence=0.9)

        # 3. Check for change event keywords
        if self._contains_any(query_lower, self.MERGER_KEYWORDS):
            return self._build(
                "change_event", reference_event="merger", confidence=0.85
            )

        if self._contains_any(query_lower, self.FOUNDING_KEYWORDS):
            return self._build("founding", confidence=0.85)

        if self._contains_any(query_lower, self.CLOSURE_KEYWORDS):
            return self._build("closure", confidence=0.85)

        # 4. Extract years from query
        years = self.YEAR_PATTERN.findall(query)

        if len(years) >= 2:
            # "between 1990 and 2000": span from the earliest to the latest year
            ordered = sorted(int(y) for y in years)
            return self._build(
                "between",
                date_start=f"{ordered[0]}-01-01",
                date_end=f"{ordered[-1]}-12-31",
                confidence=0.85,
            )

        if len(years) == 1:
            year = years[0]

            # Word-boundary matching here: short keywords such as "na" would
            # otherwise match inside unrelated words.
            if self._contains_word(
                query_lower, self.BEFORE_KEYWORDS_NL + self.BEFORE_KEYWORDS_EN
            ):
                # Strictly before the year begins.
                return self._build(
                    "before", date_end=f"{year}-01-01", confidence=0.85
                )

            if self._contains_word(
                query_lower, self.AFTER_KEYWORDS_NL + self.AFTER_KEYWORDS_EN
            ):
                # Strictly after the year ends.
                # NOTE(review): "since"/"sinds" are in the after-keyword lists
                # and therefore also start at Dec 31 — confirm whether they
                # should include the named year.
                return self._build(
                    "after", date_start=f"{year}-12-31", confidence=0.85
                )

            # Default: point in time covering the whole calendar year
            return self._build(
                "point_in_time",
                date_start=f"{year}-01-01",
                date_end=f"{year}-12-31",
                confidence=0.8,
            )

        # 5. No clear temporal constraint
        return self._build("none", confidence=0.7)

    def get_template_for_constraint(
        self,
        constraint: TemporalConstraint
    ) -> Optional[str]:
        """Get recommended SPARQL template ID for temporal constraint."""
        return self.TEMPLATE_MAP.get(constraint.constraint_type)


# DSPy Signature for complex temporal extraction
class TemporalQueryIntent(dspy.Signature):
    """
    Extract temporal constraints from a heritage institution query.

    Use this for complex queries where pattern matching fails.
    """
    query: str = dspy.InputField(desc="Natural language query about heritage institutions")
    language: str = dspy.InputField(desc="Query language: 'nl' or 'en'", default="nl")

    constraint_type: str = dspy.OutputField(
        desc="Type of temporal constraint: point_in_time, before, after, between, "
             "oldest, newest, founding, closure, change_event, timeline, none"
    )
    date_start: str = dspy.OutputField(
        desc="Start date in ISO format (YYYY-MM-DD) or empty string if not applicable"
    )
    date_end: str = dspy.OutputField(
        desc="End date in ISO format (YYYY-MM-DD) or empty string if not applicable"
    )
    reference_event: str = dspy.OutputField(
        desc="Referenced event (e.g., 'fusie', 'merger') or empty string"
    )
    confidence: float = dspy.OutputField(
        desc="Confidence score 0.0-1.0"
    )


class TemporalIntentExtractorModule(dspy.Module):
    """
    DSPy module for temporal intent extraction.

    Uses fast pattern matching first, falls back to LLM for complex cases.
    """

    def __init__(self, confidence_threshold: float = 0.75):
        super().__init__()
        self.fast_extractor = TemporalConstraintExtractor()
        self.llm_extractor = dspy.ChainOfThought(TemporalQueryIntent)
        self.confidence_threshold = confidence_threshold

    @staticmethod
    def _normalize_constraint_type(raw) -> str:
        """Map free-form LLM output onto a known constraint type.

        Case, surrounding whitespace and space/hyphen separators are
        normalised; anything not in the known set becomes ``"none"``.
        """
        if not raw:
            return "none"
        candidate = str(raw).strip().lower().replace("-", "_").replace(" ", "_")
        if candidate not in _VALID_CONSTRAINT_TYPES:
            logger.debug("Unknown LLM constraint type %r; using 'none'", raw)
            return "none"
        return candidate

    @staticmethod
    def _parse_confidence(raw, default: float = 0.7) -> float:
        """Convert LLM confidence output to a float in [0.0, 1.0].

        Missing, non-numeric or NaN values yield *default*. A genuine 0.0 is
        preserved rather than being treated as missing.
        """
        if raw is None or raw == "":
            return default
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return default
        if value != value:  # NaN is the only value unequal to itself
            return default
        return min(1.0, max(0.0, value))

    def forward(self, query: str, language: str = "nl") -> TemporalConstraint:
        """
        Extract temporal constraint from query.

        Args:
            query: Natural language query
            language: Query language ('nl' or 'en')

        Returns:
            TemporalConstraint with extracted information. If the LLM call or
            reading its result raises, the fast pattern-matching result is
            returned instead.
        """
        # Try fast extraction first
        constraint = self.fast_extractor.extract(query)

        # If confidence is high enough, use fast result
        if constraint.confidence >= self.confidence_threshold:
            logger.debug(
                "Fast temporal extraction: %s (conf=%s)",
                constraint.constraint_type,
                constraint.confidence,
            )
            return constraint

        # Fall back to LLM for low confidence cases
        logger.debug(
            "LLM temporal extraction (fast conf=%s)", constraint.confidence
        )

        try:
            result = self.llm_extractor(query=query, language=language)
            constraint_type = self._normalize_constraint_type(
                result.constraint_type
            )

            return TemporalConstraint(
                constraint_type=constraint_type,
                # Empty strings from the LLM mean "not applicable".
                date_start=result.date_start or None,
                date_end=result.date_end or None,
                reference_event=result.reference_event or None,
                confidence=self._parse_confidence(result.confidence),
                recommended_template=self.fast_extractor.TEMPLATE_MAP.get(
                    constraint_type
                ),
            )
        except Exception as e:
            # Deliberate best-effort boundary: any LLM/backend failure must
            # not break the query pipeline.
            logger.warning("LLM temporal extraction failed: %s", e)
            # Return fast extraction result as fallback
            return constraint


# Singleton instance
_temporal_extractor: Optional[TemporalConstraintExtractor] = None


def get_temporal_extractor() -> TemporalConstraintExtractor:
    """Get or create singleton temporal extractor instance."""
    global _temporal_extractor
    if _temporal_extractor is None:
        _temporal_extractor = TemporalConstraintExtractor()
    return _temporal_extractor