- Fix scope_note → finding_aid_scope_note in FindingAid.yaml - Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead) - Remove duplicate rico_record_set_type from class_metadata_slots.yaml - Fix range types for equals_string compatibility (uriorcurie → string) - Move class names from close_mappings to see_also in 10 RecordSetTypes files - Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context - Sync schemas to frontend/public/schemas/ Files: 1,151 changed (includes prior CustodianType migration)
311 lines
12 KiB
Python
311 lines
12 KiB
Python
"""
|
|
Temporal Query Intent Extraction for Heritage RAG
|
|
|
|
Extracts temporal constraints from natural language queries to enable
|
|
temporal SPARQL template selection and conflict resolution.
|
|
|
|
Based on: docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md
|
|
"""
|
|
|
|
import logging
import re
import unicodedata
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Literal

import dspy
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class TemporalConstraint:
    """
    Temporal constraint extracted from a natural language query.

    Attributes:
        constraint_type: Category of the temporal expression that was
            detected; "none" means no temporal constraint was found.
        date_start: Lower date bound as a string (ISO date or year), if any.
        date_end: Upper date bound as a string (ISO date or year), if any.
        reference_event: Event the query refers to for relative
            expressions, e.g. "de fusie" or "the merger".
        confidence: Confidence score attached to the extraction.
        recommended_template: Identifier of the SPARQL template suggested
            for this constraint, if any.
    """

    constraint_type: Literal[
        "point_in_time",  # "in 1990", "on January 1, 2000"
        "before",         # "before 2000", "vóór de fusie"
        "after",          # "after 1995", "na de renovatie"
        "between",        # "between 1990 and 2000"
        "oldest",         # "oldest museum", "oudste archief"
        "newest",         # "newest library", "nieuwste bibliotheek"
        "founding",       # "when was X founded", "opgericht"
        "closure",        # "when did X close", "gesloten"
        "change_event",   # "merger", "split", "relocation"
        "timeline",       # "history of", "geschiedenis van"
        "none",           # No temporal constraint detected
    ]

    # Date bounds extracted from the query (ISO format or year).
    date_start: Optional[str] = None
    date_end: Optional[str] = None

    # Event named by a relative reference, e.g. "de fusie", "the merger".
    reference_event: Optional[str] = None

    # Confidence attached to this extraction.
    confidence: float = 0.8

    # SPARQL template recommended for this constraint.
    recommended_template: Optional[str] = None
|
|
|
|
|
|
class TemporalConstraintExtractor:
    """
    Fast extraction of temporal constraints without LLM.

    Uses keyword and regular-expression matching for common Dutch and
    English temporal expressions. This class never calls an LLM itself;
    its low-confidence results are what a caller can use to decide
    whether a slower extraction path is needed.
    """

    # Year patterns
    YEAR_PATTERN = re.compile(r'\b(1[0-9]{3}|20[0-2][0-9])\b')  # 1000-2029
    # NOTE: DATE_PATTERN is not referenced by extract() in this class.
    DATE_PATTERN = re.compile(
        r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2})\b'
    )

    # Dutch temporal keywords
    BEFORE_KEYWORDS_NL = ["voor", "vóór", "voordat", "eerder dan"]
    AFTER_KEYWORDS_NL = ["na", "nadat", "later dan", "sinds"]
    # NOTE: the BETWEEN_* lists are not referenced by extract(); two or
    # more years in a query are always treated as a "between" range.
    BETWEEN_KEYWORDS_NL = ["tussen", "van", "tot"]
    OLDEST_KEYWORDS_NL = ["oudste", "eerste", "oorspronkelijke"]
    NEWEST_KEYWORDS_NL = ["nieuwste", "laatste", "meest recente"]

    # English temporal keywords
    BEFORE_KEYWORDS_EN = ["before", "prior to", "earlier than"]
    AFTER_KEYWORDS_EN = ["after", "following", "since", "later than"]
    BETWEEN_KEYWORDS_EN = ["between", "from", "to"]
    OLDEST_KEYWORDS_EN = ["oldest", "first", "original", "earliest"]
    NEWEST_KEYWORDS_EN = ["newest", "latest", "most recent"]

    # Event keywords
    FOUNDING_KEYWORDS = ["opgericht", "gesticht", "founded", "established", "created"]
    CLOSURE_KEYWORDS = ["gesloten", "opgeheven", "closed", "dissolved", "terminated"]
    MERGER_KEYWORDS = ["fusie", "samenvoeging", "merger", "merged", "combined"]
    TIMELINE_KEYWORDS = [
        "geschiedenis", "tijdlijn", "history", "timeline", "evolution",
        "door de jaren", "over time", "changes"
    ]

    # Template mapping: the single source of truth for the template id
    # attached to each constraint type ("none" deliberately has no entry).
    TEMPLATE_MAP = {
        "point_in_time": "point_in_time_state",
        "before": "point_in_time_state",
        "after": "point_in_time_state",
        "between": "events_in_period",
        "oldest": "find_by_founding",
        "newest": "find_by_founding",
        "founding": "institution_timeline",
        "closure": "institution_timeline",
        "change_event": "events_in_period",
        "timeline": "institution_timeline",
    }

    def extract(self, query: str) -> TemporalConstraint:
        """
        Extract temporal constraint from query.

        Fast operation using pattern matching. Rules are tried in a fixed
        order and the first hit wins:

        1. timeline/history keywords
        2. superlatives (oldest, then newest)
        3. event keywords (merger, then founding, then closure)
        4. years found in the query (two or more -> "between"; exactly
           one -> "before" / "after" / "point_in_time")
        5. otherwise "none"

        Args:
            query: Natural language query. An empty or missing query
                yields the "none" constraint.

        Returns:
            TemporalConstraint whose recommended_template is taken from
            TEMPLATE_MAP for the detected constraint type.
        """
        if not query:
            return self._make_constraint("none", confidence=0.7)

        # Compose accented characters (NFC) before matching so that a query
        # typed with combining accents (e.g. "vo\u0301o\u0301r") still matches
        # precomposed keywords such as "vóór".
        query_lower = unicodedata.normalize("NFC", query).lower()

        # Phases 1-3: keyword rules, in priority order.
        # NOTE(review): these are plain substring checks, so a keyword also
        # matches inside a longer word (e.g. "history" in "prehistory").
        # Kept as-is because Dutch compound words may rely on it.
        keyword_rules = (
            ("timeline", self.TIMELINE_KEYWORDS, 0.9, None),
            ("oldest", self.OLDEST_KEYWORDS_NL + self.OLDEST_KEYWORDS_EN, 0.9, None),
            ("newest", self.NEWEST_KEYWORDS_NL + self.NEWEST_KEYWORDS_EN, 0.9, None),
            ("change_event", self.MERGER_KEYWORDS, 0.85, "merger"),
            ("founding", self.FOUNDING_KEYWORDS, 0.85, None),
            ("closure", self.CLOSURE_KEYWORDS, 0.85, None),
        )
        for constraint_type, keywords, confidence, reference_event in keyword_rules:
            if any(kw in query_lower for kw in keywords):
                return self._make_constraint(
                    constraint_type,
                    confidence=confidence,
                    reference_event=reference_event,
                )

        # Phase 4: years mentioned in the query.
        years = self.YEAR_PATTERN.findall(query)

        if len(years) >= 2:
            # "between 1990 and 2000": span from the earliest to the latest year.
            ordered = sorted(int(y) for y in years)
            return self._make_constraint(
                "between",
                date_start=f"{ordered[0]}-01-01",
                date_end=f"{ordered[-1]}-12-31",
                confidence=0.85,
            )

        if len(years) == 1:
            year = years[0]

            # Before/after indicators must match as whole words, otherwise
            # short keywords such as "na" would hit inside other words.
            if self._contains_whole_word(
                query_lower, self.BEFORE_KEYWORDS_NL + self.BEFORE_KEYWORDS_EN
            ):
                return self._make_constraint(
                    "before",
                    date_end=f"{year}-01-01",
                    confidence=0.85,
                )

            if self._contains_whole_word(
                query_lower, self.AFTER_KEYWORDS_NL + self.AFTER_KEYWORDS_EN
            ):
                return self._make_constraint(
                    "after",
                    date_start=f"{year}-12-31",
                    confidence=0.85,
                )

            # Default: the whole calendar year as a point in time.
            return self._make_constraint(
                "point_in_time",
                date_start=f"{year}-01-01",
                date_end=f"{year}-12-31",
                confidence=0.8,
            )

        # Phase 5: no clear temporal constraint.
        return self._make_constraint("none", confidence=0.7)

    def get_template_for_constraint(
        self,
        constraint: TemporalConstraint
    ) -> Optional[str]:
        """Get recommended SPARQL template ID for temporal constraint."""
        return self.TEMPLATE_MAP.get(constraint.constraint_type)

    def _make_constraint(
        self,
        constraint_type: str,
        *,
        confidence: float,
        date_start: Optional[str] = None,
        date_end: Optional[str] = None,
        reference_event: Optional[str] = None,
    ) -> TemporalConstraint:
        """Build a constraint, taking its template id from TEMPLATE_MAP."""
        return TemporalConstraint(
            constraint_type=constraint_type,
            date_start=date_start,
            date_end=date_end,
            reference_event=reference_event,
            confidence=confidence,
            recommended_template=self.TEMPLATE_MAP.get(constraint_type),
        )

    @staticmethod
    def _contains_whole_word(text: str, keywords: list) -> bool:
        """Return True if any keyword occurs in text bounded by word breaks."""
        # re.escape keeps a keyword literal even if it ever contains
        # characters that are special in a regular expression.
        return any(
            re.search(rf'\b{re.escape(kw)}\b', text) is not None
            for kw in keywords
        )
|
|
|
|
|
|
# DSPy Signature for complex temporal extraction
|
|
class TemporalQueryIntent(dspy.Signature):
    """
    Extract temporal constraints from a heritage institution query.

    Use this for complex queries where pattern matching fails.
    """

    # NOTE(review): DSPy typically uses a Signature's docstring and the field
    # `desc` texts as prompt material, so both are kept verbatim here.
    # Confirm against the DSPy version in use before rewording them.

    # --- Inputs -----------------------------------------------------------
    query: str = dspy.InputField(
        desc="Natural language query about heritage institutions"
    )
    language: str = dspy.InputField(
        desc="Query language: 'nl' or 'en'",
        default="nl",
    )

    # --- Outputs ----------------------------------------------------------
    # The constraint types listed below mirror the values used for
    # TemporalConstraint.constraint_type elsewhere in this module.
    constraint_type: str = dspy.OutputField(
        desc=(
            "Type of temporal constraint: point_in_time, before, after, between, "
            "oldest, newest, founding, closure, change_event, timeline, none"
        )
    )
    date_start: str = dspy.OutputField(
        desc="Start date in ISO format (YYYY-MM-DD) or empty string if not applicable"
    )
    date_end: str = dspy.OutputField(
        desc="End date in ISO format (YYYY-MM-DD) or empty string if not applicable"
    )
    reference_event: str = dspy.OutputField(
        desc="Referenced event (e.g., 'fusie', 'merger') or empty string"
    )
    confidence: float = dspy.OutputField(
        desc="Confidence score 0.0-1.0"
    )
|
|
|
|
|
|
class TemporalIntentExtractorModule(dspy.Module):
    """
    DSPy module for temporal intent extraction.

    Uses fast pattern matching first, falls back to LLM for complex cases.
    The LLM output is sanitised before use: the constraint type is checked
    against the known types, and the confidence is parsed defensively, so a
    sloppy answer degrades gracefully instead of being discarded wholesale.
    """

    # Confidence used when the LLM answer carries no usable score.
    _DEFAULT_LLM_CONFIDENCE = 0.7

    def __init__(self, confidence_threshold: float = 0.75):
        """
        Args:
            confidence_threshold: Minimum confidence of the fast extraction
                result for it to be returned without consulting the LLM.
        """
        super().__init__()
        self.fast_extractor = TemporalConstraintExtractor()
        self.llm_extractor = dspy.ChainOfThought(TemporalQueryIntent)
        self.confidence_threshold = confidence_threshold

    def forward(self, query: str, language: str = "nl") -> TemporalConstraint:
        """
        Extract temporal constraint from query.

        Args:
            query: Natural language query
            language: Query language ('nl' or 'en')

        Returns:
            TemporalConstraint with extracted information. If the LLM path
            fails for any reason, the fast extraction result is returned.
        """
        # Try fast extraction first
        constraint = self.fast_extractor.extract(query)

        # If confidence is high enough, use fast result
        if constraint.confidence >= self.confidence_threshold:
            logger.debug(
                "Fast temporal extraction: %s (conf=%s)",
                constraint.constraint_type,
                constraint.confidence,
            )
            return constraint

        # Fall back to LLM for low confidence cases
        logger.debug("LLM temporal extraction (fast conf=%s)", constraint.confidence)

        # Deliberately broad: any failure in the LLM call or in reading its
        # answer falls back to the fast result (best-effort boundary).
        try:
            result = self.llm_extractor(query=query, language=language)

            constraint_type = self._normalize_constraint_type(
                getattr(result, "constraint_type", None)
            )
            return TemporalConstraint(
                constraint_type=constraint_type,
                date_start=self._clean_text(getattr(result, "date_start", None)),
                date_end=self._clean_text(getattr(result, "date_end", None)),
                reference_event=self._clean_text(
                    getattr(result, "reference_event", None)
                ),
                confidence=self._parse_confidence(
                    getattr(result, "confidence", None)
                ),
                recommended_template=self.fast_extractor.TEMPLATE_MAP.get(
                    constraint_type
                ),
            )
        except Exception as e:
            logger.warning("LLM temporal extraction failed: %s", e)
            # Return fast extraction result as fallback
            return constraint

    def _normalize_constraint_type(self, raw) -> str:
        """Map the LLM's constraint type onto a known type, else "none"."""
        # Tolerate surrounding whitespace/quotes and different casing.
        candidate = str(raw).strip().strip("'\"`").lower() if raw else ""
        if candidate == "none" or candidate in self.fast_extractor.TEMPLATE_MAP:
            return candidate
        if candidate:
            logger.debug("Unrecognised LLM constraint type %r; using 'none'", raw)
        return "none"

    @staticmethod
    def _clean_text(value) -> Optional[str]:
        """Return the stripped text of value, or None if missing or blank."""
        if value is None:
            return None
        text = str(value).strip()
        return text or None

    @classmethod
    def _parse_confidence(cls, raw) -> float:
        """Parse a confidence score, clamped to [0.0, 1.0].

        Missing, blank, non-numeric or NaN values yield the default score.
        An explicit 0.0 is preserved rather than replaced by the default.
        """
        if raw is None or raw == "":
            return cls._DEFAULT_LLM_CONFIDENCE
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return cls._DEFAULT_LLM_CONFIDENCE
        if value != value:  # NaN is the only float not equal to itself
            return cls._DEFAULT_LLM_CONFIDENCE
        return min(1.0, max(0.0, value))
|
|
|
|
|
|
# Singleton instance
|
|
# Lazily created by get_temporal_extractor(); None until the first call.
_temporal_extractor: Optional[TemporalConstraintExtractor] = None


def get_temporal_extractor() -> TemporalConstraintExtractor:
    """Return the shared TemporalConstraintExtractor, creating it on first use."""
    global _temporal_extractor
    instance = _temporal_extractor
    if instance is None:
        # First call: build the extractor and remember it for later calls.
        instance = TemporalConstraintExtractor()
        _temporal_extractor = instance
    return instance
|