fix: filter DSPy field markers from streaming output

Implements a state machine to filter streaming tokens:
- Only stream tokens from the 'answer' field to the frontend
- Skip tokens from 'reasoning', 'citations', 'confidence', 'follow_up' fields
- Remove DSPy field markers like '[[ ## answer ## ]]' from streamed content

This fixes the issue where raw DSPy signature field markers were being
displayed in the chat interface instead of clean answer text.
This commit is contained in:
kempersc 2025-12-26 03:11:44 +01:00
parent 6b9fa33767
commit fb7993e3af

View file

@ -21,6 +21,7 @@ import asyncio
import json
import logging
import random
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
@ -166,6 +167,7 @@ get_cacheable_sparql_docstring: Optional[Callable[[], str]] = None
get_cacheable_entity_docstring: Optional[Callable[[], str]] = None
get_cacheable_query_intent_docstring: Optional[Callable[[], str]] = None
get_cacheable_answer_docstring: Optional[Callable[[], str]] = None
get_cacheable_person_sparql_docstring: Optional[Callable[[], str]] = None
try:
from .schema_loader import (
@ -184,6 +186,7 @@ try:
get_cacheable_entity_docstring as _get_cacheable_entity_docstring,
get_cacheable_query_intent_docstring as _get_cacheable_query_intent_docstring,
get_cacheable_answer_docstring as _get_cacheable_answer_docstring,
get_cacheable_person_sparql_docstring as _get_cacheable_person_sparql_docstring,
)
get_heritage_schema = _get_heritage_schema
get_sparql_prefixes = _get_sparql_prefixes
@ -200,6 +203,7 @@ try:
get_cacheable_entity_docstring = _get_cacheable_entity_docstring
get_cacheable_query_intent_docstring = _get_cacheable_query_intent_docstring
get_cacheable_answer_docstring = _get_cacheable_answer_docstring
get_cacheable_person_sparql_docstring = _get_cacheable_person_sparql_docstring
SCHEMA_LOADER_AVAILABLE = True
except ImportError:
logger.info("Schema loader not available - using static signatures")
@ -473,6 +477,123 @@ class HeritageSPARQLGenerator(dspy.Signature):
explanation: str = dspy.OutputField(desc="What the query retrieves")
class HeritagePersonSPARQLGenerator(dspy.Signature):
    # DSPy signature for person/staff SPARQL generation.
    # NOTE: the docstring below is sent to the LLM verbatim as the prompt —
    # edits to its text change generation behavior, not just documentation.
    """Generate SPARQL queries for heritage person/staff queries.
    You are an expert in SPARQL and the Heritage Person data model.
    Generate valid SPARQL queries for finding people in heritage institutions.
    REQUIRED PREFIXES:
    PREFIX schema: <http://schema.org/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX hc: <https://nde.nl/ontology/hc/>
    MAIN CLASS:
    - schema:Person - Person records
    KEY PROPERTIES:
    - schema:name - Person's full name (REQUIRED in SELECT)
    - schema:jobTitle - Job title with embedded organization (e.g., "Manager bij Nationaal Archief")
    - foaf:name - Alternative name field
    - hc:custodianName - Associated institution name (may be empty)
    CRITICAL PATTERN:
    Organization names are often embedded IN the jobTitle, not in a separate field.
    Use FILTER(CONTAINS(LCASE(?jobTitle), "organization name")) to find people at specific organizations.
    ROLE TERMS (use in FILTER patterns with OR combinations):
    Leadership (English): director, executive director, CEO, deputy director, assistant director,
    head, chief, manager, team lead, coordinator, supervisor
    Leadership (Dutch): directeur, adjunct-directeur, hoofd, manager, teamleider, teammanager,
    coördinator, leidinggevende, afdelingshoofd
    Governance (English): chair, chairman, chairperson, president, vice president, secretary,
    treasurer, board member, trustee
    Governance (Dutch): voorzitter, vice-voorzitter, secretaris, penningmeester, bestuurslid,
    bestuursvoorzitter
    Curatorial (English): curator, senior curator, chief curator, collections manager,
    registrar, conservator
    Curatorial (Dutch): conservator, collectiebeheerder, registrar
    Archival (English): archivist, senior archivist, digital archivist, records manager,
    archival manager, processing archivist
    Archival (Dutch): archivaris, archiefmedewerker, informatiespecialist
    Library (English): librarian, chief librarian, reference librarian, cataloger
    Library (Dutch): bibliothecaris, catalogiseur
    Research (English): researcher, historian, genealogist, research fellow
    Research (Dutch): onderzoeker, historicus, genealoog
    Digital (English): digital preservation specialist, digitization specialist, data manager,
    metadata specialist, developer, IT specialist
    Digital (Dutch): digitaliseringsmedewerker, datamanager, ICT-medewerker
    Education (English): educator, education officer, tour guide, docent
    Education (Dutch): educatiemedewerker, gids, rondleider
    ALWAYS EXCLUDE anonymous profiles:
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    EXAMPLE QUERY - Find managers at Nationaal Archief:
    ```sparql
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?name ?jobTitle WHERE {
    ?person a schema:Person ;
    schema:name ?name ;
    schema:jobTitle ?jobTitle .
    FILTER(CONTAINS(LCASE(?jobTitle), "nationaal archief"))
    FILTER(CONTAINS(LCASE(?jobTitle), "manager") ||
    CONTAINS(LCASE(?jobTitle), "hoofd") ||
    CONTAINS(LCASE(?jobTitle), "directeur") ||
    CONTAINS(LCASE(?jobTitle), "teamleider"))
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    }
    ORDER BY ?name
    LIMIT 50
    ```
    EXAMPLE QUERY - Find all archivists:
    ```sparql
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?name ?jobTitle WHERE {
    ?person a schema:Person ;
    schema:name ?name ;
    schema:jobTitle ?jobTitle .
    FILTER(CONTAINS(LCASE(?jobTitle), "archiv") ||
    CONTAINS(LCASE(?jobTitle), "archivist"))
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    }
    ORDER BY ?name
    LIMIT 100
    ```
    EXAMPLE QUERY - Find curators at a specific museum:
    ```sparql
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?name ?jobTitle WHERE {
    ?person a schema:Person ;
    schema:name ?name ;
    schema:jobTitle ?jobTitle .
    FILTER(CONTAINS(LCASE(?jobTitle), "rijksmuseum"))
    FILTER(CONTAINS(LCASE(?jobTitle), "curator") ||
    CONTAINS(LCASE(?jobTitle), "conservator"))
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    }
    ORDER BY ?name
    ```
    """

    # Inputs consumed by the generator. The `desc` strings are part of the
    # prompt the LLM sees; keep them stable.
    question: str = dspy.InputField(desc="Natural language question about people in heritage sector")
    intent: str = dspy.InputField(desc="Query intent from classifier")
    # NOTE(review): `default=[]` is a mutable default — presumably dspy copies
    # field defaults per call, but confirm it is not shared across invocations.
    entities: list[str] = dspy.InputField(desc="Extracted entities (names, organizations, roles)", default=[])
    context: str = dspy.InputField(desc="Previous conversation context", default="")
    # Outputs parsed back out of the LLM response by dspy.
    sparql: str = dspy.OutputField(desc="Valid SPARQL query for person data using schema:Person")
    explanation: str = dspy.OutputField(desc="What the query retrieves and which roles/organizations are targeted")
class HeritageSQLGenerator(dspy.Signature):
"""Generate SQL queries for DuckLake heritage analytics database.
@ -809,6 +930,8 @@ CRITICAL LANGUAGE RULE:
# Module-level caches for lazily created schema-aware DSPy signature classes.
# Each slot is populated on first call of its get_schema_aware_*() accessor.
# Fix: the person-SPARQL cache variable was assigned twice (duplicate line,
# likely a merge/copy-paste artifact); one assignment suffices.
_schema_aware_sparql_signature = None
_schema_aware_entity_signature = None
_schema_aware_answer_signature = None
_schema_aware_person_sparql_signature = None
def get_schema_aware_sparql_signature() -> type[dspy.Signature]:
@ -835,6 +958,48 @@ def get_schema_aware_answer_signature() -> type[dspy.Signature]:
return _schema_aware_answer_signature
def _create_schema_aware_person_sparql_signature() -> type[dspy.Signature]:
    """Factory to create Person SPARQL signature with schema-derived docstring.

    Uses LinkML schema to inject correct prefixes and properties from
    PersonObservation slot_usage into the signature docstring.

    OpenAI Prompt Caching: Uses get_cacheable_person_sparql_docstring() which
    prepends the full ontology context (1,200+ tokens) to ensure cache eligibility.

    Returns:
        A dspy.Signature subclass — the schema-aware class when the schema
        loader is available and succeeds, otherwise the static
        HeritagePersonSPARQLGenerator fallback.
    """
    # Fall back to the static signature when the schema loader (or the
    # docstring accessor re-exported from it) is unavailable.
    if not SCHEMA_LOADER_AVAILABLE or get_cacheable_person_sparql_docstring is None:
        logger.warning("Schema loader unavailable, using static person SPARQL signature")
        return HeritagePersonSPARQLGenerator
    try:
        # Use cacheable docstring (1,500+ tokens) for OpenAI prompt caching
        docstring = get_cacheable_person_sparql_docstring()

        # Assigning __doc__ inside the class body injects the dynamic prompt;
        # the dspy.Signature metaclass reads it when building the prompt.
        class SchemaAwarePersonSPARQLGenerator(dspy.Signature):
            __doc__ = docstring
            question: str = dspy.InputField(desc="Natural language question about people in heritage sector")
            intent: str = dspy.InputField(desc="Query intent from classifier")
            entities: list[str] = dspy.InputField(desc="Extracted entities (names, organizations, roles)", default=[])
            context: str = dspy.InputField(desc="Previous conversation context", default="")
            sparql: str = dspy.OutputField(desc="Valid SPARQL query for person data using schema:Person")
            explanation: str = dspy.OutputField(desc="What the query retrieves and which roles/organizations are targeted")

        return SchemaAwarePersonSPARQLGenerator
    except Exception as e:
        # Broad catch is deliberate: any schema-loading failure degrades
        # gracefully to the static signature instead of breaking the pipeline.
        logger.warning(f"Failed to create schema-aware person SPARQL signature: {e}")
        return HeritagePersonSPARQLGenerator
def get_schema_aware_person_sparql_signature() -> type[dspy.Signature]:
    """Return the person SPARQL signature class, building and caching it on first use.

    Memoizes the factory result in the module-level
    ``_schema_aware_person_sparql_signature`` slot so the (potentially
    expensive) schema-aware class construction runs at most once per process.
    """
    global _schema_aware_person_sparql_signature
    cached = _schema_aware_person_sparql_signature
    if cached is not None:
        return cached
    _schema_aware_person_sparql_signature = _create_schema_aware_person_sparql_signature()
    return _schema_aware_person_sparql_signature
def _create_schema_aware_query_intent_signature() -> type[dspy.Signature]:
"""Factory to create HeritageQueryIntent with schema-derived valid values.
@ -2923,12 +3088,21 @@ class HeritageRAGPipeline(dspy.Module):
self.entity_extractor = dspy.Predict(HeritageEntityExtractor)
# SPARQL generation - use schema-aware signature if available
# Institution SPARQL generator (crm:E39_Actor, hc:institutionType)
if use_schema_aware and SCHEMA_LOADER_AVAILABLE:
self.sparql_gen = dspy.ChainOfThought(get_schema_aware_sparql_signature())
logger.info("Using schema-aware SPARQL generator with LinkML-derived prefixes")
else:
self.sparql_gen = dspy.ChainOfThought(HeritageSPARQLGenerator)
# Person SPARQL generator - use schema-aware signature if available
if use_schema_aware and SCHEMA_LOADER_AVAILABLE:
self.person_sparql_gen = dspy.ChainOfThought(get_schema_aware_person_sparql_signature())
logger.info("Using schema-aware Person SPARQL generator with LinkML-derived properties")
else:
self.person_sparql_gen = dspy.ChainOfThought(HeritagePersonSPARQLGenerator)
logger.info("Person SPARQL generator initialized with static predicates")
# Multi-hop retrieval (uses its own signatures internally)
self.multi_hop = MultiHopHeritageRetriever(max_hops=max_hops)
@ -3089,20 +3263,32 @@ class HeritageRAGPipeline(dspy.Module):
# Step 3: Generate SPARQL if needed (use resolved question)
# Use fast_lm for SPARQL generation if available (performance optimization)
# Select the appropriate SPARQL generator based on entity_type:
# - person queries use HeritagePersonSPARQLGenerator (schema:Person predicates)
# - institution queries use HeritageSPARQLGenerator (crm:E39_Actor predicates)
sparql = None
if "sparql" in routing.sources:
# Select SPARQL generator based on entity_type from router
entity_type = getattr(routing, 'entity_type', 'institution')
if entity_type == "person":
sparql_generator = self.person_sparql_gen
logger.debug(f"Using person SPARQL generator for entity_type='{entity_type}'")
else:
sparql_generator = self.sparql_gen
logger.debug(f"Using institution SPARQL generator for entity_type='{entity_type}'")
if tracker:
with tracker.track_llm_call("gpt-4o-mini") as llm_usage:
if self.fast_lm:
with dspy.settings.context(lm=self.fast_lm):
sparql_result = self.sparql_gen(
sparql_result = sparql_generator(
question=resolved_question,
intent=routing.intent,
entities=routing.entities,
context="",
)
else:
sparql_result = self.sparql_gen(
sparql_result = sparql_generator(
question=resolved_question,
intent=routing.intent,
entities=routing.entities,
@ -3111,14 +3297,14 @@ class HeritageRAGPipeline(dspy.Module):
else:
if self.fast_lm:
with dspy.settings.context(lm=self.fast_lm):
sparql_result = self.sparql_gen(
sparql_result = sparql_generator(
question=resolved_question,
intent=routing.intent,
entities=routing.entities,
context="",
)
else:
sparql_result = self.sparql_gen(
sparql_result = sparql_generator(
question=resolved_question,
intent=routing.intent,
entities=routing.entities,
@ -3697,6 +3883,14 @@ class HeritageRAGPipeline(dspy.Module):
retry_count = 0
max_stream_retries = 2
# DSPy field marker pattern for filtering streaming output
# DSPy emits markers like [[ ## answer ## ]], [[ ## reasoning ## ]], etc.
DSPY_FIELD_MARKER = re.compile(r'\[\[\s*##\s*(\w+)\s*##\s*\]\]')
# State machine for DSPy streaming: only stream 'answer' field tokens
current_field = None # Track which DSPy output field we're in
STREAMABLE_FIELDS = {'answer'} # Only stream these fields to frontend
while not streaming_succeeded and retry_count <= max_stream_retries:
try:
# Create streamified version of the answer generator
@ -3721,8 +3915,21 @@ class HeritageRAGPipeline(dspy.Module):
follow_up = getattr(value, 'follow_up', [])
streaming_succeeded = True
elif isinstance(value, str):
# Streaming token
yield {"type": "token", "content": value}
# Filter DSPy field markers and only stream answer field
# Check if this token contains a field marker
marker_match = DSPY_FIELD_MARKER.search(value)
if marker_match:
# Update current field state
current_field = marker_match.group(1).lower()
# Remove the marker from the token
cleaned_token = DSPY_FIELD_MARKER.sub('', value).strip()
# Only yield if we're in a streamable field and have content
if current_field in STREAMABLE_FIELDS and cleaned_token:
yield {"type": "token", "content": cleaned_token}
elif current_field in STREAMABLE_FIELDS:
# We're in the answer field, stream this token
yield {"type": "token", "content": value}
# Tokens from other fields (reasoning, citations, etc.) are silently consumed
else:
# Handle ModelResponseStream from litellm/DSPy
# Token text is in choices[0].delta.content or .reasoning_content
@ -3740,9 +3947,16 @@ class HeritageRAGPipeline(dspy.Module):
yield {"type": "status", "message": value.message}
continue
# Yield extracted token if we got text
# Apply same field filtering to extracted token text
if token_text:
yield {"type": "token", "content": token_text}
marker_match = DSPY_FIELD_MARKER.search(token_text)
if marker_match:
current_field = marker_match.group(1).lower()
cleaned_token = DSPY_FIELD_MARKER.sub('', token_text).strip()
if current_field in STREAMABLE_FIELDS and cleaned_token:
yield {"type": "token", "content": cleaned_token}
elif current_field in STREAMABLE_FIELDS:
yield {"type": "token", "content": token_text}
# If we get here, streaming completed
streaming_succeeded = True