fix: filter DSPy field markers from streaming output
Implements a state machine to filter streaming tokens: - Only stream tokens from the 'answer' field to the frontend - Skip tokens from 'reasoning', 'citations', 'confidence', 'follow_up' fields - Remove DSPy field markers like '[[ ## answer ## ]]' from streamed content This fixes the issue where raw DSPy signature field markers were being displayed in the chat interface instead of clean answer text.
This commit is contained in:
parent
6b9fa33767
commit
fb7993e3af
1 changed files with 222 additions and 8 deletions
|
|
@ -21,6 +21,7 @@ import asyncio
|
|||
import json
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
|
|
@ -166,6 +167,7 @@ get_cacheable_sparql_docstring: Optional[Callable[[], str]] = None
|
|||
get_cacheable_entity_docstring: Optional[Callable[[], str]] = None
|
||||
get_cacheable_query_intent_docstring: Optional[Callable[[], str]] = None
|
||||
get_cacheable_answer_docstring: Optional[Callable[[], str]] = None
|
||||
get_cacheable_person_sparql_docstring: Optional[Callable[[], str]] = None
|
||||
|
||||
try:
|
||||
from .schema_loader import (
|
||||
|
|
@ -184,6 +186,7 @@ try:
|
|||
get_cacheable_entity_docstring as _get_cacheable_entity_docstring,
|
||||
get_cacheable_query_intent_docstring as _get_cacheable_query_intent_docstring,
|
||||
get_cacheable_answer_docstring as _get_cacheable_answer_docstring,
|
||||
get_cacheable_person_sparql_docstring as _get_cacheable_person_sparql_docstring,
|
||||
)
|
||||
get_heritage_schema = _get_heritage_schema
|
||||
get_sparql_prefixes = _get_sparql_prefixes
|
||||
|
|
@ -200,6 +203,7 @@ try:
|
|||
get_cacheable_entity_docstring = _get_cacheable_entity_docstring
|
||||
get_cacheable_query_intent_docstring = _get_cacheable_query_intent_docstring
|
||||
get_cacheable_answer_docstring = _get_cacheable_answer_docstring
|
||||
get_cacheable_person_sparql_docstring = _get_cacheable_person_sparql_docstring
|
||||
SCHEMA_LOADER_AVAILABLE = True
|
||||
except ImportError:
|
||||
logger.info("Schema loader not available - using static signatures")
|
||||
|
|
@ -473,6 +477,123 @@ class HeritageSPARQLGenerator(dspy.Signature):
|
|||
explanation: str = dspy.OutputField(desc="What the query retrieves")
|
||||
|
||||
|
||||
class HeritagePersonSPARQLGenerator(dspy.Signature):
|
||||
"""Generate SPARQL queries for heritage person/staff queries.
|
||||
|
||||
You are an expert in SPARQL and the Heritage Person data model.
|
||||
Generate valid SPARQL queries for finding people in heritage institutions.
|
||||
|
||||
REQUIRED PREFIXES:
|
||||
PREFIX schema: <http://schema.org/>
|
||||
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||
PREFIX hc: <https://nde.nl/ontology/hc/>
|
||||
|
||||
MAIN CLASS:
|
||||
- schema:Person - Person records
|
||||
|
||||
KEY PROPERTIES:
|
||||
- schema:name - Person's full name (REQUIRED in SELECT)
|
||||
- schema:jobTitle - Job title with embedded organization (e.g., "Manager bij Nationaal Archief")
|
||||
- foaf:name - Alternative name field
|
||||
- hc:custodianName - Associated institution name (may be empty)
|
||||
|
||||
CRITICAL PATTERN:
|
||||
Organization names are often embedded IN the jobTitle, not in a separate field.
|
||||
Use FILTER(CONTAINS(LCASE(?jobTitle), "organization name")) to find people at specific organizations.
|
||||
|
||||
ROLE TERMS (use in FILTER patterns with OR combinations):
|
||||
|
||||
Leadership (English): director, executive director, CEO, deputy director, assistant director,
|
||||
head, chief, manager, team lead, coordinator, supervisor
|
||||
Leadership (Dutch): directeur, adjunct-directeur, hoofd, manager, teamleider, teammanager,
|
||||
coördinator, leidinggevende, afdelingshoofd
|
||||
|
||||
Governance (English): chair, chairman, chairperson, president, vice president, secretary,
|
||||
treasurer, board member, trustee
|
||||
Governance (Dutch): voorzitter, vice-voorzitter, secretaris, penningmeester, bestuurslid,
|
||||
bestuursvoorzitter
|
||||
|
||||
Curatorial (English): curator, senior curator, chief curator, collections manager,
|
||||
registrar, conservator
|
||||
Curatorial (Dutch): conservator, collectiebeheerder, registrar
|
||||
|
||||
Archival (English): archivist, senior archivist, digital archivist, records manager,
|
||||
archival manager, processing archivist
|
||||
Archival (Dutch): archivaris, archiefmedewerker, informatiespecialist
|
||||
|
||||
Library (English): librarian, chief librarian, reference librarian, cataloger
|
||||
Library (Dutch): bibliothecaris, catalogiseur
|
||||
|
||||
Research (English): researcher, historian, genealogist, research fellow
|
||||
Research (Dutch): onderzoeker, historicus, genealoog
|
||||
|
||||
Digital (English): digital preservation specialist, digitization specialist, data manager,
|
||||
metadata specialist, developer, IT specialist
|
||||
Digital (Dutch): digitaliseringsmedewerker, datamanager, ICT-medewerker
|
||||
|
||||
Education (English): educator, education officer, tour guide, docent
|
||||
Education (Dutch): educatiemedewerker, gids, rondleider
|
||||
|
||||
ALWAYS EXCLUDE anonymous profiles:
|
||||
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
|
||||
|
||||
EXAMPLE QUERY - Find managers at Nationaal Archief:
|
||||
```sparql
|
||||
PREFIX schema: <http://schema.org/>
|
||||
SELECT DISTINCT ?name ?jobTitle WHERE {
|
||||
?person a schema:Person ;
|
||||
schema:name ?name ;
|
||||
schema:jobTitle ?jobTitle .
|
||||
FILTER(CONTAINS(LCASE(?jobTitle), "nationaal archief"))
|
||||
FILTER(CONTAINS(LCASE(?jobTitle), "manager") ||
|
||||
CONTAINS(LCASE(?jobTitle), "hoofd") ||
|
||||
CONTAINS(LCASE(?jobTitle), "directeur") ||
|
||||
CONTAINS(LCASE(?jobTitle), "teamleider"))
|
||||
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
|
||||
}
|
||||
ORDER BY ?name
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
EXAMPLE QUERY - Find all archivists:
|
||||
```sparql
|
||||
PREFIX schema: <http://schema.org/>
|
||||
SELECT DISTINCT ?name ?jobTitle WHERE {
|
||||
?person a schema:Person ;
|
||||
schema:name ?name ;
|
||||
schema:jobTitle ?jobTitle .
|
||||
FILTER(CONTAINS(LCASE(?jobTitle), "archiv") ||
|
||||
CONTAINS(LCASE(?jobTitle), "archivist"))
|
||||
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
|
||||
}
|
||||
ORDER BY ?name
|
||||
LIMIT 100
|
||||
```
|
||||
|
||||
EXAMPLE QUERY - Find curators at a specific museum:
|
||||
```sparql
|
||||
PREFIX schema: <http://schema.org/>
|
||||
SELECT DISTINCT ?name ?jobTitle WHERE {
|
||||
?person a schema:Person ;
|
||||
schema:name ?name ;
|
||||
schema:jobTitle ?jobTitle .
|
||||
FILTER(CONTAINS(LCASE(?jobTitle), "rijksmuseum"))
|
||||
FILTER(CONTAINS(LCASE(?jobTitle), "curator") ||
|
||||
CONTAINS(LCASE(?jobTitle), "conservator"))
|
||||
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
|
||||
}
|
||||
ORDER BY ?name
|
||||
```
|
||||
"""
|
||||
|
||||
question: str = dspy.InputField(desc="Natural language question about people in heritage sector")
|
||||
intent: str = dspy.InputField(desc="Query intent from classifier")
|
||||
entities: list[str] = dspy.InputField(desc="Extracted entities (names, organizations, roles)", default=[])
|
||||
context: str = dspy.InputField(desc="Previous conversation context", default="")
|
||||
|
||||
sparql: str = dspy.OutputField(desc="Valid SPARQL query for person data using schema:Person")
|
||||
explanation: str = dspy.OutputField(desc="What the query retrieves and which roles/organizations are targeted")
|
||||
|
||||
|
||||
class HeritageSQLGenerator(dspy.Signature):
|
||||
"""Generate SQL queries for DuckLake heritage analytics database.
|
||||
|
|
@ -809,6 +930,8 @@ CRITICAL LANGUAGE RULE:
|
|||
_schema_aware_sparql_signature = None
|
||||
_schema_aware_entity_signature = None
|
||||
_schema_aware_answer_signature = None
|
||||
_schema_aware_person_sparql_signature = None
|
||||
_schema_aware_person_sparql_signature = None
|
||||
|
||||
|
||||
def get_schema_aware_sparql_signature() -> type[dspy.Signature]:
|
||||
|
|
@ -835,6 +958,48 @@ def get_schema_aware_answer_signature() -> type[dspy.Signature]:
|
|||
return _schema_aware_answer_signature
|
||||
|
||||
|
||||
def _create_schema_aware_person_sparql_signature() -> type[dspy.Signature]:
|
||||
"""Factory to create Person SPARQL signature with schema-derived docstring.
|
||||
|
||||
Uses LinkML schema to inject correct prefixes and properties from
|
||||
PersonObservation slot_usage into the signature docstring.
|
||||
|
||||
OpenAI Prompt Caching: Uses get_cacheable_person_sparql_docstring() which
|
||||
prepends the full ontology context (1,200+ tokens) to ensure cache eligibility.
|
||||
"""
|
||||
if not SCHEMA_LOADER_AVAILABLE or get_cacheable_person_sparql_docstring is None:
|
||||
logger.warning("Schema loader unavailable, using static person SPARQL signature")
|
||||
return HeritagePersonSPARQLGenerator
|
||||
|
||||
try:
|
||||
# Use cacheable docstring (1,500+ tokens) for OpenAI prompt caching
|
||||
docstring = get_cacheable_person_sparql_docstring()
|
||||
|
||||
class SchemaAwarePersonSPARQLGenerator(dspy.Signature):
|
||||
__doc__ = docstring
|
||||
|
||||
question: str = dspy.InputField(desc="Natural language question about people in heritage sector")
|
||||
intent: str = dspy.InputField(desc="Query intent from classifier")
|
||||
entities: list[str] = dspy.InputField(desc="Extracted entities (names, organizations, roles)", default=[])
|
||||
context: str = dspy.InputField(desc="Previous conversation context", default="")
|
||||
|
||||
sparql: str = dspy.OutputField(desc="Valid SPARQL query for person data using schema:Person")
|
||||
explanation: str = dspy.OutputField(desc="What the query retrieves and which roles/organizations are targeted")
|
||||
|
||||
return SchemaAwarePersonSPARQLGenerator
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to create schema-aware person SPARQL signature: {e}")
|
||||
return HeritagePersonSPARQLGenerator
|
||||
|
||||
|
||||
def get_schema_aware_person_sparql_signature() -> type[dspy.Signature]:
|
||||
"""Get cached schema-aware person SPARQL signature."""
|
||||
global _schema_aware_person_sparql_signature
|
||||
if _schema_aware_person_sparql_signature is None:
|
||||
_schema_aware_person_sparql_signature = _create_schema_aware_person_sparql_signature()
|
||||
return _schema_aware_person_sparql_signature
|
||||
|
||||
|
||||
def _create_schema_aware_query_intent_signature() -> type[dspy.Signature]:
|
||||
"""Factory to create HeritageQueryIntent with schema-derived valid values.
|
||||
|
||||
|
|
@ -2923,12 +3088,21 @@ class HeritageRAGPipeline(dspy.Module):
|
|||
self.entity_extractor = dspy.Predict(HeritageEntityExtractor)
|
||||
|
||||
# SPARQL generation - use schema-aware signature if available
|
||||
# Institution SPARQL generator (crm:E39_Actor, hc:institutionType)
|
||||
if use_schema_aware and SCHEMA_LOADER_AVAILABLE:
|
||||
self.sparql_gen = dspy.ChainOfThought(get_schema_aware_sparql_signature())
|
||||
logger.info("Using schema-aware SPARQL generator with LinkML-derived prefixes")
|
||||
else:
|
||||
self.sparql_gen = dspy.ChainOfThought(HeritageSPARQLGenerator)
|
||||
|
||||
# Person SPARQL generator - use schema-aware signature if available
|
||||
if use_schema_aware and SCHEMA_LOADER_AVAILABLE:
|
||||
self.person_sparql_gen = dspy.ChainOfThought(get_schema_aware_person_sparql_signature())
|
||||
logger.info("Using schema-aware Person SPARQL generator with LinkML-derived properties")
|
||||
else:
|
||||
self.person_sparql_gen = dspy.ChainOfThought(HeritagePersonSPARQLGenerator)
|
||||
logger.info("Person SPARQL generator initialized with static predicates")
|
||||
|
||||
# Multi-hop retrieval (uses its own signatures internally)
|
||||
self.multi_hop = MultiHopHeritageRetriever(max_hops=max_hops)
|
||||
|
||||
|
|
@ -3089,20 +3263,32 @@ class HeritageRAGPipeline(dspy.Module):
|
|||
|
||||
# Step 3: Generate SPARQL if needed (use resolved question)
|
||||
# Use fast_lm for SPARQL generation if available (performance optimization)
|
||||
# Select the appropriate SPARQL generator based on entity_type:
|
||||
# - person queries use HeritagePersonSPARQLGenerator (schema:Person predicates)
|
||||
# - institution queries use HeritageSPARQLGenerator (crm:E39_Actor predicates)
|
||||
sparql = None
|
||||
if "sparql" in routing.sources:
|
||||
# Select SPARQL generator based on entity_type from router
|
||||
entity_type = getattr(routing, 'entity_type', 'institution')
|
||||
if entity_type == "person":
|
||||
sparql_generator = self.person_sparql_gen
|
||||
logger.debug(f"Using person SPARQL generator for entity_type='{entity_type}'")
|
||||
else:
|
||||
sparql_generator = self.sparql_gen
|
||||
logger.debug(f"Using institution SPARQL generator for entity_type='{entity_type}'")
|
||||
|
||||
if tracker:
|
||||
with tracker.track_llm_call("gpt-4o-mini") as llm_usage:
|
||||
if self.fast_lm:
|
||||
with dspy.settings.context(lm=self.fast_lm):
|
||||
sparql_result = self.sparql_gen(
|
||||
sparql_result = sparql_generator(
|
||||
question=resolved_question,
|
||||
intent=routing.intent,
|
||||
entities=routing.entities,
|
||||
context="",
|
||||
)
|
||||
else:
|
||||
sparql_result = self.sparql_gen(
|
||||
sparql_result = sparql_generator(
|
||||
question=resolved_question,
|
||||
intent=routing.intent,
|
||||
entities=routing.entities,
|
||||
|
|
@ -3111,14 +3297,14 @@ class HeritageRAGPipeline(dspy.Module):
|
|||
else:
|
||||
if self.fast_lm:
|
||||
with dspy.settings.context(lm=self.fast_lm):
|
||||
sparql_result = self.sparql_gen(
|
||||
sparql_result = sparql_generator(
|
||||
question=resolved_question,
|
||||
intent=routing.intent,
|
||||
entities=routing.entities,
|
||||
context="",
|
||||
)
|
||||
else:
|
||||
sparql_result = self.sparql_gen(
|
||||
sparql_result = sparql_generator(
|
||||
question=resolved_question,
|
||||
intent=routing.intent,
|
||||
entities=routing.entities,
|
||||
|
|
@ -3697,6 +3883,14 @@ class HeritageRAGPipeline(dspy.Module):
|
|||
retry_count = 0
|
||||
max_stream_retries = 2
|
||||
|
||||
# DSPy field marker pattern for filtering streaming output
|
||||
# DSPy emits markers like [[ ## answer ## ]], [[ ## reasoning ## ]], etc.
|
||||
DSPY_FIELD_MARKER = re.compile(r'\[\[\s*##\s*(\w+)\s*##\s*\]\]')
|
||||
|
||||
# State machine for DSPy streaming: only stream 'answer' field tokens
|
||||
current_field = None # Track which DSPy output field we're in
|
||||
STREAMABLE_FIELDS = {'answer'} # Only stream these fields to frontend
|
||||
|
||||
while not streaming_succeeded and retry_count <= max_stream_retries:
|
||||
try:
|
||||
# Create streamified version of the answer generator
|
||||
|
|
@ -3721,8 +3915,21 @@ class HeritageRAGPipeline(dspy.Module):
|
|||
follow_up = getattr(value, 'follow_up', [])
|
||||
streaming_succeeded = True
|
||||
elif isinstance(value, str):
|
||||
# Streaming token
|
||||
yield {"type": "token", "content": value}
|
||||
# Filter DSPy field markers and only stream answer field
|
||||
# Check if this token contains a field marker
|
||||
marker_match = DSPY_FIELD_MARKER.search(value)
|
||||
if marker_match:
|
||||
# Update current field state
|
||||
current_field = marker_match.group(1).lower()
|
||||
# Remove the marker from the token
|
||||
cleaned_token = DSPY_FIELD_MARKER.sub('', value).strip()
|
||||
# Only yield if we're in a streamable field and have content
|
||||
if current_field in STREAMABLE_FIELDS and cleaned_token:
|
||||
yield {"type": "token", "content": cleaned_token}
|
||||
elif current_field in STREAMABLE_FIELDS:
|
||||
# We're in the answer field, stream this token
|
||||
yield {"type": "token", "content": value}
|
||||
# Tokens from other fields (reasoning, citations, etc.) are silently consumed
|
||||
else:
|
||||
# Handle ModelResponseStream from litellm/DSPy
|
||||
# Token text is in choices[0].delta.content or .reasoning_content
|
||||
|
|
@ -3740,9 +3947,16 @@ class HeritageRAGPipeline(dspy.Module):
|
|||
yield {"type": "status", "message": value.message}
|
||||
continue
|
||||
|
||||
# Yield extracted token if we got text
|
||||
# Apply same field filtering to extracted token text
|
||||
if token_text:
|
||||
yield {"type": "token", "content": token_text}
|
||||
marker_match = DSPY_FIELD_MARKER.search(token_text)
|
||||
if marker_match:
|
||||
current_field = marker_match.group(1).lower()
|
||||
cleaned_token = DSPY_FIELD_MARKER.sub('', token_text).strip()
|
||||
if current_field in STREAMABLE_FIELDS and cleaned_token:
|
||||
yield {"type": "token", "content": cleaned_token}
|
||||
elif current_field in STREAMABLE_FIELDS:
|
||||
yield {"type": "token", "content": token_text}
|
||||
|
||||
# If we get here, streaming completed
|
||||
streaming_succeeded = True
|
||||
|
|
|
|||
Loading…
Reference in a new issue