fix: filter DSPy field markers from streaming output

Implements a state machine to filter streaming tokens:
- Only stream tokens from the 'answer' field to the frontend
- Skip tokens from 'reasoning', 'citations', 'confidence', 'follow_up' fields
- Remove DSPy field markers like '[[ ## answer ## ]]' from streamed content

This fixes the issue where raw DSPy signature field markers were being
displayed in the chat interface instead of clean answer text.
This commit is contained in:
kempersc 2025-12-26 03:11:44 +01:00
parent 6b9fa33767
commit fb7993e3af

View file

@@ -21,6 +21,7 @@ import asyncio
import json import json
import logging import logging
import random import random
import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime, timezone from datetime import datetime, timezone
from enum import Enum from enum import Enum
@@ -166,6 +167,7 @@ get_cacheable_sparql_docstring: Optional[Callable[[], str]] = None
get_cacheable_entity_docstring: Optional[Callable[[], str]] = None get_cacheable_entity_docstring: Optional[Callable[[], str]] = None
get_cacheable_query_intent_docstring: Optional[Callable[[], str]] = None get_cacheable_query_intent_docstring: Optional[Callable[[], str]] = None
get_cacheable_answer_docstring: Optional[Callable[[], str]] = None get_cacheable_answer_docstring: Optional[Callable[[], str]] = None
get_cacheable_person_sparql_docstring: Optional[Callable[[], str]] = None
try: try:
from .schema_loader import ( from .schema_loader import (
@@ -184,6 +186,7 @@ try:
get_cacheable_entity_docstring as _get_cacheable_entity_docstring, get_cacheable_entity_docstring as _get_cacheable_entity_docstring,
get_cacheable_query_intent_docstring as _get_cacheable_query_intent_docstring, get_cacheable_query_intent_docstring as _get_cacheable_query_intent_docstring,
get_cacheable_answer_docstring as _get_cacheable_answer_docstring, get_cacheable_answer_docstring as _get_cacheable_answer_docstring,
get_cacheable_person_sparql_docstring as _get_cacheable_person_sparql_docstring,
) )
get_heritage_schema = _get_heritage_schema get_heritage_schema = _get_heritage_schema
get_sparql_prefixes = _get_sparql_prefixes get_sparql_prefixes = _get_sparql_prefixes
@@ -200,6 +203,7 @@ try:
get_cacheable_entity_docstring = _get_cacheable_entity_docstring get_cacheable_entity_docstring = _get_cacheable_entity_docstring
get_cacheable_query_intent_docstring = _get_cacheable_query_intent_docstring get_cacheable_query_intent_docstring = _get_cacheable_query_intent_docstring
get_cacheable_answer_docstring = _get_cacheable_answer_docstring get_cacheable_answer_docstring = _get_cacheable_answer_docstring
get_cacheable_person_sparql_docstring = _get_cacheable_person_sparql_docstring
SCHEMA_LOADER_AVAILABLE = True SCHEMA_LOADER_AVAILABLE = True
except ImportError: except ImportError:
logger.info("Schema loader not available - using static signatures") logger.info("Schema loader not available - using static signatures")
@@ -473,6 +477,123 @@ class HeritageSPARQLGenerator(dspy.Signature):
explanation: str = dspy.OutputField(desc="What the query retrieves") explanation: str = dspy.OutputField(desc="What the query retrieves")
class HeritagePersonSPARQLGenerator(dspy.Signature):
    # NOTE: DSPy sends a Signature's docstring verbatim to the LLM as the task
    # prompt, so the text below is runtime behavior, not documentation.
    # Editing its wording changes which queries the model generates.
    """Generate SPARQL queries for heritage person/staff queries.
    You are an expert in SPARQL and the Heritage Person data model.
    Generate valid SPARQL queries for finding people in heritage institutions.
    REQUIRED PREFIXES:
    PREFIX schema: <http://schema.org/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX hc: <https://nde.nl/ontology/hc/>
    MAIN CLASS:
    - schema:Person - Person records
    KEY PROPERTIES:
    - schema:name - Person's full name (REQUIRED in SELECT)
    - schema:jobTitle - Job title with embedded organization (e.g., "Manager bij Nationaal Archief")
    - foaf:name - Alternative name field
    - hc:custodianName - Associated institution name (may be empty)
    CRITICAL PATTERN:
    Organization names are often embedded IN the jobTitle, not in a separate field.
    Use FILTER(CONTAINS(LCASE(?jobTitle), "organization name")) to find people at specific organizations.
    ROLE TERMS (use in FILTER patterns with OR combinations):
    Leadership (English): director, executive director, CEO, deputy director, assistant director,
    head, chief, manager, team lead, coordinator, supervisor
    Leadership (Dutch): directeur, adjunct-directeur, hoofd, manager, teamleider, teammanager,
    coördinator, leidinggevende, afdelingshoofd
    Governance (English): chair, chairman, chairperson, president, vice president, secretary,
    treasurer, board member, trustee
    Governance (Dutch): voorzitter, vice-voorzitter, secretaris, penningmeester, bestuurslid,
    bestuursvoorzitter
    Curatorial (English): curator, senior curator, chief curator, collections manager,
    registrar, conservator
    Curatorial (Dutch): conservator, collectiebeheerder, registrar
    Archival (English): archivist, senior archivist, digital archivist, records manager,
    archival manager, processing archivist
    Archival (Dutch): archivaris, archiefmedewerker, informatiespecialist
    Library (English): librarian, chief librarian, reference librarian, cataloger
    Library (Dutch): bibliothecaris, catalogiseur
    Research (English): researcher, historian, genealogist, research fellow
    Research (Dutch): onderzoeker, historicus, genealoog
    Digital (English): digital preservation specialist, digitization specialist, data manager,
    metadata specialist, developer, IT specialist
    Digital (Dutch): digitaliseringsmedewerker, datamanager, ICT-medewerker
    Education (English): educator, education officer, tour guide, docent
    Education (Dutch): educatiemedewerker, gids, rondleider
    ALWAYS EXCLUDE anonymous profiles:
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    EXAMPLE QUERY - Find managers at Nationaal Archief:
    ```sparql
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?name ?jobTitle WHERE {
    ?person a schema:Person ;
    schema:name ?name ;
    schema:jobTitle ?jobTitle .
    FILTER(CONTAINS(LCASE(?jobTitle), "nationaal archief"))
    FILTER(CONTAINS(LCASE(?jobTitle), "manager") ||
    CONTAINS(LCASE(?jobTitle), "hoofd") ||
    CONTAINS(LCASE(?jobTitle), "directeur") ||
    CONTAINS(LCASE(?jobTitle), "teamleider"))
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    }
    ORDER BY ?name
    LIMIT 50
    ```
    EXAMPLE QUERY - Find all archivists:
    ```sparql
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?name ?jobTitle WHERE {
    ?person a schema:Person ;
    schema:name ?name ;
    schema:jobTitle ?jobTitle .
    FILTER(CONTAINS(LCASE(?jobTitle), "archiv") ||
    CONTAINS(LCASE(?jobTitle), "archivist"))
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    }
    ORDER BY ?name
    LIMIT 100
    ```
    EXAMPLE QUERY - Find curators at a specific museum:
    ```sparql
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?name ?jobTitle WHERE {
    ?person a schema:Person ;
    schema:name ?name ;
    schema:jobTitle ?jobTitle .
    FILTER(CONTAINS(LCASE(?jobTitle), "rijksmuseum"))
    FILTER(CONTAINS(LCASE(?jobTitle), "curator") ||
    CONTAINS(LCASE(?jobTitle), "conservator"))
    FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    }
    ORDER BY ?name
    ```
    """

    # Inputs: populated by the pipeline from the router/classifier output.
    question: str = dspy.InputField(desc="Natural language question about people in heritage sector")
    intent: str = dspy.InputField(desc="Query intent from classifier")
    entities: list[str] = dspy.InputField(desc="Extracted entities (names, organizations, roles)", default=[])
    context: str = dspy.InputField(desc="Previous conversation context", default="")

    # Outputs: the generated query plus a human-readable summary of it.
    sparql: str = dspy.OutputField(desc="Valid SPARQL query for person data using schema:Person")
    explanation: str = dspy.OutputField(desc="What the query retrieves and which roles/organizations are targeted")
class HeritageSQLGenerator(dspy.Signature): class HeritageSQLGenerator(dspy.Signature):
"""Generate SQL queries for DuckLake heritage analytics database. """Generate SQL queries for DuckLake heritage analytics database.
@@ -809,6 +930,8 @@ CRITICAL LANGUAGE RULE:
_schema_aware_sparql_signature = None _schema_aware_sparql_signature = None
_schema_aware_entity_signature = None _schema_aware_entity_signature = None
_schema_aware_answer_signature = None _schema_aware_answer_signature = None
_schema_aware_person_sparql_signature = None
_schema_aware_person_sparql_signature = None
def get_schema_aware_sparql_signature() -> type[dspy.Signature]: def get_schema_aware_sparql_signature() -> type[dspy.Signature]:
@@ -835,6 +958,48 @@ def get_schema_aware_answer_signature() -> type[dspy.Signature]:
return _schema_aware_answer_signature return _schema_aware_answer_signature
def _create_schema_aware_person_sparql_signature() -> type[dspy.Signature]:
    """Build the Person SPARQL signature with a schema-derived docstring.

    The docstring injected into the signature comes from the LinkML schema
    loader (correct prefixes and PersonObservation slot_usage properties) and
    is padded to 1,200+ tokens so OpenAI prompt caching applies.

    Returns:
        A ``dspy.Signature`` subclass; falls back to the static
        ``HeritagePersonSPARQLGenerator`` when the loader is unavailable or
        docstring construction fails.
    """
    loader_usable = SCHEMA_LOADER_AVAILABLE and get_cacheable_person_sparql_docstring is not None
    if not loader_usable:
        logger.warning("Schema loader unavailable, using static person SPARQL signature")
        return HeritagePersonSPARQLGenerator

    try:
        # Cacheable docstring (1,500+ tokens) keeps the prompt eligible for
        # OpenAI prompt caching across calls.
        prompt_text = get_cacheable_person_sparql_docstring()

        class SchemaAwarePersonSPARQLGenerator(dspy.Signature):
            __doc__ = prompt_text

            question: str = dspy.InputField(desc="Natural language question about people in heritage sector")
            intent: str = dspy.InputField(desc="Query intent from classifier")
            entities: list[str] = dspy.InputField(desc="Extracted entities (names, organizations, roles)", default=[])
            context: str = dspy.InputField(desc="Previous conversation context", default="")
            sparql: str = dspy.OutputField(desc="Valid SPARQL query for person data using schema:Person")
            explanation: str = dspy.OutputField(desc="What the query retrieves and which roles/organizations are targeted")

        return SchemaAwarePersonSPARQLGenerator
    except Exception as e:
        logger.warning(f"Failed to create schema-aware person SPARQL signature: {e}")
        return HeritagePersonSPARQLGenerator
def get_schema_aware_person_sparql_signature() -> type[dspy.Signature]:
    """Return the schema-aware person SPARQL signature, creating it once.

    The signature class is memoized in a module-level global so the factory
    (and its schema-loader work) runs at most one time per process.
    """
    global _schema_aware_person_sparql_signature
    if _schema_aware_person_sparql_signature is not None:
        return _schema_aware_person_sparql_signature
    _schema_aware_person_sparql_signature = _create_schema_aware_person_sparql_signature()
    return _schema_aware_person_sparql_signature
def _create_schema_aware_query_intent_signature() -> type[dspy.Signature]: def _create_schema_aware_query_intent_signature() -> type[dspy.Signature]:
"""Factory to create HeritageQueryIntent with schema-derived valid values. """Factory to create HeritageQueryIntent with schema-derived valid values.
@@ -2923,12 +3088,21 @@ class HeritageRAGPipeline(dspy.Module):
self.entity_extractor = dspy.Predict(HeritageEntityExtractor) self.entity_extractor = dspy.Predict(HeritageEntityExtractor)
# SPARQL generation - use schema-aware signature if available # SPARQL generation - use schema-aware signature if available
# Institution SPARQL generator (crm:E39_Actor, hc:institutionType)
if use_schema_aware and SCHEMA_LOADER_AVAILABLE: if use_schema_aware and SCHEMA_LOADER_AVAILABLE:
self.sparql_gen = dspy.ChainOfThought(get_schema_aware_sparql_signature()) self.sparql_gen = dspy.ChainOfThought(get_schema_aware_sparql_signature())
logger.info("Using schema-aware SPARQL generator with LinkML-derived prefixes") logger.info("Using schema-aware SPARQL generator with LinkML-derived prefixes")
else: else:
self.sparql_gen = dspy.ChainOfThought(HeritageSPARQLGenerator) self.sparql_gen = dspy.ChainOfThought(HeritageSPARQLGenerator)
# Person SPARQL generator - use schema-aware signature if available
if use_schema_aware and SCHEMA_LOADER_AVAILABLE:
self.person_sparql_gen = dspy.ChainOfThought(get_schema_aware_person_sparql_signature())
logger.info("Using schema-aware Person SPARQL generator with LinkML-derived properties")
else:
self.person_sparql_gen = dspy.ChainOfThought(HeritagePersonSPARQLGenerator)
logger.info("Person SPARQL generator initialized with static predicates")
# Multi-hop retrieval (uses its own signatures internally) # Multi-hop retrieval (uses its own signatures internally)
self.multi_hop = MultiHopHeritageRetriever(max_hops=max_hops) self.multi_hop = MultiHopHeritageRetriever(max_hops=max_hops)
@@ -3089,20 +3263,32 @@ class HeritageRAGPipeline(dspy.Module):
# Step 3: Generate SPARQL if needed (use resolved question) # Step 3: Generate SPARQL if needed (use resolved question)
# Use fast_lm for SPARQL generation if available (performance optimization) # Use fast_lm for SPARQL generation if available (performance optimization)
# Select the appropriate SPARQL generator based on entity_type:
# - person queries use HeritagePersonSPARQLGenerator (schema:Person predicates)
# - institution queries use HeritageSPARQLGenerator (crm:E39_Actor predicates)
sparql = None sparql = None
if "sparql" in routing.sources: if "sparql" in routing.sources:
# Select SPARQL generator based on entity_type from router
entity_type = getattr(routing, 'entity_type', 'institution')
if entity_type == "person":
sparql_generator = self.person_sparql_gen
logger.debug(f"Using person SPARQL generator for entity_type='{entity_type}'")
else:
sparql_generator = self.sparql_gen
logger.debug(f"Using institution SPARQL generator for entity_type='{entity_type}'")
if tracker: if tracker:
with tracker.track_llm_call("gpt-4o-mini") as llm_usage: with tracker.track_llm_call("gpt-4o-mini") as llm_usage:
if self.fast_lm: if self.fast_lm:
with dspy.settings.context(lm=self.fast_lm): with dspy.settings.context(lm=self.fast_lm):
sparql_result = self.sparql_gen( sparql_result = sparql_generator(
question=resolved_question, question=resolved_question,
intent=routing.intent, intent=routing.intent,
entities=routing.entities, entities=routing.entities,
context="", context="",
) )
else: else:
sparql_result = self.sparql_gen( sparql_result = sparql_generator(
question=resolved_question, question=resolved_question,
intent=routing.intent, intent=routing.intent,
entities=routing.entities, entities=routing.entities,
@@ -3111,14 +3297,14 @@ class HeritageRAGPipeline(dspy.Module):
else: else:
if self.fast_lm: if self.fast_lm:
with dspy.settings.context(lm=self.fast_lm): with dspy.settings.context(lm=self.fast_lm):
sparql_result = self.sparql_gen( sparql_result = sparql_generator(
question=resolved_question, question=resolved_question,
intent=routing.intent, intent=routing.intent,
entities=routing.entities, entities=routing.entities,
context="", context="",
) )
else: else:
sparql_result = self.sparql_gen( sparql_result = sparql_generator(
question=resolved_question, question=resolved_question,
intent=routing.intent, intent=routing.intent,
entities=routing.entities, entities=routing.entities,
@@ -3697,6 +3883,14 @@ class HeritageRAGPipeline(dspy.Module):
retry_count = 0 retry_count = 0
max_stream_retries = 2 max_stream_retries = 2
# DSPy field marker pattern for filtering streaming output
# DSPy emits markers like [[ ## answer ## ]], [[ ## reasoning ## ]], etc.
DSPY_FIELD_MARKER = re.compile(r'\[\[\s*##\s*(\w+)\s*##\s*\]\]')
# State machine for DSPy streaming: only stream 'answer' field tokens
current_field = None # Track which DSPy output field we're in
STREAMABLE_FIELDS = {'answer'} # Only stream these fields to frontend
while not streaming_succeeded and retry_count <= max_stream_retries: while not streaming_succeeded and retry_count <= max_stream_retries:
try: try:
# Create streamified version of the answer generator # Create streamified version of the answer generator
@@ -3721,8 +3915,21 @@ class HeritageRAGPipeline(dspy.Module):
follow_up = getattr(value, 'follow_up', []) follow_up = getattr(value, 'follow_up', [])
streaming_succeeded = True streaming_succeeded = True
elif isinstance(value, str): elif isinstance(value, str):
# Streaming token # Filter DSPy field markers and only stream answer field
yield {"type": "token", "content": value} # Check if this token contains a field marker
marker_match = DSPY_FIELD_MARKER.search(value)
if marker_match:
# Update current field state
current_field = marker_match.group(1).lower()
# Remove the marker from the token
cleaned_token = DSPY_FIELD_MARKER.sub('', value).strip()
# Only yield if we're in a streamable field and have content
if current_field in STREAMABLE_FIELDS and cleaned_token:
yield {"type": "token", "content": cleaned_token}
elif current_field in STREAMABLE_FIELDS:
# We're in the answer field, stream this token
yield {"type": "token", "content": value}
# Tokens from other fields (reasoning, citations, etc.) are silently consumed
else: else:
# Handle ModelResponseStream from litellm/DSPy # Handle ModelResponseStream from litellm/DSPy
# Token text is in choices[0].delta.content or .reasoning_content # Token text is in choices[0].delta.content or .reasoning_content
@@ -3740,9 +3947,16 @@ class HeritageRAGPipeline(dspy.Module):
yield {"type": "status", "message": value.message} yield {"type": "status", "message": value.message}
continue continue
# Yield extracted token if we got text # Apply same field filtering to extracted token text
if token_text: if token_text:
yield {"type": "token", "content": token_text} marker_match = DSPY_FIELD_MARKER.search(token_text)
if marker_match:
current_field = marker_match.group(1).lower()
cleaned_token = DSPY_FIELD_MARKER.sub('', token_text).strip()
if current_field in STREAMABLE_FIELDS and cleaned_token:
yield {"type": "token", "content": cleaned_token}
elif current_field in STREAMABLE_FIELDS:
yield {"type": "token", "content": token_text}
# If we get here, streaming completed # If we get here, streaming completed
streaming_succeeded = True streaming_succeeded = True