"""
|
|
LinkML Schema Loader for DSPy Heritage RAG
|
|
|
|
Loads and parses LinkML schema files to provide schema-aware context
|
|
for DSPy signatures and RAG pipeline components.
|
|
|
|
The loader extracts:
|
|
- Class definitions with descriptions and ontology mappings
|
|
- Slot definitions with URIs and ranges
|
|
- Enum values for controlled vocabularies
|
|
- Prefix mappings for SPARQL generation
|
|
|
|
This enables:
|
|
1. Dynamic schema context injection into DSPy signatures
|
|
2. Schema-validated entity extraction
|
|
3. Ontology-aligned SPARQL generation
|
|
4. Rich answer synthesis with correct ontology terms
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import yaml
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Default schema directory
|
|
SCHEMA_BASE_DIR = Path(__file__).parent.parent.parent / "schemas" / "20251121" / "linkml"
|
|
|
|
|
|
@dataclass
class OntologyPrefix:
    """An ontology prefix mapping (short prefix -> namespace URI)."""

    prefix: str  # short prefix used in CURIEs and SPARQL (e.g. "crm")
    uri: str  # full namespace URI the prefix expands to
    description: Optional[str] = None  # optional human-readable note
|
|
|
|
|
|
@dataclass
class SlotDefinition:
    """A slot (property) definition from LinkML schema."""

    name: str  # slot name as declared in the YAML
    slot_uri: Optional[str] = None  # canonical CURIE/URI for the property
    range: Optional[str] = None  # LinkML range (target class or datatype name)
    description: Optional[str] = None
    required: bool = False  # whether the slot is mandatory on its class
    multivalued: bool = False  # whether the slot accepts multiple values
    exact_mappings: list[str] = field(default_factory=list)  # exact-match external terms
    close_mappings: list[str] = field(default_factory=list)  # close-match external terms
    examples: list[dict] = field(default_factory=list)  # example values from the schema
|
|
|
|
|
|
@dataclass
class EnumValue:
    """A permissible value in an enum."""

    name: str  # enum member name (e.g. "MUSEUM")
    description: Optional[str] = None
    meaning: Optional[str] = None  # Wikidata mapping (e.g. a "wikidata:Q..." CURIE)
    comments: list[str] = field(default_factory=list)  # editorial comments from the schema
|
|
|
|
|
|
@dataclass
class EnumDefinition:
    """An enum definition from LinkML schema."""

    name: str  # enum name (e.g. "CustodianPrimaryTypeEnum")
    description: Optional[str] = None
    values: list[EnumValue] = field(default_factory=list)  # permissible values
|
|
|
|
|
|
@dataclass
class ClassDefinition:
    """A class definition from LinkML schema."""

    name: str  # class name as declared in the YAML
    class_uri: Optional[str] = None  # canonical CURIE/URI for the class
    description: Optional[str] = None
    is_a: Optional[str] = None  # parent class name, if any
    slots: list[str] = field(default_factory=list)  # names of slots attached to the class
    exact_mappings: list[str] = field(default_factory=list)  # exact-match external terms
    close_mappings: list[str] = field(default_factory=list)  # close-match external terms
    narrow_mappings: list[str] = field(default_factory=list)  # narrower external terms
|
|
|
|
|
|
@dataclass
class StaffRoleDefinition:
    """A staff role class definition from LinkML schema.

    Represents an official job title/appellation in heritage institutions,
    categorized by role family (CURATORIAL, ARCHIVAL, DIGITAL, etc.).
    """

    name: str  # role class name (e.g. "Curator")
    category: str  # role family: CURATORIAL, ARCHIVAL, DIGITAL, etc.
    description: Optional[str] = None
    class_uri: Optional[str] = None  # canonical CURIE/URI for the role class
    common_variants: list[str] = field(default_factory=list)  # alternative spellings/titles
    wikidata_mapping: Optional[str] = None  # e.g., wikidata:Q674426
|
|
|
|
|
|
@dataclass
class HeritageSchema:
    """Complete parsed heritage custodian schema.

    Aggregates everything extracted from the LinkML schema files (prefixes,
    classes, slots, enums, custodian types, staff roles) and provides
    helpers that render schema content as SPARQL prefix declarations and
    DSPy prompt context strings.
    """

    # Core schema metadata
    name: str
    version: str
    description: str

    # Ontology prefixes
    prefixes: dict[str, OntologyPrefix] = field(default_factory=dict)

    # Classes
    classes: dict[str, ClassDefinition] = field(default_factory=dict)

    # Slots (properties)
    slots: dict[str, SlotDefinition] = field(default_factory=dict)

    # Enums
    enums: dict[str, EnumDefinition] = field(default_factory=dict)

    # Custodian types (from CustodianPrimaryTypeEnum)
    custodian_types: list[EnumValue] = field(default_factory=list)

    # Staff roles organized by category (from StaffRoles.yaml)
    staff_roles: dict[str, list[StaffRoleDefinition]] = field(default_factory=dict)

    # Role categories (from RoleCategoryEnum in StaffRole.yaml)
    role_categories: list[EnumValue] = field(default_factory=list)

    def get_sparql_prefixes(self) -> str:
        """Generate SPARQL prefix declarations from schema prefixes."""
        return "\n".join(
            f"PREFIX {prefix}: <{info.uri}>"
            for prefix, info in self.prefixes.items()
        )

    def get_custodian_type_names(self) -> list[str]:
        """Get list of custodian type enum values."""
        return [v.name for v in self.custodian_types]

    def get_staff_role_names(self) -> list[str]:
        """Get flat, sorted list of all staff role class names."""
        return sorted(
            role.name
            for category_roles in self.staff_roles.values()
            for role in category_roles
        )

    def get_staff_role_category_names(self) -> list[str]:
        """Get list of staff role category names."""
        return [v.name for v in self.role_categories]

    def get_staff_roles_by_category(self) -> dict[str, list[str]]:
        """Get staff role names organized by category."""
        return {
            category: [r.name for r in roles]
            for category, roles in self.staff_roles.items()
        }

    def get_class_description(self, class_name: str) -> Optional[str]:
        """Get description for a class, or None if the class is unknown."""
        cls = self.classes.get(class_name)
        return cls.description if cls else None

    def get_slot_uri(self, slot_name: str) -> Optional[str]:
        """Get the slot URI for a slot name, or None if the slot is unknown."""
        slot = self.slots.get(slot_name)
        return slot.slot_uri if slot else None

    def format_entity_types_for_prompt(self) -> str:
        """Format custodian types for DSPy prompt injection."""
        lines = ["Heritage Custodian Types (GLAMORCUBESFIXPHDNT taxonomy):"]
        for ct in self.custodian_types:
            # Keep only the text before any parenthetical to stay concise
            desc = ct.description.split("(")[0].strip() if ct.description else ct.name
            lines.append(f" - {ct.name}: {desc}")
        return "\n".join(lines)

    def format_key_properties_for_prompt(self) -> str:
        """Format key properties for DSPy prompt injection."""
        key_slots = [
            "hc_id", "preferred_label", "custodian_type", "legal_status",
            "place_designation", "has_collection", "identifiers",
            "organizational_structure", "encompassing_body"
        ]
        lines = ["Key Properties:"]
        for slot_name in key_slots:
            slot = self.slots.get(slot_name)
            if slot:
                uri = slot.slot_uri or f"hc:{slot_name}"
                # First line of the description only, truncated for brevity
                desc = (slot.description or "").split("\n")[0][:80]
                lines.append(f" - {uri}: {desc}")
        return "\n".join(lines)

    def format_staff_role_categories_for_prompt(self) -> str:
        """Format staff role categories for DSPy prompt injection."""
        # Derive the count from the data instead of hard-coding "13", so the
        # prompt stays accurate if the schema gains or loses categories.
        lines = [f"Staff Role Categories ({len(self.role_categories)} categories):"]
        for rc in self.role_categories:
            desc = rc.description[:60] if rc.description else rc.name
            lines.append(f" - {rc.name}: {desc}")
        return "\n".join(lines)

    def format_staff_roles_for_prompt(self, max_per_category: int = 5) -> str:
        """Format staff roles for DSPy prompt injection.

        Args:
            max_per_category: Maximum roles to show per category (for brevity)
        """
        lines = ["Staff Roles by Category:"]
        for category, roles in sorted(self.staff_roles.items()):
            role_names = [r.name for r in roles[:max_per_category]]
            if len(roles) > max_per_category:
                role_names.append(f"... +{len(roles) - max_per_category} more")
            lines.append(f" - {category}: {', '.join(role_names)}")
        return "\n".join(lines)

    def format_ontology_context_for_prompt(self) -> str:
        """Format complete ontology context for DSPy prompts."""
        sections = [
            "=" * 60,
            "HERITAGE CUSTODIAN ONTOLOGY CONTEXT",
            "=" * 60,
            "",
            "Hub Architecture:",
            " - Custodian (crm:E39_Actor): Central hub entity",
            " - CustodianObservation: Evidence from sources",
            " - CustodianName: Standardized emic names",
            " - CustodianLegalStatus: Formal legal entity",
            " - CustodianPlace: Geographic location",
            " - CustodianCollection: Heritage collections",
            "",
            self.format_entity_types_for_prompt(),
            "",
            self.format_key_properties_for_prompt(),
            "",
        ]

        # Add staff roles if loaded
        if self.role_categories:
            sections.extend([
                self.format_staff_role_categories_for_prompt(),
                "",
                self.format_staff_roles_for_prompt(),
                "",
            ])

        sections.append("Key Ontology Prefixes:")
        for prefix, info in list(self.prefixes.items())[:12]:  # Top 12 prefixes
            sections.append(f" PREFIX {prefix}: <{info.uri}>")

        sections.extend([
            "",
            "=" * 60,
        ])

        return "\n".join(sections)
|
|
|
|
|
|
class SchemaLoader:
    """
    Loads and parses LinkML schema files for the Heritage Custodian Ontology.

    Usage:
        loader = SchemaLoader()
        schema = loader.load()

        # Get SPARQL prefixes
        prefixes = schema.get_sparql_prefixes()

        # Get custodian types for entity extraction
        types = schema.get_custodian_type_names()

        # Get prompt context
        context = schema.format_ontology_context_for_prompt()
    """

    def __init__(self, schema_dir: Optional[Path] = None):
        """Initialize schema loader.

        Args:
            schema_dir: Path to LinkML schema directory. Defaults to
                schemas/20251121/linkml/
        """
        self.schema_dir = schema_dir or SCHEMA_BASE_DIR
        # Parsed schema cache; populated on first successful load()
        self._schema: Optional[HeritageSchema] = None

    def load(self, force_reload: bool = False) -> HeritageSchema:
        """Load and parse the complete schema.

        Args:
            force_reload: Force reload even if cached

        Returns:
            Parsed HeritageSchema object

        Raises:
            FileNotFoundError: If the main schema file is missing.
        """
        if self._schema is not None and not force_reload:
            return self._schema

        logger.info("Loading LinkML schema from %s", self.schema_dir)

        # Load main schema file (only this file is mandatory; the module
        # loaders below all degrade gracefully when files are missing)
        main_schema_path = self.schema_dir / "01_custodian_name_modular.yaml"
        if not main_schema_path.exists():
            raise FileNotFoundError(f"Main schema not found: {main_schema_path}")

        with open(main_schema_path, "r", encoding="utf-8") as f:
            main_schema = yaml.safe_load(f)

        # Initialize schema object with top-level metadata
        schema = HeritageSchema(
            name=main_schema.get("name", "heritage_custodian_ontology"),
            version=main_schema.get("version", "0.9.9"),
            description=main_schema.get("description", ""),
        )

        # Load prefixes from Custodian class (has the most complete set)
        schema.prefixes = self._load_prefixes()

        # Load custodian types enum
        schema.custodian_types = self._load_custodian_types()
        schema.enums["CustodianPrimaryTypeEnum"] = EnumDefinition(
            name="CustodianPrimaryTypeEnum",
            description="GLAMORCUBESFIXPHDNT Primary Type Categories",
            values=schema.custodian_types,
        )

        # Load key classes
        schema.classes = self._load_key_classes()

        # Load key slots
        schema.slots = self._load_key_slots()

        # Load staff role categories (RoleCategoryEnum)
        schema.role_categories = self._load_role_categories()
        schema.enums["RoleCategoryEnum"] = EnumDefinition(
            name="RoleCategoryEnum",
            description="Staff Role Categories",
            values=schema.role_categories,
        )

        # Load staff roles organized by category
        schema.staff_roles = self._load_staff_roles()

        self._schema = schema
        logger.info(
            "Loaded schema with %d classes, %d slots, %d custodian types, "
            "%d role categories, %d staff roles",
            len(schema.classes),
            len(schema.slots),
            len(schema.custodian_types),
            len(schema.role_categories),
            sum(len(r) for r in schema.staff_roles.values()),
        )

        return schema

    def _load_prefixes(self) -> dict[str, OntologyPrefix]:
        """Load ontology prefixes from Custodian class file.

        Starts from a built-in default set and overlays any prefixes
        declared in modules/classes/Custodian.yaml when available.
        """
        prefixes = {}

        # Default prefixes from main schema and Custodian class
        default_prefixes = {
            "linkml": "https://w3id.org/linkml/",
            "hc": "https://nde.nl/ontology/hc/",
            "crm": "http://www.cidoc-crm.org/cidoc-crm/",
            "prov": "http://www.w3.org/ns/prov#",
            "schema": "http://schema.org/",
            "cpov": "http://data.europa.eu/m8g/",
            "rico": "https://www.ica.org/standards/RiC/ontology#",
            "foaf": "http://xmlns.com/foaf/0.1/",
            "tooi": "https://identifier.overheid.nl/tooi/def/ont/",
            "org": "http://www.w3.org/ns/org#",
            "skos": "http://www.w3.org/2004/02/skos/core#",
            "dcterms": "http://purl.org/dc/terms/",
            "dct": "http://purl.org/dc/terms/",
            "wdt": "http://www.wikidata.org/prop/direct/",
            "wikidata": "http://www.wikidata.org/entity/",
            "geo": "http://www.opengis.net/ont/geosparql#",
            "geof": "http://www.opengis.net/def/function/geosparql/",
            "ghcid": "https://w3id.org/heritage/custodian/",
            "sosa": "http://www.w3.org/ns/sosa/",
        }

        # Try to load from Custodian.yaml for additional prefixes
        custodian_path = self.schema_dir / "modules" / "classes" / "Custodian.yaml"
        if custodian_path.exists():
            try:
                with open(custodian_path, "r", encoding="utf-8") as f:
                    custodian_yaml = yaml.safe_load(f)
                if custodian_yaml and "prefixes" in custodian_yaml:
                    default_prefixes.update(custodian_yaml["prefixes"])
            except Exception as e:
                logger.warning(f"Could not load prefixes from Custodian.yaml: {e}")

        for prefix, uri in default_prefixes.items():
            prefixes[prefix] = OntologyPrefix(prefix=prefix, uri=uri)

        return prefixes

    def _load_custodian_types(self) -> list[EnumValue]:
        """Load CustodianPrimaryTypeEnum values.

        Returns an empty list (with a warning) when the enum file is
        missing or malformed, mirroring the other optional loaders so one
        bad module file cannot abort the whole schema load.
        """
        enum_path = self.schema_dir / "modules" / "enums" / "CustodianPrimaryTypeEnum.yaml"
        if not enum_path.exists():
            logger.warning(f"CustodianPrimaryTypeEnum not found: {enum_path}")
            return []

        try:
            with open(enum_path, "r", encoding="utf-8") as f:
                enum_yaml = yaml.safe_load(f)

            values = []
            enum_def = enum_yaml.get("enums", {}).get("CustodianPrimaryTypeEnum", {})
            permissible_values = enum_def.get("permissible_values", {})

            for name, info in permissible_values.items():
                # A bare enum key (no attributes) parses as None in YAML
                info = info or {}
                values.append(EnumValue(
                    name=name,
                    description=info.get("description"),
                    meaning=info.get("meaning"),
                    comments=info.get("comments", []),
                ))

            return values

        except Exception as e:
            logger.warning(f"Could not load custodian types: {e}")
            return []

    def _load_key_classes(self) -> dict[str, ClassDefinition]:
        """Load key class definitions.

        Missing files are skipped silently; unparseable files log a warning.
        """
        classes = {}

        # Key classes to load
        key_class_files = [
            "Custodian.yaml",
            "CustodianName.yaml",
            "CustodianObservation.yaml",
            "CustodianLegalStatus.yaml",
            "CustodianPlace.yaml",
            "CustodianCollection.yaml",
            "Identifier.yaml",
            "TimeSpan.yaml",
            "OrganizationalStructure.yaml",
            "EncompassingBody.yaml",
        ]

        classes_dir = self.schema_dir / "modules" / "classes"

        for filename in key_class_files:
            filepath = classes_dir / filename
            if not filepath.exists():
                continue

            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    class_yaml = yaml.safe_load(f)

                # Find class definition in the YAML
                class_defs = class_yaml.get("classes", {})
                for class_name, class_info in class_defs.items():
                    classes[class_name] = ClassDefinition(
                        name=class_name,
                        class_uri=class_info.get("class_uri"),
                        description=class_info.get("description"),
                        is_a=class_info.get("is_a"),
                        slots=class_info.get("slots", []),
                        exact_mappings=class_info.get("exact_mappings", []),
                        close_mappings=class_info.get("close_mappings", []),
                        narrow_mappings=class_info.get("narrow_mappings", []),
                    )
            except Exception as e:
                logger.warning(f"Could not load class from {filepath}: {e}")

        return classes

    def _load_key_slots(self) -> dict[str, SlotDefinition]:
        """Load key slot definitions.

        Missing files are skipped silently; unparseable files log a warning.
        """
        slots = {}

        # Key slots to load
        key_slot_files = [
            "hc_id.yaml",
            "preferred_label.yaml",
            "custodian_type.yaml",
            "legal_status.yaml",
            "place_designation.yaml",
            "has_collection.yaml",
            "identifiers.yaml",
            "organizational_structure.yaml",
            "encompassing_body.yaml",
            "identifier_scheme.yaml",
            "identifier_value.yaml",
            "observed_name.yaml",
            "emic_name.yaml",
            "valid_from.yaml",
            "valid_to.yaml",
        ]

        slots_dir = self.schema_dir / "modules" / "slots"

        for filename in key_slot_files:
            filepath = slots_dir / filename
            if not filepath.exists():
                continue

            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    slot_yaml = yaml.safe_load(f)

                # Find slot definition in the YAML
                slot_defs = slot_yaml.get("slots", {})
                for slot_name, slot_info in slot_defs.items():
                    slots[slot_name] = SlotDefinition(
                        name=slot_name,
                        slot_uri=slot_info.get("slot_uri"),
                        range=slot_info.get("range"),
                        description=slot_info.get("description"),
                        required=slot_info.get("required", False),
                        multivalued=slot_info.get("multivalued", False),
                        exact_mappings=slot_info.get("exact_mappings", []),
                        close_mappings=slot_info.get("close_mappings", []),
                        examples=slot_info.get("examples", []),
                    )
            except Exception as e:
                logger.warning(f"Could not load slot from {filepath}: {e}")

        return slots

    def _load_role_categories(self) -> list[EnumValue]:
        """Load RoleCategoryEnum values from StaffRole.yaml."""
        enum_path = self.schema_dir / "modules" / "classes" / "StaffRole.yaml"
        if not enum_path.exists():
            logger.warning(f"StaffRole.yaml not found: {enum_path}")
            return []

        try:
            with open(enum_path, "r", encoding="utf-8") as f:
                staff_role_yaml = yaml.safe_load(f)

            values = []
            enum_def = staff_role_yaml.get("enums", {}).get("RoleCategoryEnum", {})
            permissible_values = enum_def.get("permissible_values", {})

            for name, info in permissible_values.items():
                values.append(EnumValue(
                    name=name,
                    description=info.get("description") if info else None,
                ))

            logger.debug("Loaded %d role categories", len(values))
            return values

        except Exception as e:
            logger.warning(f"Could not load role categories: {e}")
            return []

    def _load_staff_roles(self) -> dict[str, list[StaffRoleDefinition]]:
        """Load staff role classes organized by category from StaffRoles.yaml.

        Parses the slot_usage.role_category.ifabsent pattern to determine category.
        Example: ifabsent: "string(CURATORIAL)" -> category = "CURATORIAL"

        Returns:
            Dictionary mapping category name to list of StaffRoleDefinition
        """
        import re

        roles_path = self.schema_dir / "modules" / "classes" / "StaffRoles.yaml"
        if not roles_path.exists():
            logger.warning(f"StaffRoles.yaml not found: {roles_path}")
            return {}

        try:
            with open(roles_path, "r", encoding="utf-8") as f:
                roles_yaml = yaml.safe_load(f)

            roles_by_category: dict[str, list[StaffRoleDefinition]] = {}
            class_defs = roles_yaml.get("classes", {})

            # Regex to extract category from ifabsent: "string(CURATORIAL)"
            ifabsent_pattern = re.compile(r'string\((\w+)\)')

            for class_name, class_info in class_defs.items():
                if not class_info:
                    continue

                # Extract category from slot_usage.role_category.ifabsent.
                # Keys present with null values parse as None, hence `or {}`.
                category = "UNKNOWN"
                slot_usage = class_info.get("slot_usage") or {}
                role_category = slot_usage.get("role_category") or {}
                ifabsent = role_category.get("ifabsent") or ""

                match = ifabsent_pattern.search(ifabsent)
                if match:
                    category = match.group(1)

                # Extract wikidata mapping from exact_mappings
                wikidata_mapping = None
                exact_mappings = class_info.get("exact_mappings", [])
                for mapping in exact_mappings:
                    if mapping.startswith("wikidata:"):
                        wikidata_mapping = mapping
                        break

                # Create role definition
                role_def = StaffRoleDefinition(
                    name=class_name,
                    category=category,
                    description=class_info.get("description"),
                    class_uri=class_info.get("class_uri"),
                    wikidata_mapping=wikidata_mapping,
                )

                # Add to category
                roles_by_category.setdefault(category, []).append(role_def)

            total_roles = sum(len(r) for r in roles_by_category.values())
            logger.debug(
                "Loaded %d staff roles across %d categories",
                total_roles, len(roles_by_category),
            )
            return roles_by_category

        except Exception as e:
            logger.warning(f"Could not load staff roles: {e}")
            return {}
|
|
|
|
|
|
# Singleton instance for easy access
|
|
_schema_loader: Optional[SchemaLoader] = None
|
|
|
|
|
|
def get_schema_loader() -> SchemaLoader:
    """Get singleton schema loader instance.

    Lazily constructs a module-level SchemaLoader on first call and returns
    the same instance thereafter.
    """
    global _schema_loader
    if _schema_loader is None:
        _schema_loader = SchemaLoader()
    return _schema_loader
|
|
|
|
|
|
@lru_cache(maxsize=1)
def get_heritage_schema() -> HeritageSchema:
    """Return the heritage schema, parsing it only on the first call."""
    return get_schema_loader().load()
|
|
|
|
|
|
# Convenience functions for common operations
|
|
def get_sparql_prefixes() -> str:
    """Get SPARQL prefix declarations from schema."""
    schema = get_heritage_schema()
    return schema.get_sparql_prefixes()
|
|
|
|
|
|
def get_custodian_types() -> list[str]:
    """Get list of valid custodian type names."""
    schema = get_heritage_schema()
    return schema.get_custodian_type_names()
|
|
|
|
|
|
def get_ontology_context() -> str:
    """Get formatted ontology context for DSPy prompts."""
    schema = get_heritage_schema()
    return schema.format_ontology_context_for_prompt()
|
|
|
|
|
|
def get_entity_types_prompt() -> str:
    """Get formatted entity types for DSPy entity extraction."""
    schema = get_heritage_schema()
    return schema.format_entity_types_for_prompt()
|
|
|
|
|
|
def get_key_properties_prompt() -> str:
    """Get formatted key properties for DSPy prompts."""
    schema = get_heritage_schema()
    return schema.format_key_properties_for_prompt()
|
|
|
|
|
|
# Staff Role Convenience Functions
|
|
def get_staff_role_categories() -> list[str]:
    """Get list of staff role category names (13 categories).

    Returns:
        List of role category names like ['CURATORIAL', 'ARCHIVAL', 'DIGITAL', ...]
    """
    schema = get_heritage_schema()
    return schema.get_staff_role_category_names()
|
|
|
|
|
|
def get_all_staff_roles() -> list[str]:
    """Get flat list of all staff role class names (64 roles).

    Returns:
        List of role names like ['Curator', 'Archivist', 'DataEngineer', ...]
    """
    schema = get_heritage_schema()
    return schema.get_staff_role_names()
|
|
|
|
|
|
def get_staff_role_classes() -> dict[str, list[str]]:
    """Get staff role names organized by category.

    Returns:
        Dictionary mapping category to list of role names.
        Example: {'CURATORIAL': ['Curator', 'CollectionsManager'], ...}
    """
    schema = get_heritage_schema()
    return schema.get_staff_roles_by_category()
|
|
|
|
|
|
def get_staff_roles_prompt() -> str:
    """Get formatted staff roles for DSPy prompts."""
    schema = get_heritage_schema()
    return schema.format_staff_roles_for_prompt()
|
|
|
|
|
|
def get_staff_role_categories_prompt() -> str:
    """Get formatted staff role categories for DSPy prompts."""
    schema = get_heritage_schema()
    return schema.format_staff_role_categories_for_prompt()
|
|
|
|
|
|
# =============================================================================
|
|
# Schema-Aware Signature Helpers
|
|
# =============================================================================
|
|
|
|
def create_schema_aware_sparql_docstring() -> str:
    """Create docstring for SPARQL generator with schema-derived prefixes.

    Assembles prefix, class, and property sections from the loaded schema
    and embeds them in a prompt docstring for the SPARQL generator.
    """
    schema = get_heritage_schema()

    # Prefix declarations (top 15)
    prefix_block = "\n".join(
        f" - PREFIX {p}: <{meta.uri}>"
        for p, meta in list(schema.prefixes.items())[:15]
    )

    # Key classes (first 8), one summary line each
    class_entries = []
    for cls_name, cls_def in schema.classes.items():
        class_uri = cls_def.class_uri or f"hc:{cls_name}"
        summary = (cls_def.description or "").split("\n")[0][:60]
        class_entries.append(f" - {class_uri} ({cls_name}): {summary}")
    class_block = "\n".join(class_entries[:8])

    # Key properties (first 10), one summary line each
    prop_entries = []
    for slot_name, slot_def in list(schema.slots.items())[:10]:
        slot_uri = slot_def.slot_uri or f"hc:{slot_name}"
        summary = (slot_def.description or "").split("\n")[0][:60]
        prop_entries.append(f" - {slot_uri}: {summary}")
    prop_block = "\n".join(prop_entries)

    return f"""Generate SPARQL queries for heritage custodian knowledge graph.

You are an expert in SPARQL and the Heritage Custodian Ontology (v{schema.version}).
Generate valid SPARQL queries that work with our Oxigraph endpoint.

Ontology Prefixes (MUST USE THESE EXACT URIs):
{prefix_block}

Key Classes:
{class_block}

Key Properties:
{prop_block}

Hub Architecture:
- Custodian (crm:E39_Actor) is the central hub entity
- CustodianObservation contains evidence from sources
- CustodianName holds standardized emic names
- CustodianLegalStatus holds formal legal entity info
- CustodianPlace holds geographic location
- CustodianCollection holds heritage collections
"""
|
|
|
|
|
|
def create_schema_aware_entity_docstring() -> str:
    """Create docstring for entity extractor with schema-derived types.

    Includes multilingual synonyms with language tags when ontology_mapping
    module is available, enabling better entity recognition across languages.

    Returns:
        Prompt docstring listing custodian types, the entity categories to
        extract, and (when available) multilingual synonyms per type.
    """
    schema = get_heritage_schema()

    type_lines = []
    for ct in schema.custodian_types:
        # Extract first part of description (text before any parenthetical)
        desc = ct.description.split("(")[0].strip() if ct.description else ct.name
        type_lines.append(f" - {ct.name}: {desc}")

    # Build multilingual synonym section with language tags
    synonym_lines = []
    try:
        # Import dynamically to avoid circular imports
        from backend.rag.ontology_mapping import get_ontology_mapper
        mapper = get_ontology_mapper()

        # Key types to include synonyms for
        key_types = [
            "MUSEUM", "LIBRARY", "ARCHIVE", "GALLERY", "RESEARCH_CENTER",
            "EDUCATION_PROVIDER", "HOLY_SACRED_SITE", "BIO_CUSTODIAN",
        ]

        for custodian_type in key_types:
            by_lang = mapper.get_all_synonyms_by_language(
                custodian_type, "CustodianPrimaryTypeEnum"
            )

            tagged_syns: list[str] = []
            # Sort languages for consistent output
            for lang in sorted(by_lang.keys()):
                if lang == "all":  # Skip the aggregate 'all' key
                    continue
                syns = by_lang[lang]
                # Take up to 2 synonyms per language
                for syn in sorted(syns)[:2]:
                    tagged_syns.append(f"{syn} ({lang})")

            if tagged_syns:
                # Limit to 6 total synonyms per type for brevity
                synonym_lines.append(f" - {custodian_type}: {', '.join(tagged_syns[:6])}")

        logger.debug(f"Built multilingual synonyms for {len(synonym_lines)} types")

    except ImportError:
        logger.warning("ontology_mapping not available, using static synonyms")
        # Fallback to static synonyms without language tags
        synonym_lines = [
            ' - MUSEUM: "museum", "musea", "museo", "musée"',
            ' - LIBRARY: "library", "bibliotheek", "bibliothèque"',
            ' - ARCHIVE: "archive", "archief", "archiv"',
            ' - GALLERY: "gallery", "galerie"',
        ]
    except Exception as e:
        logger.warning(f"Could not build multilingual synonyms: {e}")
        synonym_lines = []

    # Format synonym section (empty string when no synonyms were built)
    if synonym_lines:
        synonym_section = f"""
MULTILINGUAL SYNONYMS (term + language code):
{chr(10).join(synonym_lines)}
"""
    else:
        synonym_section = ""

    docstring = f"""Extract heritage-specific entities from text.

Identify institutions, places, dates, identifiers, and relationships
following the Heritage Custodian Ontology (v{schema.version}).

Institution Type Classification (GLAMORCUBESFIXPHDNT taxonomy):
{chr(10).join(type_lines)}

Entity Types to Extract:
- INSTITUTIONS: Heritage custodians with type classification
- PLACES: Geographic locations (cities, regions, countries)
- TEMPORAL: Dates and time periods (founding, closure, events)
- IDENTIFIERS: ISIL codes (NL-XXXX), Wikidata IDs (Q12345), GHCIDs
{synonym_section}
When extracting institution types, recognize synonyms in ANY language
and map them to the canonical GLAMORCUBESFIXPHDNT type.
"""

    return docstring
|
|
|
|
|
|
# =============================================================================
|
|
# OpenAI Prompt Caching Helpers
|
|
# =============================================================================
|
|
|
|
def create_cacheable_docstring(signature_docstring: str) -> str:
    """Create a cacheable docstring by prepending ontology context.

    OpenAI prompt caching requires 1024+ tokens at the START of the prompt,
    so the large, rarely-changing ontology context (1,200+ tokens) goes
    first and the task-specific instructions follow. Structuring prompts as
    [STATIC ontology context] + [signature-specific instructions] + [user input]
    maximizes cache hit rates and reduces both latency and costs.

    Benefits:
    - 50% cost reduction on cached input tokens
    - Up to 80% latency reduction
    - Automatic with OpenAI API (no explicit cache management)

    Args:
        signature_docstring: The original DSPy signature docstring

    Returns:
        Merged docstring with ontology context prepended (1,200+ tokens base)

    Example:
        >>> original = "Classify query intent..."  # 50 tokens
        >>> cacheable = create_cacheable_docstring(original)  # 1,250+ tokens
    """
    separator = (
        "\n\n"
        "============================================================\n"
        "TASK-SPECIFIC INSTRUCTIONS\n"
        "============================================================\n\n"
    )
    return get_ontology_context() + separator + signature_docstring
|
|
|
|
|
|
def get_cacheable_sparql_docstring() -> str:
    """Get SPARQL generator docstring with ontology context for caching.

    Returns a docstring with 1,500+ tokens, ensuring OpenAI prompt caching.
    """
    base = create_schema_aware_sparql_docstring()
    return create_cacheable_docstring(base)
|
|
|
|
|
|
def get_cacheable_entity_docstring() -> str:
    """Get entity extractor docstring with ontology context for caching.

    Returns a docstring with 1,500+ tokens, ensuring OpenAI prompt caching.
    """
    base = create_schema_aware_entity_docstring()
    return create_cacheable_docstring(base)
|
|
|
|
|
|
def get_cacheable_query_intent_docstring() -> str:
    """Get query intent docstring with ontology context for caching.

    Combines:
    - Full ontology context (1,200+ tokens)
    - Staff role categories and mappings
    - Custodian type definitions
    - Multilingual synonyms

    Returns a docstring with 2,000+ tokens, ensuring maximum cache utilization.
    """
    schema = get_heritage_schema()

    # Staff role context: the category list plus a few example mappings
    category_names = schema.get_staff_role_category_names()
    category_list = ", ".join(category_names)

    mapping_lines = [
        f" - {cat}: {', '.join(members[:3])}"
        for cat, members in list(schema.get_staff_roles_by_category().items())[:5]
    ]
    role_mapping_context = "\n".join(mapping_lines)

    # Custodian type context: first 15 type names
    type_examples = ", ".join(ct.name for ct in schema.custodian_types[:15])

    query_intent_docstring = f"""Classify the intent of a heritage institution query.

You are an expert in GLAM (Galleries, Libraries, Archives, Museums) heritage institutions.
Classify the user's query intent to route to appropriate data sources and retrieval strategies.

STAFF ROLE CATEGORIES ({len(category_names)} categories):
{category_list}

STAFF ROLE CATEGORY → ROLE MAPPING (examples):
{role_mapping_context}

CUSTODIAN TYPES ({len(schema.custodian_types)} types):
{type_examples}

CLASSIFICATION GUIDELINES:
- When entity_type='person', classify the role category and specific role
- When entity_type='institution', classify the custodian type
- Use 'UNKNOWN' when classification is not determinable
- Infer institution type from names (e.g., 'Rijksmuseum' → MUSEUM)
"""

    return create_cacheable_docstring(query_intent_docstring)
|
|
|
|
|
|
def get_cacheable_answer_docstring() -> str:
    """Get answer generator docstring with ontology context for caching.

    Combines:
    - Full ontology context (1,200+ tokens)
    - Key ontology terms for answer synthesis
    - Heritage custodian terminology

    Returns a docstring with 1,500+ tokens, ensuring OpenAI prompt caching.
    """
    schema = get_heritage_schema()

    answer_docstring = f"""Generate informative answers about heritage institutions.

You are an expert on heritage custodians following the Heritage Custodian Ontology (v{schema.version}).

Synthesize retrieved information into helpful, accurate responses that:
- Use correct ontology terminology
- Cite sources appropriately
- Include relevant heritage-specific details

Use conversation history to maintain context across multiple turns.
For follow-up questions, resolve pronouns and implicit references
using the previous conversation context.

{schema.format_entity_types_for_prompt()}

KEY ONTOLOGY TERMS:
- Custodian: Central hub entity (crm:E39_Actor) representing heritage keepers
- CustodianObservation: Source-based evidence from documents/websites
- CustodianName: Standardized emic (native) names
- CustodianLegalStatus: Formal legal entity information
- CustodianPlace: Geographic location with coordinates
- CustodianCollection: Heritage collections managed

ANSWER GUIDELINES:
- Always prefer ontology-aligned terminology in answers
- When discussing institution types, use GLAMORCUBESFIXPHDNT taxonomy
- Include temporal context (founding dates, historical changes) when relevant
- Reference specific collections, holdings, or digital platforms when known
"""

    return create_cacheable_docstring(answer_docstring)
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: load the schema from disk and print a summary of what was
    # parsed, plus samples of the prompt/docstring builders.
    logging.basicConfig(level=logging.INFO)

    schema = get_heritage_schema()

    print("\n=== SCHEMA LOADED ===")
    print(f"Name: {schema.name}")
    print(f"Version: {schema.version}")
    print(f"Classes: {len(schema.classes)}")
    print(f"Slots: {len(schema.slots)}")
    print(f"Custodian Types: {len(schema.custodian_types)}")

    print("\n=== SPARQL PREFIXES ===")
    print(schema.get_sparql_prefixes())

    print("\n=== CUSTODIAN TYPES ===")
    # Show only the first few types, with truncated descriptions
    for ct in schema.custodian_types[:5]:
        desc = ct.description[:60] if ct.description else "(no description)"
        print(f" - {ct.name}: {desc}...")

    print("\n=== ONTOLOGY CONTEXT (for DSPy) ===")
    print(schema.format_ontology_context_for_prompt()[:1000])

    print("\n=== SCHEMA-AWARE SPARQL DOCSTRING ===")
    print(create_schema_aware_sparql_docstring()[:1500])