""" LinkML Schema Loader for DSPy Heritage RAG Loads and parses LinkML schema files to provide schema-aware context for DSPy signatures and RAG pipeline components. The loader extracts: - Class definitions with descriptions and ontology mappings - Slot definitions with URIs and ranges - Enum values for controlled vocabularies - Prefix mappings for SPARQL generation This enables: 1. Dynamic schema context injection into DSPy signatures 2. Schema-validated entity extraction 3. Ontology-aligned SPARQL generation 4. Rich answer synthesis with correct ontology terms """ from __future__ import annotations import logging from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path from typing import Any, Optional import yaml logger = logging.getLogger(__name__) # Default schema directory SCHEMA_BASE_DIR = Path(__file__).parent.parent.parent / "schemas" / "20251121" / "linkml" @dataclass class OntologyPrefix: """An ontology prefix mapping.""" prefix: str uri: str description: Optional[str] = None @dataclass class SlotDefinition: """A slot (property) definition from LinkML schema.""" name: str slot_uri: Optional[str] = None range: Optional[str] = None description: Optional[str] = None required: bool = False multivalued: bool = False exact_mappings: list[str] = field(default_factory=list) close_mappings: list[str] = field(default_factory=list) examples: list[dict] = field(default_factory=list) @dataclass class EnumValue: """A permissible value in an enum.""" name: str description: Optional[str] = None meaning: Optional[str] = None # Wikidata mapping comments: list[str] = field(default_factory=list) @dataclass class EnumDefinition: """An enum definition from LinkML schema.""" name: str description: Optional[str] = None values: list[EnumValue] = field(default_factory=list) @dataclass class ClassDefinition: """A class definition from LinkML schema.""" name: str class_uri: Optional[str] = None description: Optional[str] = None is_a: 
Optional[str] = None slots: list[str] = field(default_factory=list) exact_mappings: list[str] = field(default_factory=list) close_mappings: list[str] = field(default_factory=list) narrow_mappings: list[str] = field(default_factory=list) @dataclass class StaffRoleDefinition: """A staff role class definition from LinkML schema. Represents an official job title/appellation in heritage institutions, categorized by role family (CURATORIAL, ARCHIVAL, DIGITAL, etc.). """ name: str category: str # CURATORIAL, ARCHIVAL, DIGITAL, etc. description: Optional[str] = None class_uri: Optional[str] = None common_variants: list[str] = field(default_factory=list) wikidata_mapping: Optional[str] = None # e.g., wikidata:Q674426 @dataclass class HeritageSchema: """Complete parsed heritage custodian schema.""" # Core schema metadata name: str version: str description: str # Ontology prefixes prefixes: dict[str, OntologyPrefix] = field(default_factory=dict) # Classes classes: dict[str, ClassDefinition] = field(default_factory=dict) # Slots (properties) slots: dict[str, SlotDefinition] = field(default_factory=dict) # Enums enums: dict[str, EnumDefinition] = field(default_factory=dict) # Custodian types (from CustodianPrimaryTypeEnum) custodian_types: list[EnumValue] = field(default_factory=list) # Staff roles organized by category (from StaffRoles.yaml) staff_roles: dict[str, list[StaffRoleDefinition]] = field(default_factory=dict) # Role categories (from RoleCategoryEnum in StaffRole.yaml) role_categories: list[EnumValue] = field(default_factory=list) def get_sparql_prefixes(self) -> str: """Generate SPARQL prefix declarations from schema prefixes.""" lines = [] for prefix, info in self.prefixes.items(): lines.append(f"PREFIX {prefix}: <{info.uri}>") return "\n".join(lines) def get_custodian_type_names(self) -> list[str]: """Get list of custodian type enum values.""" return [v.name for v in self.custodian_types] def get_staff_role_names(self) -> list[str]: """Get flat list of all staff 
role class names.""" roles = [] for category_roles in self.staff_roles.values(): roles.extend([r.name for r in category_roles]) return sorted(roles) def get_staff_role_category_names(self) -> list[str]: """Get list of staff role category names.""" return [v.name for v in self.role_categories] def get_staff_roles_by_category(self) -> dict[str, list[str]]: """Get staff role names organized by category.""" return { category: [r.name for r in roles] for category, roles in self.staff_roles.items() } def get_class_description(self, class_name: str) -> Optional[str]: """Get description for a class.""" cls = self.classes.get(class_name) return cls.description if cls else None def get_slot_uri(self, slot_name: str) -> Optional[str]: """Get the slot URI for a slot name.""" slot = self.slots.get(slot_name) return slot.slot_uri if slot else None def format_entity_types_for_prompt(self) -> str: """Format custodian types for DSPy prompt injection.""" lines = ["Heritage Custodian Types (GLAMORCUBESFIXPHDNT taxonomy):"] for ct in self.custodian_types: desc = ct.description.split("(")[0].strip() if ct.description else ct.name lines.append(f" - {ct.name}: {desc}") return "\n".join(lines) def format_key_properties_for_prompt(self) -> str: """Format key properties for DSPy prompt injection.""" key_slots = [ "hc_id", "preferred_label", "custodian_type", "legal_status", "place_designation", "has_collection", "identifiers", "organizational_structure", "encompassing_body" ] lines = ["Key Properties:"] for slot_name in key_slots: slot = self.slots.get(slot_name) if slot: uri = slot.slot_uri or f"hc:{slot_name}" desc = (slot.description or "").split("\n")[0][:80] lines.append(f" - {uri}: {desc}") return "\n".join(lines) def format_staff_role_categories_for_prompt(self) -> str: """Format staff role categories for DSPy prompt injection.""" lines = ["Staff Role Categories (13 categories):"] for rc in self.role_categories: desc = rc.description[:60] if rc.description else rc.name 
lines.append(f" - {rc.name}: {desc}") return "\n".join(lines) def format_staff_roles_for_prompt(self, max_per_category: int = 5) -> str: """Format staff roles for DSPy prompt injection. Args: max_per_category: Maximum roles to show per category (for brevity) """ lines = ["Staff Roles by Category:"] for category, roles in sorted(self.staff_roles.items()): role_names = [r.name for r in roles[:max_per_category]] if len(roles) > max_per_category: role_names.append(f"... +{len(roles) - max_per_category} more") lines.append(f" - {category}: {', '.join(role_names)}") return "\n".join(lines) def format_ontology_context_for_prompt(self) -> str: """Format complete ontology context for DSPy prompts.""" sections = [ "=" * 60, "HERITAGE CUSTODIAN ONTOLOGY CONTEXT", "=" * 60, "", "Hub Architecture:", " - Custodian (crm:E39_Actor): Central hub entity", " - CustodianObservation: Evidence from sources", " - CustodianName: Standardized emic names", " - CustodianLegalStatus: Formal legal entity", " - CustodianPlace: Geographic location", " - CustodianCollection: Heritage collections", "", self.format_entity_types_for_prompt(), "", self.format_key_properties_for_prompt(), "", ] # Add staff roles if loaded if self.role_categories: sections.extend([ self.format_staff_role_categories_for_prompt(), "", self.format_staff_roles_for_prompt(), "", ]) sections.append("Key Ontology Prefixes:") for prefix, info in list(self.prefixes.items())[:12]: # Top 12 prefixes sections.append(f" PREFIX {prefix}: <{info.uri}>") sections.extend([ "", "=" * 60, ]) return "\n".join(sections) class SchemaLoader: """ Loads and parses LinkML schema files for the Heritage Custodian Ontology. 
class SchemaLoader:
    """
    Loads and parses LinkML schema files for the Heritage Custodian Ontology.

    Usage:
        loader = SchemaLoader()
        schema = loader.load()

        # Get SPARQL prefixes
        prefixes = schema.get_sparql_prefixes()

        # Get custodian types for entity extraction
        types = schema.get_custodian_type_names()

        # Get prompt context
        context = schema.format_ontology_context_for_prompt()
    """

    def __init__(self, schema_dir: Optional[Path] = None):
        """Initialize schema loader.

        Args:
            schema_dir: Path to LinkML schema directory.
                Defaults to schemas/20251121/linkml/
        """
        self.schema_dir = schema_dir or SCHEMA_BASE_DIR
        self._schema: Optional[HeritageSchema] = None

    def load(self, force_reload: bool = False) -> HeritageSchema:
        """Load and parse the complete schema.

        Args:
            force_reload: Force reload even if cached

        Returns:
            Parsed HeritageSchema object

        Raises:
            FileNotFoundError: If the main schema file is missing.
        """
        if self._schema is not None and not force_reload:
            return self._schema

        logger.info(f"Loading LinkML schema from {self.schema_dir}")

        # Load main schema file
        main_schema_path = self.schema_dir / "01_custodian_name_modular.yaml"
        if not main_schema_path.exists():
            raise FileNotFoundError(f"Main schema not found: {main_schema_path}")

        with open(main_schema_path, "r", encoding="utf-8") as f:
            main_schema = yaml.safe_load(f)

        # Initialize schema object
        schema = HeritageSchema(
            name=main_schema.get("name", "heritage_custodian_ontology"),
            version=main_schema.get("version", "0.9.9"),
            description=main_schema.get("description", ""),
        )

        # Load prefixes from Custodian class (has the most complete set)
        schema.prefixes = self._load_prefixes()

        # Load custodian types enum
        schema.custodian_types = self._load_custodian_types()
        schema.enums["CustodianPrimaryTypeEnum"] = EnumDefinition(
            name="CustodianPrimaryTypeEnum",
            description="GLAMORCUBESFIXPHDNT Primary Type Categories",
            values=schema.custodian_types,
        )

        # Load key classes
        schema.classes = self._load_key_classes()

        # Load key slots
        schema.slots = self._load_key_slots()

        # Load staff role categories (RoleCategoryEnum)
        schema.role_categories = self._load_role_categories()
        schema.enums["RoleCategoryEnum"] = EnumDefinition(
            name="RoleCategoryEnum",
            description="Staff Role Categories",
            values=schema.role_categories,
        )

        # Load staff roles organized by category
        schema.staff_roles = self._load_staff_roles()

        self._schema = schema
        logger.info(f"Loaded schema with {len(schema.classes)} classes, "
                    f"{len(schema.slots)} slots, {len(schema.custodian_types)} custodian types, "
                    f"{len(schema.role_categories)} role categories, "
                    f"{sum(len(r) for r in schema.staff_roles.values())} staff roles")
        return schema

    def _load_prefixes(self) -> dict[str, OntologyPrefix]:
        """Load ontology prefixes from Custodian class file.

        Starts from a built-in default set and overlays anything declared in
        ``modules/classes/Custodian.yaml``; failure to read that file is
        non-fatal (defaults are returned).
        """
        # Default prefixes from main schema and Custodian class
        default_prefixes = {
            "linkml": "https://w3id.org/linkml/",
            "hc": "https://nde.nl/ontology/hc/",
            "crm": "http://www.cidoc-crm.org/cidoc-crm/",
            "prov": "http://www.w3.org/ns/prov#",
            "schema": "http://schema.org/",
            "cpov": "http://data.europa.eu/m8g/",
            "rico": "https://www.ica.org/standards/RiC/ontology#",
            "foaf": "http://xmlns.com/foaf/0.1/",
            "tooi": "https://identifier.overheid.nl/tooi/def/ont/",
            "org": "http://www.w3.org/ns/org#",
            "skos": "http://www.w3.org/2004/02/skos/core#",
            "dcterms": "http://purl.org/dc/terms/",
            "dct": "http://purl.org/dc/terms/",
            "wdt": "http://www.wikidata.org/prop/direct/",
            "wikidata": "http://www.wikidata.org/entity/",
            "geo": "http://www.opengis.net/ont/geosparql#",
            "geof": "http://www.opengis.net/def/function/geosparql/",
            "ghcid": "https://nde.nl/ontology/hc/",
            "sosa": "http://www.w3.org/ns/sosa/",
        }

        # Try to load from Custodian.yaml for additional prefixes
        custodian_path = self.schema_dir / "modules" / "classes" / "Custodian.yaml"
        if custodian_path.exists():
            try:
                with open(custodian_path, "r", encoding="utf-8") as f:
                    custodian_yaml = yaml.safe_load(f)
                if "prefixes" in custodian_yaml:
                    # NOTE(review): LinkML prefix values may also be mapping
                    # objects rather than plain URI strings -- confirm the
                    # source files only use the plain-string form.
                    default_prefixes.update(custodian_yaml["prefixes"])
            except Exception as e:
                logger.warning(f"Could not load prefixes from Custodian.yaml: {e}")

        return {
            prefix: OntologyPrefix(prefix=prefix, uri=uri)
            for prefix, uri in default_prefixes.items()
        }

    def _load_custodian_types(self) -> list[EnumValue]:
        """Load CustodianPrimaryTypeEnum values.

        Returns an empty list (with a warning) when the enum file is missing
        or unreadable, mirroring the behavior of the other ``_load_*`` helpers.
        """
        enum_path = self.schema_dir / "modules" / "enums" / "CustodianPrimaryTypeEnum.yaml"
        if not enum_path.exists():
            logger.warning(f"CustodianPrimaryTypeEnum not found: {enum_path}")
            return []

        # FIX: wrap parsing in try/except for consistency with the sibling
        # loaders -- previously a malformed YAML file crashed load() outright.
        try:
            with open(enum_path, "r", encoding="utf-8") as f:
                enum_yaml = yaml.safe_load(f)

            values = []
            enum_def = enum_yaml.get("enums", {}).get("CustodianPrimaryTypeEnum", {})
            permissible_values = enum_def.get("permissible_values", {})
            for name, info in permissible_values.items():
                # FIX: LinkML allows bare permissible values ("NAME:" with no
                # body), in which case info is None; previously this raised
                # AttributeError on info.get(...).
                info = info or {}
                values.append(EnumValue(
                    name=name,
                    description=info.get("description"),
                    meaning=info.get("meaning"),
                    comments=info.get("comments", []),
                ))
            return values
        except Exception as e:
            logger.warning(f"Could not load custodian types: {e}")
            return []

    def _load_key_classes(self) -> dict[str, ClassDefinition]:
        """Load key class definitions.

        Missing files are skipped silently; unreadable files are skipped with
        a warning.
        """
        classes = {}

        # Key classes to load
        key_class_files = [
            "Custodian.yaml",
            "CustodianName.yaml",
            "CustodianObservation.yaml",
            "CustodianLegalStatus.yaml",
            "CustodianPlace.yaml",
            "CustodianCollection.yaml",
            "Identifier.yaml",
            "TimeSpan.yaml",
            "OrganizationalStructure.yaml",
            "EncompassingBody.yaml",
        ]

        classes_dir = self.schema_dir / "modules" / "classes"
        for filename in key_class_files:
            filepath = classes_dir / filename
            if not filepath.exists():
                continue

            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    class_yaml = yaml.safe_load(f)

                # Find class definition in the YAML
                class_defs = class_yaml.get("classes", {})
                for class_name, class_info in class_defs.items():
                    # FIX: guard against bare class stubs ("Name:" with no
                    # body) so one empty entry no longer discards the whole
                    # file via the except branch.
                    class_info = class_info or {}
                    classes[class_name] = ClassDefinition(
                        name=class_name,
                        class_uri=class_info.get("class_uri"),
                        description=class_info.get("description"),
                        is_a=class_info.get("is_a"),
                        slots=class_info.get("slots", []),
                        exact_mappings=class_info.get("exact_mappings", []),
                        close_mappings=class_info.get("close_mappings", []),
                        narrow_mappings=class_info.get("narrow_mappings", []),
                    )
            except Exception as e:
                logger.warning(f"Could not load class from {filepath}: {e}")

        return classes

    def _load_key_slots(self) -> dict[str, SlotDefinition]:
        """Load key slot definitions.

        Missing files are skipped silently; unreadable files are skipped with
        a warning.
        """
        slots = {}

        # Key slots to load
        key_slot_files = [
            "hc_id.yaml",
            "preferred_label.yaml",
            "custodian_type.yaml",
            "legal_status.yaml",
            "place_designation.yaml",
            "has_collection.yaml",
            "identifiers.yaml",
            "organizational_structure.yaml",
            "encompassing_body.yaml",
            "identifier_scheme.yaml",
            "identifier_value.yaml",
            "observed_name.yaml",
            "emic_name.yaml",
            "valid_from.yaml",
            "valid_to.yaml",
        ]

        slots_dir = self.schema_dir / "modules" / "slots"
        for filename in key_slot_files:
            filepath = slots_dir / filename
            if not filepath.exists():
                continue

            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    slot_yaml = yaml.safe_load(f)

                # Find slot definition in the YAML
                slot_defs = slot_yaml.get("slots", {})
                for slot_name, slot_info in slot_defs.items():
                    # FIX: same bare-stub guard as _load_key_classes
                    slot_info = slot_info or {}
                    slots[slot_name] = SlotDefinition(
                        name=slot_name,
                        slot_uri=slot_info.get("slot_uri"),
                        range=slot_info.get("range"),
                        description=slot_info.get("description"),
                        required=slot_info.get("required", False),
                        multivalued=slot_info.get("multivalued", False),
                        exact_mappings=slot_info.get("exact_mappings", []),
                        close_mappings=slot_info.get("close_mappings", []),
                        examples=slot_info.get("examples", []),
                    )
            except Exception as e:
                logger.warning(f"Could not load slot from {filepath}: {e}")

        return slots

    def _load_role_categories(self) -> list[EnumValue]:
        """Load RoleCategoryEnum values from StaffRole.yaml."""
        enum_path = self.schema_dir / "modules" / "classes" / "StaffRole.yaml"
        if not enum_path.exists():
            logger.warning(f"StaffRole.yaml not found: {enum_path}")
            return []

        try:
            with open(enum_path, "r", encoding="utf-8") as f:
                staff_role_yaml = yaml.safe_load(f)

            values = []
            enum_def = staff_role_yaml.get("enums", {}).get("RoleCategoryEnum", {})
            permissible_values = enum_def.get("permissible_values", {})
            for name, info in permissible_values.items():
                values.append(EnumValue(
                    name=name,
                    description=info.get("description") if info else None,
                ))
            logger.debug(f"Loaded {len(values)} role categories")
            return values
        except Exception as e:
            logger.warning(f"Could not load role categories: {e}")
            return []

    def _load_staff_roles(self) -> dict[str, list[StaffRoleDefinition]]:
        """Load staff role classes organized by category from StaffRoles.yaml.

        Parses the slot_usage.role_category.ifabsent pattern to determine
        category. Example: ifabsent: "string(CURATORIAL)" -> category =
        "CURATORIAL"

        Returns:
            Dictionary mapping category name to list of StaffRoleDefinition
        """
        import re  # local import keeps the module's top-level imports minimal

        roles_path = self.schema_dir / "modules" / "classes" / "StaffRoles.yaml"
        if not roles_path.exists():
            logger.warning(f"StaffRoles.yaml not found: {roles_path}")
            return {}

        try:
            with open(roles_path, "r", encoding="utf-8") as f:
                roles_yaml = yaml.safe_load(f)

            roles_by_category: dict[str, list[StaffRoleDefinition]] = {}
            class_defs = roles_yaml.get("classes", {})

            # Regex to extract category from ifabsent: "string(CURATORIAL)"
            ifabsent_pattern = re.compile(r'string\((\w+)\)')

            for class_name, class_info in class_defs.items():
                if not class_info:
                    continue

                # Extract category from slot_usage.role_category.ifabsent.
                # FIX: "or {}" guards handle explicit nulls in the YAML so one
                # malformed class no longer aborts the whole load via except.
                category = "UNKNOWN"
                slot_usage = class_info.get("slot_usage") or {}
                role_category = slot_usage.get("role_category") or {}
                ifabsent = role_category.get("ifabsent") or ""
                match = ifabsent_pattern.search(ifabsent)
                if match:
                    category = match.group(1)

                # Extract wikidata mapping from exact_mappings
                wikidata_mapping = None
                for mapping in class_info.get("exact_mappings") or []:
                    if mapping.startswith("wikidata:"):
                        wikidata_mapping = mapping
                        break

                # Create role definition and file it under its category
                role_def = StaffRoleDefinition(
                    name=class_name,
                    category=category,
                    description=class_info.get("description"),
                    class_uri=class_info.get("class_uri"),
                    wikidata_mapping=wikidata_mapping,
                )
                roles_by_category.setdefault(category, []).append(role_def)

            total_roles = sum(len(r) for r in roles_by_category.values())
            logger.debug(f"Loaded {total_roles} staff roles across {len(roles_by_category)} categories")
            return roles_by_category
        except Exception as e:
            logger.warning(f"Could not load staff roles: {e}")
            return {}


# Singleton instance for easy access
_schema_loader: Optional[SchemaLoader] = None


def get_schema_loader() -> SchemaLoader:
    """Get singleton schema loader instance."""
    global _schema_loader
    if _schema_loader is None:
        _schema_loader = SchemaLoader()
    return _schema_loader


@lru_cache(maxsize=1)
def get_heritage_schema() -> HeritageSchema:
    """Get cached heritage schema (loaded once)."""
    loader = get_schema_loader()
    return loader.load()


# Convenience functions for common operations

def get_sparql_prefixes() -> str:
    """Get SPARQL prefix declarations from schema."""
    return get_heritage_schema().get_sparql_prefixes()


def get_custodian_types() -> list[str]:
    """Get list of valid custodian type names."""
    return get_heritage_schema().get_custodian_type_names()


def get_ontology_context() -> str:
    """Get formatted ontology context for DSPy prompts."""
    return get_heritage_schema().format_ontology_context_for_prompt()


def get_entity_types_prompt() -> str:
    """Get formatted entity types for DSPy entity extraction."""
    return get_heritage_schema().format_entity_types_for_prompt()


def get_key_properties_prompt() -> str:
    """Get formatted key properties for DSPy prompts."""
    return get_heritage_schema().format_key_properties_for_prompt()


# Staff Role Convenience Functions

def get_staff_role_categories() -> list[str]:
    """Get list of staff role category names (13 categories).

    Returns:
        List of role category names like ['CURATORIAL', 'ARCHIVAL', 'DIGITAL', ...]
    """
    return get_heritage_schema().get_staff_role_category_names()


def get_all_staff_roles() -> list[str]:
    """Get flat list of all staff role class names (64 roles).

    Returns:
        List of role names like ['Curator', 'Archivist', 'DataEngineer', ...]
    """
    return get_heritage_schema().get_staff_role_names()
""" return get_heritage_schema().get_staff_role_names() def get_staff_role_classes() -> dict[str, list[str]]: """Get staff role names organized by category. Returns: Dictionary mapping category to list of role names. Example: {'CURATORIAL': ['Curator', 'CollectionsManager'], ...} """ return get_heritage_schema().get_staff_roles_by_category() def get_staff_roles_prompt() -> str: """Get formatted staff roles for DSPy prompts.""" return get_heritage_schema().format_staff_roles_for_prompt() def get_staff_role_categories_prompt() -> str: """Get formatted staff role categories for DSPy prompts.""" return get_heritage_schema().format_staff_role_categories_for_prompt() # ============================================================================= # Schema-Aware Signature Helpers # ============================================================================= def create_schema_aware_sparql_docstring() -> str: """Create docstring for SPARQL generator with schema-derived prefixes.""" schema = get_heritage_schema() # Build prefix section prefix_lines = [] for prefix, info in list(schema.prefixes.items())[:15]: # Top 15 prefix_lines.append(f" - PREFIX {prefix}: <{info.uri}>") # Build class section class_lines = [] for cls_name, cls_def in schema.classes.items(): uri = cls_def.class_uri or f"hc:{cls_name}" desc = (cls_def.description or "").split("\n")[0][:60] class_lines.append(f" - {uri} ({cls_name}): {desc}") # Build property section prop_lines = [] for slot_name, slot_def in list(schema.slots.items())[:10]: uri = slot_def.slot_uri or f"hc:{slot_name}" desc = (slot_def.description or "").split("\n")[0][:60] prop_lines.append(f" - {uri}: {desc}") docstring = f"""Generate SPARQL queries for heritage custodian knowledge graph. You are an expert in SPARQL and the Heritage Custodian Ontology (v{schema.version}). Generate valid SPARQL queries that work with our Oxigraph endpoint. 
Ontology Prefixes (MUST USE THESE EXACT URIs): {chr(10).join(prefix_lines)} Key Classes: {chr(10).join(class_lines[:8])} Key Properties: {chr(10).join(prop_lines)} Hub Architecture: - Custodian (crm:E39_Actor) is the central hub entity - CustodianObservation contains evidence from sources - CustodianName holds standardized emic names - CustodianLegalStatus holds formal legal entity info - CustodianPlace holds geographic location - CustodianCollection holds heritage collections """ return docstring def create_schema_aware_entity_docstring() -> str: """Create docstring for entity extractor with schema-derived types. Includes multilingual synonyms with language tags when ontology_mapping module is available, enabling better entity recognition across languages. """ schema = get_heritage_schema() type_lines = [] for ct in schema.custodian_types: # Extract first part of description desc = ct.description.split("(")[0].strip() if ct.description else ct.name type_lines.append(f" - {ct.name}: {desc}") # Build multilingual synonym section with language tags synonym_lines = [] try: # Import dynamically to avoid circular imports from backend.rag.ontology_mapping import get_ontology_mapper mapper = get_ontology_mapper() # Key types to include synonyms for key_types = [ "MUSEUM", "LIBRARY", "ARCHIVE", "GALLERY", "RESEARCH_CENTER", "EDUCATION_PROVIDER", "HOLY_SACRED_SITE", "BIO_CUSTODIAN", ] for custodian_type in key_types: by_lang = mapper.get_all_synonyms_by_language( custodian_type, "CustodianPrimaryTypeEnum" ) tagged_syns: list[str] = [] # Sort languages for consistent output for lang in sorted(by_lang.keys()): if lang == "all": # Skip the aggregate 'all' key continue syns = by_lang[lang] # Take up to 2 synonyms per language for syn in sorted(syns)[:2]: tagged_syns.append(f"{syn} ({lang})") if tagged_syns: # Limit to 6 total synonyms per type for brevity synonym_lines.append(f" - {custodian_type}: {', '.join(tagged_syns[:6])}") logger.debug(f"Built multilingual synonyms for 
{len(synonym_lines)} types") except ImportError: logger.warning("ontology_mapping not available, using static synonyms") # Fallback to static synonyms without language tags synonym_lines = [ ' - MUSEUM: "museum", "musea", "museo", "musée"', ' - LIBRARY: "library", "bibliotheek", "bibliothèque"', ' - ARCHIVE: "archive", "archief", "archiv"', ' - GALLERY: "gallery", "galerie"', ] except Exception as e: logger.warning(f"Could not build multilingual synonyms: {e}") synonym_lines = [] # Format synonym section if synonym_lines: synonym_section = f""" MULTILINGUAL SYNONYMS (term + language code): {chr(10).join(synonym_lines)} """ else: synonym_section = "" docstring = f"""Extract heritage-specific entities from text. Identify institutions, places, dates, identifiers, and relationships following the Heritage Custodian Ontology (v{schema.version}). Institution Type Classification (GLAMORCUBESFIXPHDNT taxonomy): {chr(10).join(type_lines)} Entity Types to Extract: - INSTITUTIONS: Heritage custodians with type classification - PLACES: Geographic locations (cities, regions, countries) - TEMPORAL: Dates and time periods (founding, closure, events) - IDENTIFIERS: ISIL codes (NL-XXXX), Wikidata IDs (Q12345), GHCIDs {synonym_section} When extracting institution types, recognize synonyms in ANY language and map them to the canonical GLAMORCUBESFIXPHDNT type. """ return docstring # ============================================================================= # OpenAI Prompt Caching Helpers # ============================================================================= def create_cacheable_docstring(signature_docstring: str) -> str: """Create a cacheable docstring by prepending ontology context. OpenAI prompt caching requires 1024+ tokens at the START of the prompt. This function prepends the full ontology context (1,200+ tokens) to any signature docstring, ensuring it will be cached. The ontology context is STATIC (changes only when schema changes), while the user's query is DYNAMIC. 
def get_cacheable_sparql_docstring() -> str:
    """Get SPARQL generator docstring with ontology context for caching.

    Returns a docstring with 1,500+ tokens, ensuring OpenAI prompt caching.
    """
    return create_cacheable_docstring(create_schema_aware_sparql_docstring())


def get_cacheable_entity_docstring() -> str:
    """Get entity extractor docstring with ontology context for caching.

    Returns a docstring with 1,500+ tokens, ensuring OpenAI prompt caching.
    """
    return create_cacheable_docstring(create_schema_aware_entity_docstring())


def get_cacheable_query_intent_docstring() -> str:
    """Get query intent docstring with ontology context for caching.

    Combines:
    - Full ontology context (1,200+ tokens)
    - Staff role categories and mappings
    - Custodian type definitions
    - Multilingual synonyms

    Returns a docstring with 2,000+ tokens, ensuring maximum cache utilization.
    """
    schema = get_heritage_schema()

    # Staff role context: category list plus a few example role mappings
    role_categories = schema.get_staff_role_category_names()
    role_cat_list = ", ".join(role_categories)
    roles_by_category = schema.get_staff_roles_by_category()
    role_mapping_context = "\n".join(
        f" - {cat}: {', '.join(roles[:3])}"
        for cat, roles in list(roles_by_category.items())[:5]
    )

    # Custodian type context: first 15 type names
    type_examples = ", ".join(ct.name for ct in schema.custodian_types[:15])

    query_intent_docstring = f"""Classify the intent of a heritage institution query.

You are an expert in GLAM (Galleries, Libraries, Archives, Museums)
heritage institutions. Classify the user's query intent to route to
appropriate data sources and retrieval strategies.

STAFF ROLE CATEGORIES ({len(role_categories)} categories):
{role_cat_list}

STAFF ROLE CATEGORY → ROLE MAPPING (examples):
{role_mapping_context}

CUSTODIAN TYPES ({len(schema.custodian_types)} types):
{type_examples}

CLASSIFICATION GUIDELINES:
- When entity_type='person', classify the role category and specific role
- When entity_type='institution', classify the custodian type
- Use 'UNKNOWN' when classification is not determinable
- Infer institution type from names (e.g., 'Rijksmuseum' → MUSEUM)
"""
    return create_cacheable_docstring(query_intent_docstring)


def get_cacheable_answer_docstring() -> str:
    """Get answer generator docstring with ontology context for caching.

    Combines:
    - Full ontology context (1,200+ tokens)
    - Key ontology terms for answer synthesis
    - Heritage custodian terminology

    Returns a docstring with 1,500+ tokens, ensuring OpenAI prompt caching.
    """
    schema = get_heritage_schema()
    type_context = schema.format_entity_types_for_prompt()

    answer_docstring = f"""Generate informative answers about heritage institutions.

You are an expert on heritage custodians following the Heritage Custodian
Ontology (v{schema.version}).

Synthesize retrieved information into helpful, accurate responses that:
- Use correct ontology terminology
- Cite sources appropriately
- Include relevant heritage-specific details

Use conversation history to maintain context across multiple turns.
For follow-up questions, resolve pronouns and implicit references
using the previous conversation context.

{type_context}

KEY ONTOLOGY TERMS:
- Custodian: Central hub entity (crm:E39_Actor) representing heritage keepers
- CustodianObservation: Source-based evidence from documents/websites
- CustodianName: Standardized emic (native) names
- CustodianLegalStatus: Formal legal entity information
- CustodianPlace: Geographic location with coordinates
- CustodianCollection: Heritage collections managed

ANSWER GUIDELINES:
- Always prefer ontology-aligned terminology in answers
- When discussing institution types, use GLAMORCUBESFIXPHDNT taxonomy
- Include temporal context (founding dates, historical changes) when relevant
- Reference specific collections, holdings, or digital platforms when known
"""
    return create_cacheable_docstring(answer_docstring)


if __name__ == "__main__":
    # Smoke test: load the schema and print a sample of every artifact kind.
    logging.basicConfig(level=logging.INFO)

    schema = get_heritage_schema()

    print("\n=== SCHEMA LOADED ===")
    print(f"Name: {schema.name}")
    print(f"Version: {schema.version}")
    print(f"Classes: {len(schema.classes)}")
    print(f"Slots: {len(schema.slots)}")
    print(f"Custodian Types: {len(schema.custodian_types)}")

    print("\n=== SPARQL PREFIXES ===")
    print(schema.get_sparql_prefixes())

    print("\n=== CUSTODIAN TYPES ===")
    for ct in schema.custodian_types[:5]:
        desc = ct.description[:60] if ct.description else "(no description)"
        print(f" - {ct.name}: {desc}...")

    print("\n=== ONTOLOGY CONTEXT (for DSPy) ===")
    print(schema.format_ontology_context_for_prompt()[:1000])

    print("\n=== SCHEMA-AWARE SPARQL DOCSTRING ===")
    print(create_schema_aware_sparql_docstring()[:1500])