diff --git a/.opencode/rules/no-deletion-from-slot-fixes.md b/.opencode/rules/no-deletion-from-slot-fixes.md
new file mode 100644
index 0000000000..ae5a6ac855
--- /dev/null
+++ b/.opencode/rules/no-deletion-from-slot-fixes.md
@@ -0,0 +1,17 @@
+# Rule: Do Not Delete Entries from slot_fixes.yaml
+
+**CRITICAL**: Entries in `schemas/20251121/linkml/modules/slots/slot_fixes.yaml` MUST NEVER be deleted.
+
+This file serves as a persistent audit log and migration tracking registry.
+
+**Protocol**:
+1. **Process** the migration specified in the `revision` section.
+2. **Update** the `processed` section:
+ * Set `status: true`.
+ * Add a `notes` field describing the action taken (e.g., "Migrated to has_or_had_name + PersonName class. Slot archived.").
+ * Add a `date` field (YYYY-MM-DD).
+3. **Keep** the original entry intact.
+
+**Forbidden**:
+* ❌ Deleting a processed block.
+* ❌ Removing an entry because the slot file doesn't exist (instead, mark it as processed with the note "Slot file not found, skipped").
diff --git a/.opencode/rules/preserve-bespoke-slots-until-refactoring.md b/.opencode/rules/preserve-bespoke-slots-until-refactoring.md
new file mode 100644
index 0000000000..4df0810fe3
--- /dev/null
+++ b/.opencode/rules/preserve-bespoke-slots-until-refactoring.md
@@ -0,0 +1,32 @@
+# Rule: Preserve Bespoke Slots Until Refactoring
+
+**Identifier**: `preserve-bespoke-slots-until-refactoring`
+**Severity**: **CRITICAL**
+
+## Core Directive
+
+**DO NOT remove or migrate "additional" bespoke slots during generic migration passes unless they are the specific target of the current task.**
+
+## Context
+
+When migrating a specific slot (e.g., `has_approval_date`), you may encounter other bespoke or legacy slots in the same class file (e.g., `innovation_budget`, `operating_budget`).
+
+**YOU MUST**:
+* ✅ Migrate ONLY the specific slot you were instructed to work on.
+* ✅ Leave other bespoke slots exactly as they are.
+* ✅ Focus strictly on the current migration target.
+
+**YOU MUST NOT**:
+* ❌ Proactively migrate "nearby" slots just because they look like they need refactoring.
+* ❌ Remove slots that seem unused or redundant without specific instruction.
+* ❌ "Clean up" the class file by removing legacy attributes.
+
+## Rationale
+
+Refactoring is a separate, planned phase. Mixing opportunistic refactoring with systematic slot migration increases the risk of regression and makes changes harder to review. Other bespoke slots will be refactored later in their own dedicated tasks.
+
+## Workflow
+
+1. **Identify Target**: Identify the specific slot(s) assigned for migration (from `slot_fixes.yaml` or user prompt).
+2. **Execute Migration**: Apply changes ONLY for those slots.
+3. **Ignore Others**: Do not touch other slots in the file, even if they violate other rules (like Rule 39 or Rule 53). Those will be handled in their own dedicated tasks.
diff --git a/backend/rag/hybrid_retriever.py b/backend/rag/hybrid_retriever.py
new file mode 100644
index 0000000000..946eef891d
--- /dev/null
+++ b/backend/rag/hybrid_retriever.py
@@ -0,0 +1,2534 @@
+"""
+Hybrid Retriever: Vector Search + Knowledge Graph Expansion
+
+Combines Qdrant vector similarity search with Oxigraph SPARQL graph expansion
+to provide semantically-aware and structurally-enriched retrieval.
+
+Architecture:
+ 1. Vector Search (Qdrant) - Find semantically similar institutions AND persons
+ 2. Graph Expansion (Oxigraph) - Expand via relationships:
+ - Same city/region
+ - Same institution type
+ - Related collections
+ - Organizational relationships
+ 3. Re-ranking - Combine scores for final ranking
+ 4. Query Routing - Detect if query is about institutions or persons
+
+Collections:
+ - heritage_custodians: Institution data (27K+ records)
+ - heritage_persons: Staff/person data (10K+ records)
+
+Example usage:
+ retriever = HybridRetriever(
+ qdrant_host="localhost",
+ qdrant_port=6333,
+ sparql_endpoint="http://localhost:7878/query"
+ )
+
+ # Institution search
+ results = retriever.search("museums with Dutch colonial history")
+
+ # Person search (auto-detected or explicit)
+ results = retriever.search("Who works at the Nationaal Archief?")
+ results = retriever.search_persons("archivist at Rijksmuseum")
+"""
+
+import hashlib
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from typing import Any, TYPE_CHECKING
+
+import httpx
+
+# Polygon filter for geographic containment testing (Dutch provinces)
+from glam_extractor.geocoding.polygon_filter import (
+ get_polygon_filter,
+ ProvincePolygonFilter,
+)
+
+if TYPE_CHECKING:
+ from qdrant_client import QdrantClient
+ from openai import OpenAI
+ from sentence_transformers import SentenceTransformer
+ # Forward reference as string to avoid circular imports
+ MultiEmbeddingRetriever = Any # Actually from glam_extractor.api.multi_embedding_retriever
+ EmbeddingModel = Any # Actually from glam_extractor.api.multi_embedding_retriever
+
+logger = logging.getLogger(__name__)
+
+
+# SPARQL endpoint configuration
+DEFAULT_SPARQL_ENDPOINT = os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query")
+DEFAULT_SPARQL_TIMEOUT = 30.0
+
+# Ontology prefixes used in Oxigraph
+SPARQL_PREFIXES = """
+PREFIX hc:
+PREFIX hcc:
+PREFIX ghc:
+PREFIX skos:
+PREFIX wdt:
+PREFIX wd:
+PREFIX schema:
+PREFIX geo:
+PREFIX rdfs:
+PREFIX rdf:
+"""
+
+
+@dataclass
+class RetrievedInstitution:
+ """A retrieved heritage institution with combined scores."""
+
+ ghcid: str
+ name: str
+ uri: str
+ vector_score: float = 0.0
+ graph_score: float = 0.0
+ combined_score: float = 0.0
+
+ # Metadata from vector search
+ institution_type: str | None = None
+ country: str | None = None
+ city: str | None = None
+ description: str | None = None
+
+ # Geographic coordinates
+ latitude: float | None = None
+ longitude: float | None = None
+
+ # Graph expansion data
+ related_institutions: list[str] = field(default_factory=list)
+ expansion_reason: str | None = None # e.g., "same_city", "same_type", "related_collection"
+
+ def to_dict(self) -> dict[str, Any]:
+ """Convert to dictionary for API responses."""
+ return {
+ "ghcid": self.ghcid,
+ "name": self.name,
+ "uri": self.uri,
+ "scores": {
+ "vector": round(self.vector_score, 4),
+ "graph": round(self.graph_score, 4),
+ "combined": round(self.combined_score, 4),
+ },
+ "metadata": {
+ "institution_type": self.institution_type,
+ "country": self.country,
+ "city": self.city,
+ "description": self.description,
+ "latitude": self.latitude,
+ "longitude": self.longitude,
+ },
+ "graph_expansion": {
+ "related_institutions": self.related_institutions,
+ "expansion_reason": self.expansion_reason,
+ }
+ }
+
+
+# ===================================================================
+# Linked Data URI Generation Utilities
+# ===================================================================
+# Generate stable ontology-aligned URIs for Person and PersonObservation
+# following the LinkML schema at schemas/20251121/linkml/
+# Namespace: https://nde.nl/ontology/hc/
+# ===================================================================
+
+# NOTE(review): mid-file imports — PEP 8 wants these at the top of the module;
+# kept in place here to avoid reordering in this change. Move during a
+# dedicated cleanup.
+import re
+import unicodedata
+
+# Ontology namespaces (base + per-entity URI prefixes used by the
+# generate_*_uri helpers below)
+ONTOLOGY_BASE = "https://nde.nl/ontology/hc"
+PERSON_HUB_PREFIX = f"{ONTOLOGY_BASE}/person"
+PERSON_OBS_PREFIX = f"{ONTOLOGY_BASE}/person-obs"
+CUSTODIAN_PREFIX = f"{ONTOLOGY_BASE}/custodian"
+
+# JSON-LD context for person search responses: maps the short keys emitted by
+# RetrievedPerson.to_dict() onto schema.org / PiCo / PROV / FOAF terms.
+PERSON_JSONLD_CONTEXT = {
+    "@vocab": f"{ONTOLOGY_BASE}/",
+    "schema": "http://schema.org/",
+    "pico": "https://personsincontext.org/model#",
+    "prov": "http://www.w3.org/ns/prov#",
+    "foaf": "http://xmlns.com/foaf/0.1/",
+    "name": "schema:name",
+    "jobTitle": "schema:jobTitle",
+    "affiliation": "schema:affiliation",
+    "sameAs": "schema:sameAs",
+    "refers_to_person": "pico:observationOf",
+    "observation_source": "prov:hadPrimarySource",
+}
+
+
+def generate_slug(text: str) -> str:
+ """Generate URL-safe slug from text.
+
+ Examples:
+ "Kitty Bogte" → "kitty-bogte"
+ "Dr. Jane Smith" → "dr-jane-smith"
+ "Taco Dibbits" → "taco-dibbits"
+ """
+ if not text:
+ return "unknown"
+
+ # Normalize unicode (NFD decomposition) and remove diacritics
+ normalized = unicodedata.normalize('NFD', text)
+ ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
+
+ # Convert to lowercase
+ lowercase = ascii_text.lower()
+
+ # Replace non-alphanumeric with hyphens
+ slug = re.sub(r'[^a-z0-9]+', '-', lowercase)
+
+ # Collapse multiple hyphens and strip leading/trailing
+ slug = re.sub(r'-+', '-', slug).strip('-')
+
+ return slug or "unknown"
+
+
+def generate_role_slug(headline: str | None) -> str:
+ """Generate role slug from job title/headline.
+
+ Examples:
+ "Programmer/curator" → "programmer-curator"
+ "Senior Archivist" → "senior-archivist"
+ None → "staff"
+ """
+ if not headline:
+ return "staff"
+ return generate_slug(headline)
+
+
+def generate_person_hub_uri(name: str, linkedin_slug: str | None = None) -> str:
+ """Generate Person hub URI (abstract identity).
+
+ Format: https://nde.nl/ontology/hc/person/{person-slug}
+
+ Uses LinkedIn slug if available for stability, otherwise derives from name.
+
+ Examples:
+ generate_person_hub_uri("Kitty Bogte", "kittybogte")
+ → "https://nde.nl/ontology/hc/person/kittybogte"
+ generate_person_hub_uri("Dr. Jane Smith")
+ → "https://nde.nl/ontology/hc/person/dr-jane-smith"
+ """
+ if linkedin_slug:
+ slug = linkedin_slug
+ else:
+ slug = generate_slug(name)
+
+ return f"{PERSON_HUB_PREFIX}/{slug}"
+
+
+def generate_observation_uri(
+ custodian_slug: str | None,
+ person_name: str,
+ role_slug: str | None = None,
+ linkedin_slug: str | None = None
+) -> str:
+ """Generate PersonObservation URI.
+
+ Format: https://nde.nl/ontology/hc/person-obs/{custodian-slug}/{person-slug}/{role-slug}
+
+ Examples:
+ generate_observation_uri("nl-ga-nationaal-archief", "Kitty Bogte", "programmer-curator")
+ → "https://nde.nl/ontology/hc/person-obs/nl-ga-nationaal-archief/kitty-bogte/programmer-curator"
+ """
+ custodian = custodian_slug or "unknown-custodian"
+ person = linkedin_slug or generate_slug(person_name)
+ role = role_slug or "staff"
+
+ return f"{PERSON_OBS_PREFIX}/{custodian}/{person}/{role}"
+
+
+def generate_custodian_uri(custodian_slug: str | None, ghcid: str | None = None) -> str | None:
+ """Generate Custodian URI.
+
+ Format: https://nde.nl/ontology/hc/custodian/{ghcid-or-slug}
+ """
+ if ghcid:
+ return f"{CUSTODIAN_PREFIX}/{ghcid}"
+ elif custodian_slug:
+ return f"{CUSTODIAN_PREFIX}/{custodian_slug}"
+ return None
+
+
+def extract_linkedin_slug(linkedin_url: str | None) -> str | None:
+ """Extract slug from LinkedIn URL.
+
+ Examples:
+ "https://www.linkedin.com/in/kittybogte" → "kittybogte"
+ "https://linkedin.com/in/jane-smith-12345" → "jane-smith-12345"
+ """
+ if not linkedin_url:
+ return None
+
+ match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url)
+ return match.group(1) if match else None
+
+
+@dataclass
+class RetrievedPerson:
+    """A retrieved person/staff member with search scores and linked data URIs.
+
+    Scores are filled in by the retriever; the *_uri properties derive
+    stable linked-data identifiers from the name/LinkedIn/custodian fields
+    via the module-level generate_* helpers.
+    """
+
+    person_id: str
+    name: str
+    vector_score: float = 0.0
+    combined_score: float = 0.0
+    richness_score: float = 0.0  # Metadata richness score (0-1)
+
+    # Metadata from vector search
+    headline: str | None = None  # Job title/role
+    custodian_name: str | None = None  # Organization they work at
+    custodian_slug: str | None = None
+    location: str | None = None
+    heritage_relevant: bool = False
+    heritage_type: str | None = None  # GLAMORCUBESFIXPHDNT code
+    source_type: str | None = None  # "staff_list" or "entity_profile"
+    linkedin_url: str | None = None
+    has_wcms: bool = False  # WCMS-registered profile (heritage sector user)
+
+    # WCMS-specific fields for display on review page
+    wcms_user_id: str | None = None
+    wcms_abs_id: str | None = None  # NAN identifier
+    wcms_crm_id: str | None = None
+    wcms_username: str | None = None
+    wcms_username_url: str | None = None
+    wcms_status: str | None = None  # "Active" or "Blocked"
+    wcms_roles: list[str] | None = None
+    wcms_registered_since: str | None = None
+    wcms_last_access: str | None = None
+
+    # Contact details
+    email: str | None = None
+    email_domain: str | None = None
+
+    # Linked data fields (generated)
+    linkedin_profile_path: str | None = None  # Path to entity JSON file
+
+    @property
+    def linkedin_slug(self) -> str | None:
+        """Extract LinkedIn slug from URL (None when no URL or no match)."""
+        return extract_linkedin_slug(self.linkedin_url)
+
+    @property
+    def person_hub_uri(self) -> str:
+        """Generate Person hub URI (abstract identity)."""
+        return generate_person_hub_uri(self.name, self.linkedin_slug)
+
+    @property
+    def observation_uri(self) -> str:
+        """Generate PersonObservation URI (custodian/person/role triple)."""
+        role_slug = generate_role_slug(self.headline)
+        return generate_observation_uri(
+            self.custodian_slug,
+            self.name,
+            role_slug,
+            self.linkedin_slug
+        )
+
+    @property
+    def custodian_uri(self) -> str | None:
+        """Generate Custodian URI from the slug (no GHCID field on persons)."""
+        return generate_custodian_uri(self.custodian_slug)
+
+    def to_dict(self, include_jsonld: bool = True) -> dict[str, Any]:
+        """Convert to dictionary for API responses.
+
+        Args:
+            include_jsonld: If True, include JSON-LD linked data fields (@id, @type, etc.)
+        """
+        result = {
+            "person_id": self.person_id,
+            "name": self.name,
+            "scores": {
+                "vector": round(self.vector_score, 4),
+                "combined": round(self.combined_score, 4),
+                "richness": round(self.richness_score, 4),
+            },
+            "metadata": {
+                "headline": self.headline,
+                "custodian_name": self.custodian_name,
+                "custodian_slug": self.custodian_slug,
+                "location": self.location,
+                "heritage_relevant": self.heritage_relevant,
+                "heritage_type": self.heritage_type,
+                "source_type": self.source_type,
+                "linkedin_url": self.linkedin_url,
+                "has_wcms": self.has_wcms,
+                # WCMS fields for review page
+                "wcms_user_id": self.wcms_user_id,
+                "wcms_abs_id": self.wcms_abs_id,
+                "wcms_crm_id": self.wcms_crm_id,
+                "wcms_username": self.wcms_username,
+                "wcms_username_url": self.wcms_username_url,
+                "wcms_status": self.wcms_status,
+                "wcms_roles": self.wcms_roles,
+                "wcms_registered_since": self.wcms_registered_since,
+                "wcms_last_access": self.wcms_last_access,
+                # Contact details
+                "email": self.email,
+                "email_domain": self.email_domain,
+            }
+        }
+
+        if include_jsonld:
+            # Add JSON-LD linked data fields
+            result["@id"] = self.observation_uri
+            result["@type"] = "pico:PersonObservation"
+            result["refers_to_person"] = self.person_hub_uri
+
+            # Add custodian affiliation if available
+            if self.custodian_uri:
+                result["unit_affiliation"] = self.custodian_uri
+
+            # Add schema:sameAs for LinkedIn URL
+            if self.linkedin_url:
+                result["schema:sameAs"] = self.linkedin_url
+
+            # Add linkedin_profile_path if available
+            if self.linkedin_profile_path:
+                result["linkedin_profile_path"] = self.linkedin_profile_path
+
+        return result
+
+
+# Query type detection patterns.
+# Matched as lowercase substrings of the query by detect_query_type();
+# any hit routes the query to person search.
+PERSON_QUERY_PATTERNS = [
+    # Dutch
+    "wie werkt", "wie werk", "werken in", "werken bij", "medewerker", "personeel",
+    "staff", "werknemer", "expert", "experts", "specialist", "specialisten",
+    "directeur", "curator", "archivaris", "bibliothecaris", "conservator",
+    "team", "collega", "collegas", "mensen bij", "werkzaam",
+    # English
+    "who works", "staff at", "employees", "team at", "people at", "work at",
+    "director of", "curator at", "archivist", "librarian", "works at",
+    "experts at", "specialists", "professionals at",
+    # Generic
+    "linkedin", "person", "professional",
+]
+
+# ===================================================================
+# Dutch Province/Subdivision Code Mapping (ISO 3166-2:NL)
+# ===================================================================
+# Maps province names (lowercase, various spellings) to ISO 3166-2 codes
+# Used for filtering Qdrant queries by region
+# Qdrant payload field: "region" (stores short codes like "NH", "ZH")
+# ===================================================================
+
+# Keys must be lowercase: get_province_code() lowercases input before lookup.
+DUTCH_PROVINCE_CODES: dict[str, str] = {
+    # Noord-Holland
+    "noord-holland": "NH",
+    "noordholland": "NH",
+    "north holland": "NH",
+    "north-holland": "NH",
+    # Zuid-Holland
+    "zuid-holland": "ZH",
+    "zuidholland": "ZH",
+    "south holland": "ZH",
+    "south-holland": "ZH",
+    # Utrecht
+    "utrecht": "UT",
+    # Gelderland
+    "gelderland": "GE",
+    # Noord-Brabant
+    "noord-brabant": "NB",
+    "noordbrabant": "NB",
+    "brabant": "NB",
+    "north brabant": "NB",
+    # Limburg
+    "limburg": "LI",
+    # Overijssel
+    "overijssel": "OV",
+    # Friesland / Fryslân
+    "friesland": "FR",
+    "fryslân": "FR",
+    "fryslan": "FR",
+    # Groningen
+    "groningen": "GR",
+    # Drenthe
+    "drenthe": "DR",
+    # Flevoland
+    "flevoland": "FL",
+    # Zeeland
+    "zeeland": "ZE",
+}
+
+
+def get_province_code(province_name: str | None) -> str | None:
+ """Convert Dutch province name to ISO 3166-2 subdivision code (without country prefix).
+
+ Args:
+ province_name: Province name in Dutch or English (case-insensitive)
+
+ Returns:
+ Two-letter province code (e.g., "NH", "ZH") or None if not found
+
+ Example:
+ >>> get_province_code("Noord-Holland")
+ 'NH'
+ >>> get_province_code("south holland")
+ 'ZH'
+ >>> get_province_code("Bavaria")
+ None
+ """
+ if not province_name:
+ return None
+ return DUTCH_PROVINCE_CODES.get(province_name.lower().strip())
+
+def looks_like_person_name(query: str) -> bool:
+ """Detect if query looks like a person's name for name-boosted search.
+
+ A query looks like a person name if it:
+ - Contains 2-4 capitalized words (first/last name pattern)
+ - Does NOT contain common non-name words (institutions, locations, etc.)
+ - Does NOT contain question words (who, what, where, etc.)
+
+ Args:
+ query: Search query string
+
+ Returns:
+ True if query appears to be a person name
+
+ Examples:
+ >>> looks_like_person_name("Kitty Bogte")
+ True
+ >>> looks_like_person_name("Who works at the Rijksmuseum?")
+ False
+ >>> looks_like_person_name("archivist at Nationaal Archief")
+ False
+ """
+ # Skip if query contains question words or common phrases
+ non_name_indicators = [
+ # Question words
+ "who", "what", "where", "which", "how", "why",
+ "wie", "wat", "waar", "welk", "hoe", "waarom",
+ # Role/job indicators
+ "works at", "working at", "werkt bij", "werkzaam",
+ "archivist", "curator", "director", "librarian",
+ "archivaris", "directeur", "bibliothecaris",
+ # Prepositions indicating context
+ " at ", " in ", " of ", " for ", " the ",
+ " bij ", " in ", " van ", " voor ", " de ", " het ",
+ # Punctuation that indicates non-name queries
+ "?", "!",
+ ]
+
+ query_lower = query.lower()
+ for indicator in non_name_indicators:
+ if indicator in query_lower:
+ return False
+
+ # Check for capitalized word pattern (typical of names)
+ words = query.strip().split()
+ if len(words) < 2 or len(words) > 4:
+ return False
+
+ # Check if words look like name components (capitalized or all letters)
+ capitalized_count = sum(1 for w in words if w[0].isupper() and w.isalpha())
+
+ # Most name words should be capitalized
+ return capitalized_count >= len(words) - 1 # Allow one lowercase (e.g., "van", "de")
+
+
+def calculate_name_match_boost(query: str, name: str) -> float:
+ """Calculate a score boost for name matching.
+
+ Uses case-insensitive substring matching to boost results where
+ the query matches part or all of the person's name.
+
+ Args:
+ query: Search query (potential name)
+ name: Person's name from search result
+
+ Returns:
+ Boost factor (1.0 = no boost, >1.0 = boosted)
+ - 3.0: Exact match (case-insensitive)
+ - 2.5: Query contains full name or name contains full query
+ - 2.0: Partial match (first or last name matches)
+ - 1.0: No match
+ """
+ query_lower = query.lower().strip()
+ name_lower = name.lower().strip()
+
+ # Exact match
+ if query_lower == name_lower:
+ return 3.0
+
+ # Query is substring of name or vice versa
+ if query_lower in name_lower or name_lower in query_lower:
+ return 2.5
+
+ # Check for partial matches (first or last name)
+ query_parts = set(query_lower.split())
+ name_parts = set(name_lower.split())
+
+ # How many query parts match name parts?
+ matching_parts = query_parts & name_parts
+ if matching_parts:
+ # More matching parts = higher boost
+ match_ratio = len(matching_parts) / max(len(query_parts), len(name_parts))
+ return 1.0 + match_ratio # 1.5-2.0 range for partial matches
+
+ return 1.0 # No boost
+
+
+def detect_query_type(query: str, dspy_entity_type: str | None = None) -> str:
+ """Detect if query is about institutions or persons.
+
+ Uses DSPy LLM classification if provided, falls back to keyword heuristics.
+
+ Args:
+ query: Search query string
+ dspy_entity_type: Optional entity_type from DSPy HeritageQueryRouter
+ ("person", "institution", or "both")
+
+ Returns:
+ "person" or "institution"
+ """
+ # Prefer DSPy semantic classification when available
+ if dspy_entity_type:
+ if dspy_entity_type in ("person", "both"):
+ return "person"
+ if dspy_entity_type == "institution":
+ return "institution"
+
+ # Fallback to keyword heuristics
+ query_lower = query.lower()
+
+ for pattern in PERSON_QUERY_PATTERNS:
+ if pattern in query_lower:
+ return "person"
+
+ return "institution"
+
+
+# ===================================================================
+# Schema-Aware Filter Mapping for DSPy Heritage Query Router
+# ===================================================================
+#
+# These mappings are now loaded DYNAMICALLY from the LinkML schema files
+# via the ontology_mapping module. This ensures:
+# 1. Schema is the single source of truth (no hardcoded values)
+# 2. Multilingual support (Dutch, German, French, Spanish, etc.)
+# 3. Automatic updates when schema changes
+#
+# The ontology_mapping module extracts synonyms from YAML comments
+# and provides fuzzy matching for natural language queries.
+# ===================================================================
+
+def _get_custodian_type_mapping() -> dict[str, str]:
+    """Get custodian type to heritage code mapping from schema.
+
+    Dynamically loads from CustodianPrimaryTypeEnum in LinkML schema.
+    Falls back to minimal hardcoded mapping if schema unavailable.
+
+    Returns:
+        Dict mapping custodian type (e.g., "MUSEUM") to heritage code (e.g., "M")
+    """
+    try:
+        # Try backend.rag path first (when backend is in Python path)
+        from backend.rag.ontology_mapping import get_custodian_type_mapping
+        mapping = get_custodian_type_mapping()
+        if mapping:
+            return mapping
+    except ImportError:
+        try:
+            # Fallback: try direct import (when ontology_mapping is in sys.path)
+            from ontology_mapping import get_custodian_type_mapping  # type: ignore[import-not-found]
+            mapping = get_custodian_type_mapping()
+            if mapping:
+                return mapping
+        except ImportError:
+            logger.warning("ontology_mapping not available, using fallback mapping")
+        # NOTE(review): non-ImportError failures raised inside this fallback
+        # handler are NOT caught by the `except Exception` below (sibling
+        # handlers do not cover each other) — they propagate to the caller.
+    except Exception as e:
+        # Catches non-import failures from the primary path only (e.g. an
+        # error inside the schema-driven get_custodian_type_mapping()).
+        logger.warning(f"Failed to load custodian type mapping from schema: {e}")
+
+    # Fallback: minimal GLAMORCUBESFIXPHDNT mapping
+    return {
+        "GALLERY": "G", "LIBRARY": "L", "ARCHIVE": "A", "MUSEUM": "M",
+        "OFFICIAL_INSTITUTION": "O", "RESEARCH_CENTER": "R", "CORPORATION": "C",
+        "UNKNOWN": "U", "BIO_CUSTODIAN": "B", "EDUCATION_PROVIDER": "E",
+        "COLLECTING_SOCIETY": "S", "FEATURE": "F", "INTANGIBLE_HERITAGE_GROUP": "I",
+        "MIXED": "X", "PERSONAL_COLLECTION": "P", "HOLY_SITE": "H",
+        "DIGITAL_PLATFORM": "D", "NGO": "N", "TASTE_SMELL_HERITAGE": "T",
+    }
+
+
+def _get_role_category_keywords() -> dict[str, list[str]]:
+    """Get role category keywords from schema.
+
+    Dynamically loads from RoleCategoryEnum in LinkML schema.
+    Falls back to hardcoded keywords if schema unavailable.
+
+    Returns:
+        Dict mapping role category (e.g., "CURATORIAL") to keywords list
+    """
+    try:
+        # Try backend.rag path first (when backend is in Python path)
+        from backend.rag.ontology_mapping import get_role_keywords
+        keywords = get_role_keywords()
+        if keywords:
+            return keywords
+    except ImportError:
+        try:
+            # Fallback: try direct import (when ontology_mapping is in sys.path)
+            from ontology_mapping import get_role_keywords  # type: ignore[import-not-found]
+            keywords = get_role_keywords()
+            if keywords:
+                return keywords
+        except ImportError:
+            logger.warning("ontology_mapping not available, using fallback role keywords")
+        # NOTE(review): as in _get_custodian_type_mapping, non-ImportError
+        # failures inside this handler bypass the `except Exception` below.
+    except Exception as e:
+        # Catches non-import failures from the primary path only.
+        logger.warning(f"Failed to load role keywords from schema: {e}")
+
+    # Fallback: essential role category keywords (hardcoded)
+    return {
+        "CURATORIAL": [
+            "curator", "curatorial", "collectie", "collection", "tentoonstellingen",
+            "exhibitions", "acquisitions", "registrar", "museum professional"
+        ],
+        "CONSERVATION": [
+            "conservator", "conservation", "restaurator", "restoration", "preservatie",
+            "preservation", "materiaal", "material", "preventive"
+        ],
+        "ARCHIVAL": [
+            "archivist", "archivaris", "archief", "archive", "records", "documentalist",
+            "erfgoed", "heritage records", "acquisitie", "beschrijving"
+        ],
+        "LIBRARY": [
+            "bibliothecaris", "librarian", "bibliotheek", "library", "catalogus",
+            "cataloging", "metadata", "special collections", "reference"
+        ],
+        "DIGITAL": [
+            "digital", "digitaal", "developer", "data", "software", "IT", "tech",
+            "engineer", "digitalisering", "digitization", "web", "database"
+        ],
+        "EDUCATION": [
+            "educatie", "education", "learning", "museum educator", "outreach",
+            "public programs", "docent", "teacher", "rondleiding", "guide"
+        ],
+        "GOVERNANCE": [
+            "bestuur", "board", "governance", "trustee", "raad", "council",
+            "advisory", "commissie", "committee"
+        ],
+        "LEADERSHIP": [
+            "director", "directeur", "manager", "head of", "hoofd", "chief",
+            "CEO", "president", "leider", "leadership"
+        ],
+        "RESEARCH": [
+            "onderzoek", "research", "researcher", "wetenschapper", "scientist",
+            "academic", "scholar", "fellow", "postdoc", "PhD"
+        ],
+        "TECHNICAL": [
+            "technical", "technisch", "facilities", "installation", "AV",
+            "audiovisual", "lighting", "security", "beveiliging"
+        ],
+        "SUPPORT": [
+            "support", "admin", "administratie", "office", "HR", "finance",
+            "marketing", "communications", "front desk", "visitor services"
+        ],
+        "CREATIVE": [
+            "design", "ontwerp", "creative", "graphic", "exhibition design",
+            "multimedia", "artist", "kunstenaar", "visual"
+        ],
+        "EXTERNAL": [
+            "volunteer", "vrijwilliger", "intern", "stagiair", "consultant",
+            "advisor", "external", "contractor", "freelance"
+        ],
+    }
+
+
+# Lazy-loaded module-level caches (populated on first access).
+# NOTE(review): population is not thread-safe; worst case two threads both
+# run the loader and the results are identical, so this is benign.
+_CUSTODIAN_TYPE_MAPPING: dict[str, str] | None = None
+_ROLE_CATEGORY_KEYWORDS: dict[str, list[str]] | None = None
+
+
+def get_custodian_type_to_heritage_code() -> dict[str, str]:
+    """Get cached custodian type to heritage code mapping (loaded once)."""
+    global _CUSTODIAN_TYPE_MAPPING
+    if _CUSTODIAN_TYPE_MAPPING is None:
+        _CUSTODIAN_TYPE_MAPPING = _get_custodian_type_mapping()
+    return _CUSTODIAN_TYPE_MAPPING
+
+
+def get_role_category_keywords() -> dict[str, list[str]]:
+    """Get cached role category keywords (loaded once)."""
+    global _ROLE_CATEGORY_KEYWORDS
+    if _ROLE_CATEGORY_KEYWORDS is None:
+        _ROLE_CATEGORY_KEYWORDS = _get_role_category_keywords()
+    return _ROLE_CATEGORY_KEYWORDS
+
+
+def build_schema_aware_person_filter(
+ heritage_type_code: str | None = None,
+ heritage_relevant_only: bool = False,
+ custodian_slug: str | None = None,
+ only_wcms: bool = False,
+) -> dict[str, Any] | None:
+ """Build Qdrant filter conditions for schema-aware person search.
+
+ Args:
+ heritage_type_code: Single-letter heritage type code (M, A, L, etc.)
+ heritage_relevant_only: Only return heritage-relevant staff
+ custodian_slug: Filter by specific custodian
+ only_wcms: Only return WCMS-registered profiles (heritage sector users)
+
+ Returns:
+ Dict of filter conditions for Qdrant, or None if no filters
+ """
+ filters: dict[str, Any] = {}
+
+ if heritage_type_code and heritage_type_code not in ("U", "UNKNOWN", "UNSPECIFIED"):
+ filters["heritage_type"] = heritage_type_code
+
+ if heritage_relevant_only:
+ filters["heritage_relevant"] = True
+
+ if custodian_slug:
+ filters["custodian_slug"] = custodian_slug
+
+ if only_wcms:
+ filters["has_wcms"] = True
+
+ return filters if filters else None
+
+
+def filter_by_role_category_keywords(
+ results: list["RetrievedPerson"],
+ role_category: str | None,
+) -> list["RetrievedPerson"]:
+ """Post-filter search results by role category using headline keywords.
+
+ Since role_category is not indexed in Qdrant, we use headline keyword matching
+ to filter results after vector search.
+
+ Args:
+ results: List of RetrievedPerson from vector search
+ role_category: Target role category (CURATORIAL, ARCHIVAL, etc.)
+
+ Returns:
+ Filtered list of RetrievedPerson matching the role category
+ """
+ if not role_category or role_category in ("UNKNOWN", "UNSPECIFIED"):
+ return results
+
+ keywords = get_role_category_keywords().get(role_category, [])
+ if not keywords:
+ return results
+
+ filtered = []
+ for person in results:
+ headline = (person.headline or "").lower()
+ # Check if any keyword matches the headline
+ if any(kw.lower() in headline for kw in keywords):
+ filtered.append(person)
+
+ # If filtering removed all results, return original (don't be too strict)
+ if not filtered:
+ logger.info(f"Role category filter '{role_category}' removed all results, returning unfiltered")
+ return results
+
+ logger.info(f"Role category filter '{role_category}' reduced results from {len(results)} to {len(filtered)}")
+ return filtered
+
+
+def get_heritage_type_code(custodian_type: str | None) -> str | None:
+ """Convert CustodianPrimaryTypeEnum value to single-letter heritage code.
+
+ Args:
+ custodian_type: Custodian type from DSPy router (e.g., "MUSEUM", "ARCHIVE")
+
+ Returns:
+ Single-letter heritage code (e.g., "M", "A") or None if not mappable
+ """
+ if not custodian_type or custodian_type in ("UNKNOWN", "UNSPECIFIED"):
+ return None
+ return get_custodian_type_to_heritage_code().get(custodian_type)
+
+
+class SPARQLClient:
+ """Client for querying Oxigraph SPARQL endpoint."""
+
+ def __init__(
+ self,
+ endpoint: str = DEFAULT_SPARQL_ENDPOINT,
+ timeout: float = DEFAULT_SPARQL_TIMEOUT,
+ max_connections: int = 20 # Allow concurrent connections for parallel queries
+ ):
+ self.endpoint = endpoint
+ self.timeout = timeout
+ self.max_connections = max_connections
+ self._client: httpx.Client | None = None
+
+ @property
+ def client(self) -> httpx.Client:
+ """Lazy-initialize HTTP client with connection pooling."""
+ if self._client is None:
+ # Configure connection pool for parallel SPARQL queries
+ limits = httpx.Limits(
+ max_keepalive_connections=self.max_connections,
+ max_connections=self.max_connections,
+ keepalive_expiry=30.0 # Keep connections alive for reuse
+ )
+ self._client = httpx.Client(
+ timeout=self.timeout,
+ limits=limits,
+ http2=False # HTTP/1.1 is often faster for small queries
+ )
+ return self._client
+
+ def query(self, sparql: str, log_timing: bool = False) -> list[dict[str, Any]]:
+ """Execute SPARQL query and return results.
+
+ Args:
+ sparql: SPARQL query string
+ log_timing: Whether to log query execution time
+
+ Returns:
+ List of result bindings as dictionaries
+ """
+ full_query = SPARQL_PREFIXES + sparql
+ start_time = time.time() if log_timing else 0
+
+ try:
+ response = self.client.post(
+ self.endpoint,
+ data={"query": full_query},
+ headers={"Accept": "application/sparql-results+json"}
+ )
+ response.raise_for_status()
+
+ data = response.json()
+ bindings = data.get("results", {}).get("bindings", [])
+
+ # Convert bindings to simple dicts
+ results = []
+ for binding in bindings:
+ row = {}
+ for key, value in binding.items():
+ row[key] = value.get("value", "")
+ results.append(row)
+
+ if log_timing:
+ duration_ms = (time.time() - start_time) * 1000
+ logger.debug(f"SPARQL query completed: {len(results)} results in {duration_ms:.0f}ms")
+
+ return results
+
+ except httpx.HTTPError as e:
+ logger.error(f"SPARQL query failed: {e}")
+ return []
+ except Exception as e:
+ logger.error(f"Unexpected error in SPARQL query: {e}")
+ return []
+
+ def close(self) -> None:
+ """Close the HTTP client."""
+ if self._client:
+ self._client.close()
+ self._client = None
+
+
+class HybridRetriever:
+ """Hybrid retriever combining vector search with knowledge graph expansion.
+
+ The retrieval process:
+ 1. Vector search finds semantically similar institutions
+ 2. For each result, SPARQL expands to find related institutions:
+ - Institutions in the same city
+ - Institutions of the same type
+ - Institutions with related collections
+ 3. Results are re-ranked based on combined vector + graph scores
+
+ Embedding Models:
+ - If OpenAI API key is available AND collection uses 1536-dim vectors: use OpenAI
+ - Otherwise: use sentence-transformers (all-MiniLM-L6-v2, 384-dim)
+
+ Multi-Embedding Support:
+ Set use_multi_embedding=True to enable support for multiple embedding models
+ via Qdrant's named vectors feature. This allows:
+ - A/B testing different embedding models
+ - Seamless migration between models
+ - Specifying which model to use per query
+
+ Args:
+ qdrant_host: Qdrant server hostname
+ qdrant_port: Qdrant REST API port
+ sparql_endpoint: Oxigraph SPARQL endpoint URL
+ vector_weight: Weight for vector similarity scores (0-1)
+ graph_weight: Weight for graph expansion scores (0-1)
+ collection_name: Qdrant collection name
+ embedding_model: Embedding model name (auto-detected if not specified)
+ k_vector: Number of initial vector search results
+ k_expand: Number of graph expansion results per seed
+ k_final: Final number of results to return
+ use_multi_embedding: Enable multi-embedding mode with named vectors
+ preferred_embedding_model: Preferred model for multi-embedding mode
+ """
+
+ # Class-level type annotations for instance attributes
+ qdrant_host: str
+ qdrant_port: int
+ sparql_endpoint: str
+ vector_weight: float
+ graph_weight: float
+ collection_name: str
+ k_vector: int
+ k_expand: int
+ k_final: int
+ openai_api_key: str | None
+ use_production_qdrant: bool
+ use_multi_embedding: bool
+ preferred_embedding_model: str | None
+ sparql_client: "SPARQLClient"
+ embedding_model: str
+
+ # Private attributes with lazy initialization
+ _qdrant_client: "QdrantClient | None"
+ _openai_client: "OpenAI | None"
+ _st_model: "SentenceTransformer | None"
+ _use_sentence_transformers: bool
+ _collection_vector_size: int | None
+ _multi_retriever: "MultiEmbeddingRetriever | None"
+ _selected_multi_model: "EmbeddingModel | None"
+
+ def __init__(
+ self,
+ qdrant_host: str = "localhost",
+ qdrant_port: int = 6333,
+ sparql_endpoint: str = DEFAULT_SPARQL_ENDPOINT,
+ vector_weight: float = 0.7,
+ graph_weight: float = 0.3,
+ collection_name: str = "heritage_custodians",
+ embedding_model: str | None = None, # Auto-detect if None
+ k_vector: int = 10,
+ k_expand: int = 5,
+ k_final: int = 10,
+ openai_api_key: str | None = None,
+ use_production_qdrant: bool = False,
+ use_multi_embedding: bool = False,
+ preferred_embedding_model: str | None = None,
+ ):
+ self.qdrant_host = qdrant_host
+ self.qdrant_port = qdrant_port
+ self.sparql_endpoint = sparql_endpoint
+ self.vector_weight = vector_weight
+ self.graph_weight = graph_weight
+ self.collection_name = collection_name
+ self.k_vector = k_vector
+ self.k_expand = k_expand
+ self.k_final = k_final
+ self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
+ self.use_production_qdrant = use_production_qdrant
+ self.use_multi_embedding = use_multi_embedding
+ self.preferred_embedding_model = preferred_embedding_model
+
+ # Initialize SPARQL client
+ self.sparql_client = SPARQLClient(endpoint=sparql_endpoint)
+
+ # Lazy-load Qdrant, OpenAI, and sentence-transformers clients
+ self._qdrant_client = None
+ self._openai_client = None
+ self._st_model = None
+ self._use_sentence_transformers = False
+ self._collection_vector_size: int | None = None
+
+ # Multi-embedding retriever (lazy-loaded)
+ self._multi_retriever = None
+
+ # Currently selected multi-embedding model (for multi-embedding mode)
+ self._selected_multi_model = None
+
+ # Determine embedding model to use
+ self.embedding_model = embedding_model or self._auto_detect_embedding_model()
+
+ logger.info(
+ f"Initialized HybridRetriever: "
+ f"Qdrant={qdrant_host}:{qdrant_port}, "
+ f"SPARQL={sparql_endpoint}, "
+ f"embedding_model={self.embedding_model}, "
+ f"multi_embedding={use_multi_embedding}, "
+ f"weights=vector:{vector_weight}/graph:{graph_weight}"
+ )
+
    @property
    def qdrant_client(self) -> "QdrantClient":
        """Lazy-load Qdrant client.

        Returns a cached client on subsequent accesses. The import is
        deferred so the module can be loaded without qdrant-client installed.
        """
        if self._qdrant_client is None:
            from qdrant_client import QdrantClient

            if self.use_production_qdrant:
                # Connect via HTTPS to production.
                # NOTE(review): host/port/prefix are hard-coded here and
                # ignore qdrant_host/qdrant_port — consider making these
                # configurable instead of baked-in.
                self._qdrant_client = QdrantClient(
                    host="bronhouder.nl",
                    port=443,
                    https=True,
                    prefix="qdrant",
                    prefer_grpc=False,
                    timeout=30
                )
            else:
                # Local/default deployment uses the configured host/port.
                self._qdrant_client = QdrantClient(
                    host=self.qdrant_host,
                    port=self.qdrant_port
                )
        return self._qdrant_client
+
+ @property
+ def openai_client(self) -> "OpenAI":
+ """Lazy-load OpenAI client."""
+ if self._openai_client is None:
+ if not self.openai_api_key:
+ raise RuntimeError(
+ "OpenAI API key not available. Set OPENAI_API_KEY or use sentence-transformers."
+ )
+ import openai
+ self._openai_client = openai.OpenAI(api_key=self.openai_api_key)
+ return self._openai_client
+
+ def _get_collection_vector_size(self) -> int | None:
+ """Get the vector size of the Qdrant collection."""
+ try:
+ info = self.qdrant_client.get_collection(self.collection_name)
+ if hasattr(info.config.params, 'vectors'):
+ vectors_config = info.config.params.vectors
+ if isinstance(vectors_config, dict):
+ # Named vectors
+ first_config = next(iter(vectors_config.values()), None)
+ return first_config.size if first_config else None
+ elif vectors_config is not None:
+ # Single vector config
+ return vectors_config.size
+ return None
+ except Exception as e:
+ logger.warning(f"Could not get collection vector size: {e}")
+ return None
+
+ def _auto_detect_embedding_model(self) -> str:
+ """Auto-detect which embedding model to use based on collection and available APIs.
+
+ Detection priority:
+ 1. Check main collection (heritage_custodians) vector size
+ 2. If main collection doesn't exist, check heritage_persons collection
+ 3. If OpenAI key available and collection uses 1536-dim, use OpenAI
+ 4. Otherwise use sentence-transformers (384-dim, all-MiniLM-L6-v2)
+ """
+ # Check main collection vector size first
+ vector_size = self._get_collection_vector_size()
+ self._collection_vector_size = vector_size
+
+ # If main collection doesn't exist, try heritage_persons collection
+ if vector_size is None:
+ logger.info(f"Collection '{self.collection_name}' not found, checking heritage_persons")
+ person_vector_size = self._get_person_collection_vector_size()
+ if person_vector_size:
+ vector_size = person_vector_size
+ logger.info(f"Using heritage_persons collection vector size: {vector_size}")
+
+ if vector_size == 384:
+ # Collection uses sentence-transformers dimensions
+ self._use_sentence_transformers = True
+ logger.info("Auto-detected 384-dim vectors, using sentence-transformers")
+ return "all-MiniLM-L6-v2"
+ elif vector_size == 1536 and self.openai_api_key:
+ # Collection uses OpenAI dimensions and we have API key
+ self._use_sentence_transformers = False
+ logger.info("Auto-detected 1536-dim vectors with OpenAI key, using OpenAI")
+ return "text-embedding-3-small"
+ elif self.openai_api_key:
+ # Default to OpenAI if we have key
+ self._use_sentence_transformers = False
+ return "text-embedding-3-small"
+ else:
+ # Fallback to sentence-transformers
+ self._use_sentence_transformers = True
+ logger.info("No OpenAI key, falling back to sentence-transformers")
+ return "all-MiniLM-L6-v2"
+
+ def _load_sentence_transformer(self) -> "SentenceTransformer":
+ """Lazy-load sentence-transformers model."""
+ if self._st_model is None:
+ try:
+ from sentence_transformers import SentenceTransformer
+ self._st_model = SentenceTransformer(self.embedding_model)
+ logger.info(f"Loaded sentence-transformers model: {self.embedding_model}")
+ except ImportError:
+ raise RuntimeError("sentence-transformers not installed. Run: pip install sentence-transformers")
+ return self._st_model
+
+ @property
+ def multi_retriever(self) -> "MultiEmbeddingRetriever | None":
+ """Lazy-load MultiEmbeddingRetriever when multi-embedding mode is enabled.
+
+ Returns:
+ MultiEmbeddingRetriever instance or None if not in multi-embedding mode
+ """
+ if not self.use_multi_embedding:
+ return None
+
+ if self._multi_retriever is None:
+ from glam_extractor.api.multi_embedding_retriever import (
+ MultiEmbeddingRetriever,
+ MultiEmbeddingConfig,
+ EmbeddingModel,
+ )
+
+ # Create config matching current settings
+ config = MultiEmbeddingConfig(
+ qdrant_host=self.qdrant_host,
+ qdrant_port=self.qdrant_port,
+ qdrant_https=self.use_production_qdrant,
+ qdrant_prefix="qdrant" if self.use_production_qdrant else None,
+ openai_api_key=self.openai_api_key,
+ institutions_collection=self.collection_name,
+ )
+
+ self._multi_retriever = MultiEmbeddingRetriever(config)
+
+ # Auto-select model if not specified
+ if self.preferred_embedding_model:
+ try:
+ self._selected_multi_model = EmbeddingModel(self.preferred_embedding_model)
+ except ValueError:
+ logger.warning(f"Unknown embedding model: {self.preferred_embedding_model}")
+ assert self._multi_retriever is not None # Set above
+ self._selected_multi_model = self._multi_retriever.select_model(self.collection_name)
+ else:
+ assert self._multi_retriever is not None # Set above
+ self._selected_multi_model = self._multi_retriever.select_model(self.collection_name)
+
+ logger.info(f"MultiEmbeddingRetriever initialized, selected model: {self._selected_multi_model}")
+
+ return self._multi_retriever
+
    def _get_embedding(self, text: str, using: str | None = None) -> list[float]:
        """Get embedding vector for text using the appropriate model.

        Args:
            text: Text to embed
            using: Optional embedding model name (for multi-embedding mode)

        Returns:
            Embedding vector as list of floats
        """
        # If multi-embedding mode, delegate to MultiEmbeddingRetriever
        if self.use_multi_embedding and self.multi_retriever:
            from glam_extractor.api.multi_embedding_retriever import EmbeddingModel

            # Determine which model to use: an explicit 'using' override
            # wins if it parses; otherwise the model selected at init.
            if using:
                try:
                    model = EmbeddingModel(using)
                except ValueError:
                    logger.warning(f"Unknown model '{using}', using default")
                    model = self._selected_multi_model
            else:
                model = self._selected_multi_model

            if model:
                return self.multi_retriever.get_embedding(text, model)
            else:
                # Fallback to legacy mode — note: deliberately falls
                # through to the single-model path below.
                logger.warning("No multi-embedding model available, falling back to legacy")

        # Legacy single-model embedding: local model or OpenAI API,
        # depending on the flag set during auto-detection.
        if self._use_sentence_transformers:
            model = self._load_sentence_transformer()
            embedding = model.encode(text)
            return embedding.tolist()
        else:
            response = self.openai_client.embeddings.create(
                input=text,
                model=self.embedding_model
            )
            return response.data[0].embedding
+
    def _vector_search(
        self,
        query: str,
        k: int,
        using: str | None = None,
        region_codes: list[str] | None = None,
        cities: list[str] | None = None,
        institution_types: list[str] | None = None,
        use_polygon_filter: bool = True,
    ) -> list[RetrievedInstitution]:
        """Perform vector similarity search in Qdrant.

        Args:
            query: Search query text
            k: Number of results to retrieve
            using: Optional embedding model name (for multi-embedding mode)
            region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
            cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"])
            institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"])
            use_polygon_filter: If True, apply polygon-based geographic filtering
                using actual province boundaries (default: True)

        Returns:
            List of RetrievedInstitution with vector scores
        """
        query_vector = self._get_embedding(query, using=using)

        # When polygon filtering is enabled and regions are specified,
        # over-fetch to ensure we have enough results after polygon filtering
        effective_limit = k
        if use_polygon_filter and region_codes:
            effective_limit = k * 3  # Over-fetch 3x for polygon filtering
            logger.debug(f"Over-fetching {effective_limit} results for polygon filtering")

        # Build query parameters
        search_params = {
            "collection_name": self.collection_name,
            "query": query_vector,
            "limit": effective_limit,
            "with_payload": True,
        }

        # Build geographic/type filter if any criteria provided
        # NOTE: Always apply region metadata filter to Qdrant first to get relevant results.
        # The polygon filter (if enabled) is an additional precision filter applied afterward.
        # Previously we disabled metadata region filter when polygon filter was enabled,
        # but this caused vector search to return results from wrong regions.
        if region_codes or cities or institution_types:
            from glam_extractor.ontology.qdrant_filters import QdrantFilterBuilder

            # Convert institution types from full names (LIBRARY, MUSEUM) to single-letter codes (L, M)
            # because Qdrant stores institution_type as single-letter codes per GLAMORCUBESFIXPHDNT
            type_codes = None
            if institution_types:
                type_mapping = get_custodian_type_to_heritage_code()
                type_codes = [type_mapping.get(t, t) for t in institution_types]
                # Filter out any that didn't map (keep original if 1 char already)
                type_codes = [c for c in type_codes if c and len(c) == 1]
                logger.debug(f"Converted institution types: {institution_types} -> {type_codes}")

            builder = QdrantFilterBuilder()
            filter_dict = builder.combined_filter(
                primary_types=type_codes,  # Use single-letter codes
                region_codes=region_codes,  # Always apply region filter to get relevant results
                cities=cities,
                combine_mode="must",
            )
            if filter_dict:
                query_filter = QdrantFilterBuilder.to_qdrant_models(filter_dict)
                search_params["query_filter"] = query_filter
                logger.info(
                    f"Applied Qdrant filter: types={type_codes}, "
                    f"regions={region_codes}, cities={cities}"
                )

        # Add named vector 'using' ONLY if collection actually has named vectors
        # Single-vector collections will error with "Not existing vector name" otherwise
        if self.use_multi_embedding and self.multi_retriever:
            uses_named = self.multi_retriever.uses_named_vectors(self.collection_name)
            if uses_named:
                if using:
                    search_params["using"] = using
                elif self._selected_multi_model:
                    search_params["using"] = self._selected_multi_model.value
            # else: single-vector collection, don't add 'using' parameter

        results = self.qdrant_client.query_points(**search_params)

        # Map Qdrant points to RetrievedInstitution records; missing payload
        # fields default to empty strings / None.
        institutions = []
        for point in results.points:
            payload = point.payload or {}

            inst = RetrievedInstitution(
                ghcid=payload.get("ghcid", ""),
                name=payload.get("name", ""),
                uri=payload.get("uri", f"https://nde.nl/ontology/hc/custodian/{payload.get('ghcid', '')}"),
                vector_score=point.score,
                institution_type=payload.get("institution_type"),
                country=payload.get("country"),
                city=payload.get("city"),
                # Truncate long descriptions to keep responses compact.
                description=payload.get("text", "")[:200] if payload.get("text") else None,
                latitude=payload.get("latitude"),
                longitude=payload.get("longitude"),
            )
            institutions.append(inst)

        # Apply polygon-based geographic filtering if enabled and regions specified
        if use_polygon_filter and region_codes and institutions:
            institutions = self._apply_polygon_filter(institutions, region_codes, k)

        return institutions
+
    def _apply_polygon_filter(
        self,
        institutions: list[RetrievedInstitution],
        region_codes: list[str],
        k: int,
    ) -> list[RetrievedInstitution]:
        """Filter institutions by polygon containment in specified regions.

        Uses actual province boundary polygons to ensure results are
        geographically within the requested regions, not just metadata matching.

        Args:
            institutions: List of retrieved institutions with lat/lon
            region_codes: List of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
            k: Maximum number of results to return

        Returns:
            Filtered list of institutions within the specified regions
        """
        polygon_filter = get_polygon_filter()

        # Handle case where polygon filter module is not available or not loaded;
        # in both cases fall back to the (metadata-filtered) input, capped at k.
        if polygon_filter is None:
            logger.warning("Polygon filter not available, skipping geographic filtering")
            return institutions[:k]

        if not polygon_filter.is_loaded:
            logger.warning("Polygon filter not loaded, skipping geographic filtering")
            return institutions[:k]

        filtered = []
        for inst in institutions:
            if inst.latitude is None or inst.longitude is None:
                # No coordinates, check if metadata region matches.
                # NOTE: institutions without coordinates outside NL are
                # silently dropped here.
                if inst.country == "NL":
                    # For Dutch institutions without coords, fallback to metadata
                    # Extract region from GHCID (format: NL-{REGION}-...)
                    if inst.ghcid and len(inst.ghcid) > 3:
                        ghcid_region = inst.ghcid.split("-")[1] if "-" in inst.ghcid else None
                        if ghcid_region and ghcid_region.upper() in [r.upper() for r in region_codes]:
                            filtered.append(inst)
                continue

            # Check if point is within any of the requested regions
            for region_code in region_codes:
                if polygon_filter.point_in_province(inst.latitude, inst.longitude, region_code):
                    filtered.append(inst)
                    break  # Don't add same institution multiple times

        logger.info(
            f"Polygon filter: {len(filtered)}/{len(institutions)} institutions "
            f"in regions {region_codes}"
        )

        # Return up to k results
        return filtered[:k]
+
    def _build_batched_expansion_query(
        self,
        seed_institutions: list[RetrievedInstitution],
        exclude_ghcids: set[str],
        limit_per_expansion: int = 5
    ) -> tuple[str, dict[str, dict]]:
        """Build a single SPARQL query with UNION clauses for all expansions.

        DEDUPLICATES by city code and type+country to avoid redundant query patterns.
        For example, if 5 seeds are all from Amsterdam with type MUSEUM, we only
        create ONE city expansion (for AMS) and ONE type expansion (for NL + M),
        not 10 redundant UNIONs.

        Args:
            seed_institutions: Seed institutions to expand from
            exclude_ghcids: GHCIDs to exclude from results
            limit_per_expansion: Max results per expansion type

        Returns:
            Tuple of (SPARQL query string, expansion_metadata dict)
            expansion_metadata maps expansion_key -> {seed, type, city/type_code}
            The query string is "" when no expansions could be built.
        """
        unions = []
        expansion_metadata = {}

        # Track unique patterns to avoid duplicate queries
        seen_city_codes: set[str] = set()
        seen_type_patterns: set[str] = set()  # "country-type_code" pattern

        # Only the top 5 seeds are expanded to bound query size.
        seeds_to_expand = seed_institutions[:5]
        city_idx = 0
        type_idx = 0

        for seed in seeds_to_expand:
            # City expansion - deduplicate by city code
            if seed.city:
                # First three letters of the city name, as embedded in GHCIDs.
                city_code = seed.city[:3].upper()
                if city_code not in seen_city_codes:
                    seen_city_codes.add(city_code)
                    expansion_key = f"city_{city_idx}"
                    city_idx += 1
                    # The literal expansion_key is projected out of each
                    # sub-select so results can be re-grouped afterwards.
                    # LIMIT is padded by len(exclude_ghcids) because excluded
                    # rows are filtered client-side after the query returns.
                    unions.append(f"""
            {{
                SELECT ?s ?name ?ghcid ?type ("{expansion_key}" AS ?expansion_key) WHERE {{
                    ?s a hcc:Custodian ;
                       skos:prefLabel ?name ;
                       hc:ghcid ?ghcid .
                    FILTER(CONTAINS(?ghcid, "-{city_code}-"))
                    OPTIONAL {{ ?s hc:institutionType ?type }}
                }}
                LIMIT {limit_per_expansion + len(exclude_ghcids)}
            }}
            """)
                    expansion_metadata[expansion_key] = {
                        "seed": seed,
                        "type": "city",
                        "city": seed.city,
                        "city_code": city_code
                    }

            # Type expansion - deduplicate by country + type_code pattern
            if seed.institution_type and seed.country:
                type_code = get_custodian_type_to_heritage_code().get(seed.institution_type, "")
                if type_code:
                    pattern_key = f"{seed.country}-{type_code}"
                    if pattern_key not in seen_type_patterns:
                        seen_type_patterns.add(pattern_key)
                        expansion_key = f"type_{type_idx}"
                        type_idx += 1
                        unions.append(f"""
            {{
                SELECT ?s ?name ?ghcid ?city ("{expansion_key}" AS ?expansion_key) WHERE {{
                    ?s a hcc:Custodian ;
                       skos:prefLabel ?name ;
                       hc:ghcid ?ghcid .
                    FILTER(STRSTARTS(?ghcid, "{seed.country}-"))
                    FILTER(CONTAINS(?ghcid, "-{type_code}-"))
                    OPTIONAL {{ ?s schema:location ?city }}
                }}
                LIMIT {limit_per_expansion + len(exclude_ghcids)}
            }}
            """)
                        expansion_metadata[expansion_key] = {
                            "seed": seed,
                            "type": "type",
                            "institution_type": seed.institution_type,
                            "type_code": type_code,
                            "country": seed.country
                        }

        if not unions:
            return "", {}

        # Log deduplication stats
        logger.info(f"Batched SPARQL: {len(unions)} UNIONs (deduplicated from max {len(seeds_to_expand) * 2}). "
                   f"Unique cities: {seen_city_codes}, Unique types: {seen_type_patterns}")

        # Combine all unions into a single query; unbound ?type/?city
        # columns are simply absent in the respective UNION branches.
        query = f"""
        SELECT ?s ?name ?ghcid ?type ?city ?expansion_key WHERE {{
            {" UNION ".join(unions)}
        }}
        """

        return query, expansion_metadata
+
+ def _graph_expand_batched(
+ self,
+ seed_institutions: list[RetrievedInstitution]
+ ) -> list[RetrievedInstitution]:
+ """Expand seed results using a SINGLE batched SPARQL query.
+
+ This is a significant optimization over the parallel ThreadPoolExecutor
+ approach. Instead of 10 HTTP requests (even in parallel), we execute
+ ONE SPARQL query with UNION clauses.
+
+ Performance comparison:
+ - Sequential: 10 queries × ~100ms = 4+ seconds
+ - Parallel (ThreadPool): ~500ms-1s (limited by GIL/connection pool)
+ - Batched (this method): ONE query ~150-300ms
+
+ Args:
+ seed_institutions: Initial vector search results
+
+ Returns:
+ Additional institutions found via graph expansion
+ """
+ start_time = time.time()
+ exclude_ghcids = {inst.ghcid for inst in seed_institutions}
+ expanded = []
+ seen_ghcids = set(exclude_ghcids)
+
+ # Build batched query
+ query, expansion_metadata = self._build_batched_expansion_query(
+ seed_institutions, exclude_ghcids, limit_per_expansion=self.k_expand
+ )
+
+ if not query:
+ logger.debug("No graph expansion tasks to execute")
+ return expanded
+
+ # Execute single batched query
+ query_start = time.time()
+ results = self.sparql_client.query(query)
+ query_duration = (time.time() - query_start) * 1000
+
+ logger.debug(f"Batched SPARQL query: {len(results)} raw results in {query_duration:.0f}ms")
+
+ # Group results by expansion_key
+ results_by_expansion: dict[str, list[dict]] = {}
+ for row in results:
+ exp_key = row.get("expansion_key", "")
+ if exp_key:
+ if exp_key not in results_by_expansion:
+ results_by_expansion[exp_key] = []
+ results_by_expansion[exp_key].append(row)
+
+ # Process results, filtering and creating RetrievedInstitution objects
+ for exp_key, rows in results_by_expansion.items():
+ if exp_key not in expansion_metadata:
+ continue
+
+ meta = expansion_metadata[exp_key]
+ seed = meta["seed"]
+ exp_type = meta["type"]
+
+ count = 0
+ for row in rows:
+ ghcid = row.get("ghcid", "")
+ if not ghcid or ghcid in seen_ghcids:
+ continue
+
+ if count >= self.k_expand:
+ break
+
+ seen_ghcids.add(ghcid)
+ count += 1
+
+ if exp_type == "city":
+ expanded.append(RetrievedInstitution(
+ ghcid=ghcid,
+ name=row.get("name", ""),
+ uri=row.get("s", ""),
+ graph_score=0.8, # High score for same city
+ institution_type=row.get("type"),
+ expansion_reason="same_city",
+ related_institutions=[seed.ghcid]
+ ))
+ elif exp_type == "type":
+ expanded.append(RetrievedInstitution(
+ ghcid=ghcid,
+ name=row.get("name", ""),
+ uri=row.get("s", ""),
+ graph_score=0.5, # Medium score for same type
+ institution_type=seed.institution_type,
+ city=row.get("city"),
+ expansion_reason="same_type",
+ related_institutions=[seed.ghcid]
+ ))
+
+ logger.debug(f"Expansion {exp_key}: {count} results for {seed.ghcid}")
+
+ total_time = (time.time() - start_time) * 1000
+ logger.info(f"Graph expansion (batched): 1 query, {len(results)} raw results, "
+ f"{len(expanded)} expanded in {total_time:.0f}ms")
+
+ return expanded
+
+ def _expand_by_city(self, city: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]:
+ """Find other institutions in the same city via SPARQL.
+
+ Note: This method is kept for backwards compatibility and direct calls.
+ For batch operations, use _graph_expand_batched() instead.
+
+ Args:
+ city: City name to search for
+ exclude_ghcids: GHCIDs to exclude from results
+ limit: Maximum number of results
+
+ Returns:
+ List of institution data dicts
+ """
+ if not city:
+ return []
+
+ query = f"""
+ SELECT ?s ?name ?ghcid ?type WHERE {{
+ ?s a hcc:Custodian ;
+ skos:prefLabel ?name ;
+ hc:ghcid ?ghcid .
+
+ # Match city in GHCID (format: CC-RR-CCC-T-ABBR)
+ FILTER(CONTAINS(?ghcid, "-{city[:3].upper()}-"))
+
+ OPTIONAL {{ ?s hc:institutionType ?type }}
+ }}
+ LIMIT {limit + len(exclude_ghcids)}
+ """
+
+ results = self.sparql_client.query(query)
+
+ # Filter out excluded GHCIDs
+ filtered = []
+ for row in results:
+ ghcid = row.get("ghcid", "")
+ if ghcid not in exclude_ghcids:
+ filtered.append(row)
+ if len(filtered) >= limit:
+ break
+
+ return filtered
+
+ def _expand_by_type(self, institution_type: str, country: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]:
+ """Find other institutions of the same type in the same country.
+
+ Args:
+ institution_type: Institution type (MUSEUM, LIBRARY, etc.)
+ country: Country code (ISO 3166-1 alpha-2)
+ exclude_ghcids: GHCIDs to exclude
+ limit: Maximum number of results
+
+ Returns:
+ List of institution data dicts
+ """
+ if not institution_type:
+ return []
+
+ # Map institution type to GHCID type code using dynamic schema mapping
+ type_code = get_custodian_type_to_heritage_code().get(institution_type, "")
+
+ if not type_code or not country:
+ return []
+
+ query = f"""
+ SELECT ?s ?name ?ghcid ?city WHERE {{
+ ?s a hcc:Custodian ;
+ skos:prefLabel ?name ;
+ hc:ghcid ?ghcid .
+
+ # Match country and type in GHCID
+ FILTER(STRSTARTS(?ghcid, "{country}-"))
+ FILTER(CONTAINS(?ghcid, "-{type_code}-"))
+
+ OPTIONAL {{ ?s schema:location ?city }}
+ }}
+ LIMIT {limit + len(exclude_ghcids)}
+ """
+
+ results = self.sparql_client.query(query)
+
+ filtered = []
+ for row in results:
+ ghcid = row.get("ghcid", "")
+ if ghcid not in exclude_ghcids:
+ filtered.append(row)
+ if len(filtered) >= limit:
+ break
+
+ return filtered
+
+ def _expand_by_wikidata_country(self, wikidata_country: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]:
+ """Find institutions in the same country using Wikidata P17 property.
+
+ Args:
+ wikidata_country: Wikidata entity ID for country (e.g., Q55 for Netherlands)
+ exclude_ghcids: GHCIDs to exclude
+ limit: Maximum number of results
+
+ Returns:
+ List of institution data dicts
+ """
+ if not wikidata_country:
+ return []
+
+ query = f"""
+ SELECT ?s ?name ?ghcid ?type WHERE {{
+ ?s a hcc:Custodian ;
+ skos:prefLabel ?name ;
+ hc:ghcid ?ghcid ;
+ wdt:P17 wd:{wikidata_country} .
+
+ OPTIONAL {{ ?s hc:institutionType ?type }}
+ }}
+ LIMIT {limit + len(exclude_ghcids)}
+ """
+
+ results = self.sparql_client.query(query)
+
+ filtered = []
+ for row in results:
+ ghcid = row.get("ghcid", "")
+ if ghcid not in exclude_ghcids:
+ filtered.append(row)
+ if len(filtered) >= limit:
+ break
+
+ return filtered
+
+ def _graph_expand(
+ self,
+ seed_institutions: list[RetrievedInstitution],
+ use_batched: bool = True
+ ) -> list[RetrievedInstitution]:
+ """Expand seed results using knowledge graph relationships.
+
+ By default uses batched SPARQL (single query with UNION) for best performance.
+ Falls back to parallel ThreadPoolExecutor if batched fails.
+
+ Performance comparison:
+ - Sequential: 10 queries × ~100ms = 4+ seconds
+ - Parallel (ThreadPool): ~500ms-3s (limited by GIL/connection pool)
+ - Batched (UNION query): ONE query ~150-300ms ← DEFAULT
+
+ Args:
+ seed_institutions: Initial vector search results
+ use_batched: If True (default), use batched SPARQL query.
+ If False, use parallel ThreadPoolExecutor.
+
+ Returns:
+ Additional institutions found via graph expansion
+ """
+ if use_batched:
+ try:
+ return self._graph_expand_batched(seed_institutions)
+ except Exception as e:
+ logger.warning(f"Batched graph expansion failed, falling back to parallel: {e}")
+ # Fall through to parallel implementation
+
+ return self._graph_expand_parallel(seed_institutions)
+
    def _graph_expand_parallel(
        self,
        seed_institutions: list[RetrievedInstitution]
    ) -> list[RetrievedInstitution]:
        """Expand seed results using parallel SPARQL queries (fallback method).

        Uses ThreadPoolExecutor to parallelize SPARQL queries. This is slower than
        the batched approach but serves as a fallback.

        Args:
            seed_institutions: Initial vector search results

        Returns:
            Additional institutions found via graph expansion (deduplicated
            against the seeds and against each other)
        """
        start_time = time.time()
        exclude_ghcids = {inst.ghcid for inst in seed_institutions}
        expanded = []
        seen_ghcids = set(exclude_ghcids)

        # Prepare all expansion tasks
        # Each task is a tuple: (task_type, seed, query_params)
        tasks = []
        seeds_to_expand = seed_institutions[:5]  # Expand top 5 seeds

        for seed in seeds_to_expand:
            # City expansion task
            if seed.city:
                tasks.append(("city", seed, {"city": seed.city}))

            # Type expansion task
            if seed.institution_type and seed.country:
                tasks.append(("type", seed, {
                    "institution_type": seed.institution_type,
                    "country": seed.country
                }))

        if not tasks:
            logger.debug("No graph expansion tasks to execute")
            return expanded

        # Execute SPARQL queries in parallel
        # Use min(10, len(tasks)) workers to avoid over-parallelization
        max_workers = min(10, len(tasks))

        def execute_expansion(task):
            """Execute a single expansion task and return results with metadata.

            Errors are captured per-task (never raised) so one failing
            query cannot abort the other expansions. Returns None for an
            unrecognized task type.
            """
            task_type, seed, params = task
            task_start = time.time()

            try:
                if task_type == "city":
                    results = self._expand_by_city(
                        params["city"], exclude_ghcids, limit=self.k_expand
                    )
                    return {
                        "task_type": task_type,
                        "seed": seed,
                        "results": results,
                        "duration_ms": (time.time() - task_start) * 1000
                    }
                elif task_type == "type":
                    results = self._expand_by_type(
                        params["institution_type"],
                        params["country"],
                        exclude_ghcids,
                        limit=self.k_expand
                    )
                    return {
                        "task_type": task_type,
                        "seed": seed,
                        "results": results,
                        "duration_ms": (time.time() - task_start) * 1000
                    }
            except Exception as e:
                logger.warning(f"Graph expansion task failed: {task_type} for {seed.ghcid}: {e}")
                return {
                    "task_type": task_type,
                    "seed": seed,
                    "results": [],
                    "duration_ms": (time.time() - task_start) * 1000,
                    "error": str(e)
                }

        # Run all tasks in parallel; results are merged in completion
        # order, so dedup below makes the final set order-dependent but
        # always consistent (seen_ghcids is only touched in this thread).
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(execute_expansion, task): task for task in tasks}

            for future in as_completed(futures):
                result = future.result()
                if result is None:
                    continue

                task_type = result["task_type"]
                seed = result["seed"]
                rows = result["results"]
                duration = result.get("duration_ms", 0)

                logger.debug(f"Graph expansion {task_type} for {seed.ghcid}: "
                            f"{len(rows)} results in {duration:.0f}ms")

                # Process results based on task type
                if task_type == "city":
                    for row in rows:
                        ghcid = row.get("ghcid", "")
                        if ghcid and ghcid not in seen_ghcids:
                            seen_ghcids.add(ghcid)
                            expanded.append(RetrievedInstitution(
                                ghcid=ghcid,
                                name=row.get("name", ""),
                                uri=row.get("s", ""),
                                graph_score=0.8,  # High score for same city
                                institution_type=row.get("type"),
                                expansion_reason="same_city",
                                related_institutions=[seed.ghcid]
                            ))
                elif task_type == "type":
                    for row in rows:
                        ghcid = row.get("ghcid", "")
                        if ghcid and ghcid not in seen_ghcids:
                            seen_ghcids.add(ghcid)
                            expanded.append(RetrievedInstitution(
                                ghcid=ghcid,
                                name=row.get("name", ""),
                                uri=row.get("s", ""),
                                graph_score=0.5,  # Medium score for same type
                                institution_type=seed.institution_type,
                                city=row.get("city"),
                                expansion_reason="same_type",
                                related_institutions=[seed.ghcid]
                            ))

        total_time = (time.time() - start_time) * 1000
        logger.info(f"Graph expansion completed: {len(tasks)} queries, "
                   f"{len(expanded)} results in {total_time:.0f}ms (parallel)")

        return expanded
+
+    def _combine_and_rank(
+        self,
+        vector_results: list[RetrievedInstitution],
+        graph_results: list[RetrievedInstitution],
+        k: int
+    ) -> list[RetrievedInstitution]:
+        """Combine vector and graph results with weighted scoring and graph inheritance.
+
+        This method implements a hybrid scoring approach:
+        1. Direct merge: If a graph result matches a vector result (same GHCID),
+           the graph_score is directly applied
+        2. Graph inheritance: Vector results inherit a portion of graph scores from
+           related institutions found via graph expansion (same city/type)
+
+        Args:
+            vector_results: Results from vector search
+            graph_results: Results from graph expansion
+            k: Number of final results
+
+        Returns:
+            Combined and ranked results, sorted by combined_score descending
+            and truncated to at most k entries. Results without a GHCID are
+            silently dropped (they cannot be merged).
+        """
+        # Debug logging for investigation
+        logger.debug(f"Combining {len(vector_results)} vector + {len(graph_results)} graph results")
+
+        # Create lookup by GHCID for merging
+        results_by_ghcid: dict[str, RetrievedInstitution] = {}
+
+        # Track which vector GHCIDs we have for inheritance
+        vector_ghcids = set()
+
+        # Add vector results; entries without a GHCID are skipped entirely.
+        for inst in vector_results:
+            if inst.ghcid:
+                results_by_ghcid[inst.ghcid] = inst
+                vector_ghcids.add(inst.ghcid)
+                logger.debug(f" Vector: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) "
+                             f"v={inst.vector_score:.3f} g={inst.graph_score:.3f}")
+
+        # Track direct merges and inheritance candidates
+        direct_merges = 0
+        inheritance_boosts = []
+
+        # Merge graph results and build inheritance map
+        # inheritance_map: vector_ghcid -> list of (related_ghcid, graph_score, reason)
+        inheritance_map: dict[str, list[tuple[str, float, str]]] = {g: [] for g in vector_ghcids}
+
+        for inst in graph_results:
+            logger.debug(f" Graph: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) "
+                         f"g={inst.graph_score:.3f} reason={inst.expansion_reason} "
+                         f"related_to={inst.related_institutions}")
+
+            if inst.ghcid in results_by_ghcid:
+                # Direct merge: graph result matches existing vector result.
+                # Keep the strongest graph evidence seen so far for this GHCID.
+                existing = results_by_ghcid[inst.ghcid]
+                old_graph_score = existing.graph_score
+                existing.graph_score = max(existing.graph_score, inst.graph_score)
+                existing.related_institutions.extend(inst.related_institutions)
+                if inst.expansion_reason:
+                    existing.expansion_reason = inst.expansion_reason
+                direct_merges += 1
+                logger.debug(f" -> Direct merge! {inst.ghcid} graph_score: {old_graph_score:.3f} -> {existing.graph_score:.3f}")
+            else:
+                # New institution from graph expansion
+                results_by_ghcid[inst.ghcid] = inst
+
+            # Build inheritance: this graph result was expanded FROM a vector result
+            # The related_institutions field contains the seed GHCID(s) it was expanded from
+            for seed_ghcid in inst.related_institutions:
+                if seed_ghcid in inheritance_map:
+                    inheritance_map[seed_ghcid].append(
+                        (inst.ghcid, inst.graph_score, inst.expansion_reason or "related")
+                    )
+
+        logger.debug(f"Direct merges: {direct_merges}")
+
+        # Apply graph score inheritance to vector results
+        # Vector results inherit a portion of graph scores from their related institutions
+        INHERITANCE_FACTOR = 0.5  # Inherit 50% of related institutions' graph scores
+
+        for vector_ghcid, related_list in inheritance_map.items():
+            if related_list and vector_ghcid in results_by_ghcid:
+                inst = results_by_ghcid[vector_ghcid]
+
+                # Calculate inherited score: average of related graph scores * inheritance factor
+                related_scores = [score for _, score, _ in related_list]
+                inherited_score = (sum(related_scores) / len(related_scores)) * INHERITANCE_FACTOR
+
+                old_graph_score = inst.graph_score
+                # Inherit: take max of current graph_score and inherited score
+                inst.graph_score = max(inst.graph_score, inherited_score)
+
+                # Only record a boost when inheritance actually raised the score.
+                if inst.graph_score > old_graph_score:
+                    # Track related institutions for context
+                    related_ghcids = [ghcid for ghcid, _, _ in related_list]
+                    inst.related_institutions.extend(related_ghcids[:3])  # Add up to 3 related
+
+                    inheritance_boosts.append({
+                        "ghcid": vector_ghcid,
+                        "name": inst.name,
+                        "old_graph": old_graph_score,
+                        "new_graph": inst.graph_score,
+                        "inherited_from": len(related_list),
+                        "reasons": list(set(r for _, _, r in related_list))
+                    })
+                    logger.debug(f" Inheritance: {vector_ghcid} graph_score: {old_graph_score:.3f} -> "
+                                 f"{inst.graph_score:.3f} (from {len(related_list)} related institutions)")
+
+        if inheritance_boosts:
+            logger.info(f"Graph inheritance applied to {len(inheritance_boosts)} vector results: "
+                        f"{[b['ghcid'] for b in inheritance_boosts[:3]]}...")
+
+        # Calculate combined scores as the configured weighted sum of both signals.
+        for inst in results_by_ghcid.values():
+            inst.combined_score = (
+                self.vector_weight * inst.vector_score +
+                self.graph_weight * inst.graph_score
+            )
+
+        # Sort by combined score
+        ranked = sorted(
+            results_by_ghcid.values(),
+            key=lambda x: x.combined_score,
+            reverse=True
+        )
+
+        # Log top results for debugging
+        logger.debug(f"Top {min(5, len(ranked))} combined results:")
+        for i, inst in enumerate(ranked[:5]):
+            logger.debug(f" {i+1}. {inst.ghcid} ({inst.name[:25] if inst.name else '?'}...) "
+                         f"combined={inst.combined_score:.3f} (v={inst.vector_score:.3f}, g={inst.graph_score:.3f})")
+
+        return ranked[:k]
+
+    def _get_person_collection_vector_size(self) -> int | None:
+        """Get the vector size of the person collection.
+
+        Returns:
+            The vector dimension of the "heritage_persons" collection, or
+            None if it cannot be determined (missing collection, unexpected
+            config shape, or connection error).
+        """
+        try:
+            info = self.qdrant_client.get_collection("heritage_persons")
+            if hasattr(info.config.params, 'vectors'):
+                vectors_config = info.config.params.vectors
+                if isinstance(vectors_config, dict):
+                    # Named-vector collection: report the size of the first
+                    # configured vector (assumes all named vectors share intent).
+                    first_config = next(iter(vectors_config.values()), None)
+                    return first_config.size if first_config else None
+                elif vectors_config is not None:
+                    # Single unnamed vector configuration.
+                    return vectors_config.size  # type: ignore[union-attr]
+            return None
+        except Exception as e:
+            # Best-effort probe: callers treat None as "unknown size".
+            logger.warning(f"Could not get person collection vector size: {e}")
+            return None
+
+    def _person_vector_search(
+        self,
+        query: str,
+        k: int,
+        using: str | None = None,
+        filter_conditions: dict[str, Any] | None = None,
+    ) -> list[RetrievedPerson]:
+        """Perform vector similarity search in Qdrant heritage_persons collection.
+
+        Args:
+            query: Search query text
+            k: Number of results to retrieve
+            using: Optional embedding model name (for multi-embedding mode)
+            filter_conditions: Optional dict of field->value filters for Qdrant
+
+        Returns:
+            List of RetrievedPerson with vector scores, re-sorted by
+            combined_score (richness boost plus optional name-match boost).
+            Returns an empty list if the Qdrant query fails.
+        """
+        from qdrant_client.http import models
+
+        # Check person collection vector size and use appropriate model.
+        # The explicit `using` argument always wins; auto-detection only
+        # applies when no model was requested.
+        person_vector_size = self._get_person_collection_vector_size()
+        person_model = using
+
+        if person_vector_size == 384 and not using:
+            # Person collection uses MiniLM (384-dim), override model selection
+            person_model = "minilm_384"
+            logger.info(f"Person collection uses 384-dim vectors, using MiniLM model")
+        elif person_vector_size == 1536 and not using:
+            person_model = "openai_1536"
+        elif person_vector_size == 768 and not using:
+            person_model = "bge_768"
+
+        query_vector = self._get_embedding(query, using=person_model)
+
+        try:
+            # Build query parameters
+            search_params: dict[str, Any] = {
+                "collection_name": "heritage_persons",
+                "query": query_vector,
+                "limit": k,
+                "with_payload": True,
+            }
+
+            # Add named vector 'using' ONLY if collection actually has named vectors
+            # Single-vector collections will error with "Not existing vector name" otherwise
+            if self.use_multi_embedding and self.multi_retriever:
+                uses_named = self.multi_retriever.uses_named_vectors("heritage_persons")
+                if uses_named:
+                    if using:
+                        search_params["using"] = using
+                    elif self._selected_multi_model:
+                        search_params["using"] = self._selected_multi_model.value
+                    # else: single-vector collection, don't add 'using' parameter
+
+            # Add schema-aware filters if provided
+            if filter_conditions:
+                filter_list = []
+                for key, value in filter_conditions.items():
+                    # Handle advanced match filters (e.g. {"email": {"match": {"text": "nos"}}})
+                    if isinstance(value, dict) and "match" in value:
+                        # MatchText performs token-based full-text matching.
+                        filter_list.append(
+                            models.FieldCondition(
+                                key=key,
+                                match=models.MatchText(**value["match"])
+                            )
+                        )
+                    else:
+                        # Standard exact match value
+                        filter_list.append(
+                            models.FieldCondition(
+                                key=key,
+                                match=models.MatchValue(value=value),
+                            )
+                        )
+
+                # All conditions must hold (logical AND).
+                search_params["query_filter"] = models.Filter(must=filter_list)
+                logger.info(f"[Qdrant] Applied person filters: {filter_conditions}")
+
+            logger.info(f"[Qdrant] Searching '{search_params['collection_name']}' with params: query_filter={filter_conditions}, limit={k}")
+
+            results = self.qdrant_client.query_points(**search_params)
+        except Exception as e:
+            # Degrade gracefully: a failed person search returns no results
+            # rather than propagating the error to the caller.
+            logger.warning(f"Person collection search failed: {e}")
+            return []
+
+        persons = []
+        for point in results.points:
+            payload = point.payload or {}
+
+            # Extract richness score from payload (indexed by index_persons_qdrant.py)
+            richness_score = payload.get("richness_score", 0.0)
+
+            person = RetrievedPerson(
+                # Fall back to a deterministic hash of custodian+name when the
+                # payload carries no staff_id.
+                person_id=payload.get("staff_id", "") or hashlib.md5(
+                    f"{payload.get('custodian_slug', '')}:{payload.get('name', '')}".encode()
+                ).hexdigest()[:16],
+                name=payload.get("name", ""),
+                vector_score=point.score,
+                richness_score=richness_score,
+                headline=payload.get("headline"),
+                custodian_name=payload.get("custodian_name"),
+                custodian_slug=payload.get("custodian_slug"),
+                location=payload.get("location"),
+                heritage_relevant=payload.get("heritage_relevant", False),
+                heritage_type=payload.get("heritage_type"),
+                source_type=payload.get("source_type"),
+                linkedin_url=payload.get("linkedin_url"),
+                has_wcms=payload.get("has_wcms", False),
+                # WCMS-specific fields
+                wcms_user_id=payload.get("wcms_user_id"),
+                wcms_abs_id=payload.get("wcms_abs_id"),
+                wcms_crm_id=payload.get("wcms_crm_id"),
+                wcms_username=payload.get("wcms_username"),
+                wcms_username_url=payload.get("wcms_username_url"),
+                wcms_status=payload.get("wcms_status"),
+                wcms_roles=payload.get("wcms_roles"),
+                wcms_registered_since=payload.get("wcms_registered_since"),
+                wcms_last_access=payload.get("wcms_last_access"),
+                # Contact details
+                email=payload.get("email"),
+                email_domain=payload.get("email_domain"),
+            )
+
+            # Apply richness score boosting
+            # Formula: combined_score = vector_score * (0.7 + 0.3 * richness_score)
+            # - Profiles with richness_score=0 get 70% of vector score
+            # - Profiles with richness_score=1 get 100% of vector score
+            # This ensures rich profiles rank higher than sparse ones at similar similarity
+            richness_boost = 0.7 + 0.3 * richness_score
+            person.combined_score = person.vector_score * richness_boost
+
+            # Apply name-matching boost for queries that look like person names
+            # This ensures that searching for "Kitty Bogte" returns Kitty Bogte first,
+            # even if vector similarity ranks other Dutch names higher
+            if looks_like_person_name(query) and person.name:
+                name_boost = calculate_name_match_boost(query, person.name)
+                if name_boost > 1.0:
+                    logger.debug(f"Name match boost {name_boost}x for '{person.name}' (query: '{query}')")
+                    person.combined_score *= name_boost
+
+            persons.append(person)
+
+        # Re-sort by combined score after name boosting
+        persons.sort(key=lambda p: p.combined_score, reverse=True)
+
+        return persons
+
+    def search_persons(
+        self,
+        query: str,
+        k: int | None = None,
+        filter_custodian: str | None = None,
+        only_heritage_relevant: bool = False,
+        only_wcms: bool = False,
+        using: str | None = None,
+        # Schema-aware filter parameters (from DSPy HeritageQueryRouter)
+        target_role_category: str | None = None,
+        target_custodian_type: str | None = None,
+        # Extra filters for robust domain search (e.g. email substring)
+        extra_filters: dict[str, Any] | None = None,
+    ) -> list[RetrievedPerson]:
+        """Search for persons/staff in the heritage_persons collection.
+
+        Args:
+            query: Natural language search query
+            k: Number of results to return (default: k_final)
+            filter_custodian: Optional custodian slug to filter by
+            only_heritage_relevant: Only return heritage-relevant staff
+            only_wcms: Only return WCMS-registered profiles (heritage sector users)
+            using: Optional embedding model name (for multi-embedding mode).
+                One of: "openai_1536", "minilm_384", "bge_768"
+            target_role_category: Role category from DSPy router (CURATORIAL, ARCHIVAL, etc.)
+                Used for headline-based post-filtering since not indexed in Qdrant.
+            target_custodian_type: Custodian type from DSPy router (MUSEUM, ARCHIVE, etc.)
+                Converted to heritage_type code for Qdrant filtering.
+            extra_filters: Optional extra Qdrant filters (e.g. {"email": {"match": {"text": "nos"}}})
+
+        Returns:
+            List of RetrievedPerson with scores, sorted by combined_score
+            descending, truncated to k.
+        """
+        # NOTE: truthiness fallback — an explicit k=0 also falls back to k_final.
+        k = k or self.k_final
+
+        # Build Qdrant filter conditions from schema-aware parameters
+        heritage_type_code = get_heritage_type_code(target_custodian_type)
+        filter_conditions = build_schema_aware_person_filter(
+            heritage_type_code=heritage_type_code,
+            heritage_relevant_only=only_heritage_relevant,
+            custodian_slug=filter_custodian,
+            only_wcms=only_wcms,
+        ) or {}
+
+        # Merge extra filters if provided (e.g. email match)
+        if extra_filters:
+            filter_conditions.update(extra_filters)
+
+        # Normalize an empty dict back to None so no Qdrant filter is sent.
+        if not filter_conditions:
+            filter_conditions = None
+
+        logger.info(f"Person search for: {query[:50]}... (model: {using or 'auto'}, role_category: {target_role_category}, custodian_type: {target_custodian_type}, extras: {extra_filters})")
+
+        # Over-fetch to allow for post-filtering and name boosting
+        # - Base multiplier: 2x for general queries
+        # - Role category filter: 3x (need more candidates for keyword filtering)
+        # - Name queries: fetch minimum 100 to ensure name boost can find exact matches
+        #   (vector similarity often ranks similar-sounding names higher than exact matches)
+        is_name_query = looks_like_person_name(query)
+        fetch_multiplier = 3 if target_role_category else 2
+        fetch_count = max(k * fetch_multiplier, 100 if is_name_query else 0)
+        results = self._person_vector_search(query, fetch_count, using=using, filter_conditions=filter_conditions)
+        logger.info(f"Found {len(results)} person results after Qdrant filtering")
+
+        # Apply role category post-filtering (keyword-based since not indexed)
+        if target_role_category:
+            results = filter_by_role_category_keywords(results, target_role_category)
+
+        # Sort by combined score and limit
+        results.sort(key=lambda x: x.combined_score, reverse=True)
+        return results[:k]
+
+    def search(
+        self,
+        query: str,
+        k: int | None = None,
+        expand_graph: bool = True,
+        filter_conditions: dict[str, Any] | None = None,
+        auto_route: bool = True,
+        using: str | None = None,
+        region_codes: list[str] | None = None,
+        cities: list[str] | None = None,
+        institution_types: list[str] | None = None,
+    ) -> list[RetrievedInstitution] | list[RetrievedPerson]:
+        """Perform hybrid vector + graph search with automatic query routing.
+
+        If auto_route is True, automatically detects if query is about persons
+        (e.g., "Who works at Rijksmuseum?") and routes to person search.
+
+        Args:
+            query: Natural language search query
+            k: Number of results to return (default: k_final)
+            expand_graph: Whether to perform graph expansion (institution search only)
+            filter_conditions: Optional Qdrant filter conditions (legacy, prefer new params).
+                NOTE: currently not forwarded to _vector_search in this path.
+            auto_route: Automatically detect and route person queries
+            using: Optional embedding model name (for multi-embedding mode).
+                One of: "openai_1536", "minilm_384", "bge_768"
+            region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
+                for filtering by province/subdivision
+            cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"])
+            institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"])
+
+        Returns:
+            List of RetrievedInstitution or RetrievedPerson with combined scores
+        """
+        # NOTE: truthiness fallback — an explicit k=0 also falls back to k_final.
+        k = k or self.k_final
+
+        # Auto-route person queries
+        if auto_route:
+            query_type = detect_query_type(query)
+            if query_type == "person":
+                logger.info(f"Auto-routing to person search for: {query[:50]}...")
+                return self.search_persons(query, k=k, using=using)
+
+        # Institution search (original behavior)
+        filter_info = []
+        if region_codes:
+            filter_info.append(f"regions={region_codes}")
+        if cities:
+            filter_info.append(f"cities={cities}")
+        if institution_types:
+            filter_info.append(f"types={institution_types}")
+        filter_str = f" [{', '.join(filter_info)}]" if filter_info else ""
+
+        # Step 1: Vector search over k_vector candidates.
+        logger.info(f"Vector search for: {query[:50]}...{filter_str} (model: {using or 'auto'})")
+        vector_results = self._vector_search(
+            query,
+            self.k_vector,
+            using=using,
+            region_codes=region_codes,
+            cities=cities,
+            institution_types=institution_types,
+        )
+        logger.info(f"Found {len(vector_results)} vector results")
+
+        # Step 2: Graph expansion (if enabled)
+        graph_results = []
+        if expand_graph and vector_results:
+            logger.info("Expanding via knowledge graph...")
+            graph_results = self._graph_expand(vector_results)
+            logger.info(f"Found {len(graph_results)} graph expansion results")
+
+        # Step 3: Combine and rank
+        final_results = self._combine_and_rank(vector_results, graph_results, k)
+        logger.info(f"Returning {len(final_results)} combined results")
+
+        return final_results
+
+    def search_institutions(
+        self,
+        query: str,
+        k: int | None = None,
+        expand_graph: bool = True,
+        filter_conditions: dict[str, Any] | None = None,
+        using: str | None = None,
+        region_codes: list[str] | None = None,
+        cities: list[str] | None = None,
+        institution_types: list[str] | None = None,
+    ) -> list[RetrievedInstitution]:
+        """Explicit institution search (bypasses auto-routing).
+
+        Thin wrapper around search() with auto_route=False, so the result is
+        always a list of RetrievedInstitution.
+
+        Args:
+            query: Natural language search query
+            k: Number of results to return (default: k_final)
+            expand_graph: Whether to perform graph expansion
+            filter_conditions: Optional Qdrant filter conditions (legacy, prefer new params)
+            using: Optional embedding model name (for multi-embedding mode).
+                One of: "openai_1536", "minilm_384", "bge_768"
+            region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
+                for filtering by province/subdivision
+            cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"])
+            institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"])
+
+        Returns:
+            List of RetrievedInstitution with combined scores
+        """
+        # auto_route=False ensures we get RetrievedInstitution, not RetrievedPerson
+        results = self.search(
+            query,
+            k=k,
+            expand_graph=expand_graph,
+            filter_conditions=filter_conditions,
+            auto_route=False,
+            using=using,
+            region_codes=region_codes,
+            cities=cities,
+            institution_types=institution_types,
+        )
+        return results  # type: ignore[return-value]
+
+    def __call__(self, query: str, k: int | None = None) -> list[str]:
+        """DSPy-compatible interface returning passage texts.
+
+        Supports both institution and person queries with auto-routing
+        (delegates to search(), which may return either result type).
+
+        Args:
+            query: Search query
+            k: Number of results
+
+        Returns:
+            List of passage texts (institution/person descriptions), one
+            formatted string per result.
+        """
+        results = self.search(query, k=k)
+
+        passages = []
+        for r in results:
+            if isinstance(r, RetrievedPerson):
+                # Person result: "Name (role at org)"
+                org = f" at {r.custodian_name}" if r.custodian_name else ""
+                role = r.headline or "Unknown role"
+                passages.append(f"{r.name} ({role}{org})")
+            else:
+                # Institution result: "Name (type) - description"
+                inst_type = r.institution_type or "Unknown type"
+                desc = r.description or "No description"
+                passages.append(f"{r.name} ({inst_type}) - {desc}")
+
+        return passages
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get retriever statistics.
+
+        Each backend probe is best-effort: failures are captured as an
+        "error" entry in the corresponding sub-dict instead of raising.
+
+        Returns:
+            Dict with Qdrant and Oxigraph stats plus the retriever's
+            scoring/limit configuration.
+        """
+        stats = {
+            "qdrant": {
+                "institutions": {},
+                "persons": {},
+            },
+            "oxigraph": {},
+            "config": {
+                "vector_weight": self.vector_weight,
+                "graph_weight": self.graph_weight,
+                "k_vector": self.k_vector,
+                "k_expand": self.k_expand,
+                "k_final": self.k_final
+            }
+        }
+
+        # Qdrant institution collection stats
+        try:
+            info = self.qdrant_client.get_collection(self.collection_name)
+            stats["qdrant"]["institutions"] = {
+                "collection": self.collection_name,
+                "points_count": info.points_count,
+                "status": info.status.value if info.status else "unknown"
+            }
+        except Exception as e:
+            stats["qdrant"]["institutions"]["error"] = str(e)
+
+        # Qdrant person collection stats
+        try:
+            info = self.qdrant_client.get_collection("heritage_persons")
+            stats["qdrant"]["persons"] = {
+                "collection": "heritage_persons",
+                "points_count": info.points_count,
+                "status": info.status.value if info.status else "unknown"
+            }
+        except Exception as e:
+            stats["qdrant"]["persons"]["error"] = str(e)
+
+        # Oxigraph stats
+        # NOTE(review): the query uses the hcc: prefix without declaring it —
+        # presumably sparql_client injects standard prefixes; verify.
+        try:
+            result = self.sparql_client.query(
+                "SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s a hcc:Custodian }"
+            )
+            if result:
+                stats["oxigraph"]["custodian_count"] = int(result[0].get("count", 0))
+        except Exception as e:
+            stats["oxigraph"]["error"] = str(e)
+
+        return stats
+
+    def close(self):
+        """Clean up resources.
+
+        Closes the SPARQL client and, if it was ever lazily created, the
+        underlying Qdrant client.
+        """
+        self.sparql_client.close()
+        if self._qdrant_client:
+            self._qdrant_client.close()
+
+
+def create_hybrid_retriever(
+    use_production: bool = False,
+    **kwargs
+) -> HybridRetriever:
+    """Factory function to create a hybrid retriever.
+
+    Args:
+        use_production: If True, connect to production endpoints
+            (bronhouder.nl over HTTPS). Otherwise use local endpoints,
+            overridable via QDRANT_HOST, QDRANT_PORT and SPARQL_ENDPOINT
+            environment variables.
+        **kwargs: Additional arguments for HybridRetriever
+
+    Returns:
+        Configured HybridRetriever instance
+    """
+    if use_production:
+        return HybridRetriever(
+            qdrant_host="bronhouder.nl",
+            qdrant_port=443,
+            sparql_endpoint="https://bronhouder.nl/sparql",
+            use_production_qdrant=True,
+            **kwargs
+        )
+    else:
+        return HybridRetriever(
+            qdrant_host=os.getenv("QDRANT_HOST", "localhost"),
+            qdrant_port=int(os.getenv("QDRANT_PORT", "6333")),
+            sparql_endpoint=os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query"),
+            **kwargs
+        )
diff --git a/backend/rag/main.py b/backend/rag/main.py
index 2c16f3d5bd..0a15f6e35d 100644
--- a/backend/rag/main.py
+++ b/backend/rag/main.py
@@ -1660,6 +1660,7 @@ class MultiSourceRetriever:
only_heritage_relevant: bool = False,
only_wcms: bool = False,
using: str | None = None,
+ extra_filters: dict[str, Any] | None = None,
) -> list[Any]:
"""Search for persons/staff in the heritage_persons collection.
@@ -1672,20 +1673,29 @@ class MultiSourceRetriever:
only_heritage_relevant: Only return heritage-relevant staff
only_wcms: Only return WCMS-registered profiles
using: Optional embedding model to use (e.g., 'minilm_384', 'openai_1536')
+ extra_filters: Optional extra filters for Qdrant
Returns:
List of RetrievedPerson objects
"""
if self.qdrant:
try:
- return self.qdrant.search_persons( # type: ignore[no-any-return]
- query=query,
- k=k,
- filter_custodian=filter_custodian,
- only_heritage_relevant=only_heritage_relevant,
- only_wcms=only_wcms,
- using=using,
- )
+ # Dynamically check if qdrant.search_persons supports extra_filters
+ # This handles case where HybridRetriever signature varies
+ import inspect
+ sig = inspect.signature(self.qdrant.search_persons)
+ kwargs = {
+ "query": query,
+ "k": k,
+ "filter_custodian": filter_custodian,
+ "only_heritage_relevant": only_heritage_relevant,
+ "only_wcms": only_wcms,
+ "using": using,
+ }
+ if "extra_filters" in sig.parameters:
+ kwargs["extra_filters"] = extra_filters
+
+ return self.qdrant.search_persons(**kwargs) # type: ignore[no-any-return]
except Exception as e:
logger.error(f"Person search failed: {e}")
return []
@@ -2755,11 +2765,18 @@ async def person_search(request: PersonSearchRequest) -> PersonSearchResponse:
# Augment query for better recall on domain names if it looks like a domain search
# "nos" -> "nos email domain nos" to guide vector search towards email addresses
search_query = request.query
+ extra_filters = None
+
+ # Check for single word domain-like queries
if len(search_query.split()) == 1 and len(search_query) > 2 and "@" not in search_query:
# Heuristic: single word queries might be domain searches
- # We append "email domain" context to guide the embedding
- search_query = f"{search_query} email domain {search_query}"
+ # We use MatchText filtering on email field to find substring matches
+ # Qdrant "match": {"text": "nos"} performs token-based matching
+ extra_filters = {"email": {"match": {"text": search_query}}}
+ logger.info(f"[PersonSearch] Potential domain search detected for '{search_query}'. Applying strict email filter: {extra_filters}")
+ logger.info(f"[PersonSearch] Executing search for '{search_query}' (extra_filters={extra_filters})")
+
# Use the hybrid retriever's person search
results = retriever.search_persons(
query=search_query,
@@ -2768,8 +2785,27 @@ async def person_search(request: PersonSearchRequest) -> PersonSearchResponse:
only_heritage_relevant=request.only_heritage_relevant,
only_wcms=request.only_wcms,
using=request.embedding_model, # Pass embedding model
+ extra_filters=extra_filters,
)
+ # FALLBACK: If strict domain filter yielded no results, try standard vector search
+ # This fixes the issue where searching for names like "willem" (which look like domains)
+ # would fail because they don't appear in emails.
+ if extra_filters and not results:
+ logger.info(f"[PersonSearch] No results with email filter for '{search_query}'. Falling back to standard vector search.")
+ results = retriever.search_persons(
+ query=search_query,
+ k=request.k,
+ filter_custodian=request.filter_custodian,
+ only_heritage_relevant=request.only_heritage_relevant,
+ only_wcms=request.only_wcms,
+ using=request.embedding_model,
+ extra_filters=None, # Disable filter for fallback
+ )
+ logger.info(f"[PersonSearch] Fallback search returned {len(results)} results")
+
+ logger.info(f"[PersonSearch] Final result count: {len(results)}")
+
# Determine which embedding model was actually used
embedding_model_used = None
qdrant = retriever.qdrant
diff --git a/backend/rag/multi_embedding_retriever.py b/backend/rag/multi_embedding_retriever.py
new file mode 100644
index 0000000000..b7ca264693
--- /dev/null
+++ b/backend/rag/multi_embedding_retriever.py
@@ -0,0 +1,846 @@
+"""
+Multi-Embedding Retriever for Heritage Data
+
+Supports multiple embedding models using Qdrant's named vectors feature.
+This enables:
+- A/B testing different embedding models
+- Cost optimization (cheap local embeddings vs paid API embeddings)
+- Gradual migration between embedding models
+- Fallback when one model is unavailable
+
+Supported Embedding Models:
+ - openai_1536: text-embedding-3-small (1536-dim, $0.02/1M tokens)
+ - minilm_384: all-MiniLM-L6-v2 (384-dim, free/local)
+ - bge_768: bge-base-en-v1.5 (768-dim, free/local, high quality)
+
+Collection Architecture:
+ Each collection has named vectors for each embedding model:
+
+ heritage_custodians:
+ vectors:
+ "openai_1536": VectorParams(size=1536)
+ "minilm_384": VectorParams(size=384)
+ payload: {name, ghcid, institution_type, ...}
+
+ heritage_persons:
+ vectors:
+ "openai_1536": VectorParams(size=1536)
+ "minilm_384": VectorParams(size=384)
+ payload: {name, headline, custodian_name, ...}
+
+Usage:
+ retriever = MultiEmbeddingRetriever()
+
+ # Search with default model (auto-select based on availability)
+ results = retriever.search("museums in Amsterdam")
+
+ # Search with specific model
+ results = retriever.search("museums in Amsterdam", using="minilm_384")
+
+ # A/B test comparison
+ comparison = retriever.compare_models("museums in Amsterdam")
+"""
+
+import hashlib
+import logging
+import os
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Literal
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingModel(str, Enum):
+    """Supported embedding models with their configurations.
+
+    Subclasses str so enum members compare equal to their string values
+    (e.g. EmbeddingModel.MINILM_384 == "minilm_384").
+    """
+
+    OPENAI_1536 = "openai_1536"
+    MINILM_384 = "minilm_384"
+    BGE_768 = "bge_768"
+
+    @property
+    def dimension(self) -> int:
+        """Get the vector dimension for this model."""
+        dims = {
+            "openai_1536": 1536,
+            "minilm_384": 384,
+            "bge_768": 768,
+        }
+        return dims[self.value]
+
+    @property
+    def model_name(self) -> str:
+        """Get the actual model name for loading (OpenAI API name or
+        sentence-transformers model id)."""
+        names = {
+            "openai_1536": "text-embedding-3-small",
+            "minilm_384": "all-MiniLM-L6-v2",
+            "bge_768": "BAAI/bge-base-en-v1.5",
+        }
+        return names[self.value]
+
+    @property
+    def is_local(self) -> bool:
+        """Check if this model runs locally (no API calls)."""
+        return self.value in ("minilm_384", "bge_768")
+
+    @property
+    def cost_per_1m_tokens(self) -> float:
+        """Approximate cost in USD per 1M tokens (0 for local models)."""
+        costs = {
+            "openai_1536": 0.02,
+            "minilm_384": 0.0,
+            "bge_768": 0.0,
+        }
+        return costs[self.value]
+
+
+@dataclass
+class MultiEmbeddingConfig:
+    """Configuration for multi-embedding retriever."""
+
+    # Qdrant connection
+    qdrant_host: str = "localhost"
+    qdrant_port: int = 6333
+    qdrant_https: bool = False
+    qdrant_prefix: str | None = None  # URL path prefix (used behind a reverse proxy)
+
+    # API keys
+    openai_api_key: str | None = None  # required only for openai_1536
+
+    # Default embedding model preference order
+    # First available model is used if no explicit model is specified
+    model_preference: list[EmbeddingModel] = field(default_factory=lambda: [
+        EmbeddingModel.MINILM_384,   # Free, fast, good quality
+        EmbeddingModel.OPENAI_1536,  # Higher quality, paid
+        EmbeddingModel.BGE_768,      # Free, high quality, slower
+    ])
+
+    # Collection names
+    institutions_collection: str = "heritage_custodians"
+    persons_collection: str = "heritage_persons"
+
+    # Search defaults
+    default_k: int = 10  # default number of results per search
+
+
+class MultiEmbeddingRetriever:
+ """Retriever supporting multiple embedding models via Qdrant named vectors.
+
+ This class manages multiple embedding models and allows searching with
+ any available model. It handles:
+ - Model lazy-loading
+ - Automatic model selection based on availability
+ - Named vector creation and search
+ - A/B testing between models
+ """
+
+    def __init__(self, config: MultiEmbeddingConfig | None = None):
+        """Initialize multi-embedding retriever.
+
+        No connections are opened here; all clients and models are
+        lazy-loaded on first use.
+
+        Args:
+            config: Configuration options. If None, uses environment variables.
+        """
+        self.config = config or self._config_from_env()
+
+        # Lazy-loaded clients
+        self._qdrant_client = None
+        self._openai_client = None
+        self._st_models: dict[str, Any] = {}  # Sentence transformer models, keyed by EmbeddingModel value
+
+        # Track available models per collection
+        self._available_models: dict[str, set[EmbeddingModel]] = {}
+
+        # Track whether each collection uses named vectors (vs single unnamed vector)
+        self._uses_named_vectors: dict[str, bool] = {}
+
+        logger.info(f"MultiEmbeddingRetriever initialized with preference: {[m.value for m in self.config.model_preference]}")
+
+    @staticmethod
+    def _config_from_env() -> MultiEmbeddingConfig:
+        """Create configuration from environment variables.
+
+        QDRANT_USE_PRODUCTION=true selects the HTTPS production endpoint
+        (QDRANT_PROD_HOST / QDRANT_PROD_PREFIX); otherwise local defaults
+        (QDRANT_HOST / QDRANT_PORT) are used. OPENAI_API_KEY is read in
+        both cases.
+        """
+        use_production = os.getenv("QDRANT_USE_PRODUCTION", "false").lower() == "true"
+
+        if use_production:
+            return MultiEmbeddingConfig(
+                qdrant_host=os.getenv("QDRANT_PROD_HOST", "bronhouder.nl"),
+                qdrant_port=443,
+                qdrant_https=True,
+                qdrant_prefix=os.getenv("QDRANT_PROD_PREFIX", "qdrant"),
+                openai_api_key=os.getenv("OPENAI_API_KEY"),
+            )
+        else:
+            return MultiEmbeddingConfig(
+                qdrant_host=os.getenv("QDRANT_HOST", "localhost"),
+                qdrant_port=int(os.getenv("QDRANT_PORT", "6333")),
+                openai_api_key=os.getenv("OPENAI_API_KEY"),
+            )
+
+    @property
+    def qdrant_client(self):
+        """Lazy-load Qdrant client.
+
+        Creates the client on first access (HTTPS with URL prefix for
+        production, plain HTTP for local) and caches it for reuse.
+        """
+        if self._qdrant_client is None:
+            from qdrant_client import QdrantClient
+
+            if self.config.qdrant_https:
+                self._qdrant_client = QdrantClient(
+                    host=self.config.qdrant_host,
+                    port=self.config.qdrant_port,
+                    https=True,
+                    prefix=self.config.qdrant_prefix,
+                    prefer_grpc=False,  # gRPC is not exposed behind the HTTPS proxy
+                    timeout=30,
+                )
+                logger.info(f"Connected to Qdrant: https://{self.config.qdrant_host}/{self.config.qdrant_prefix or ''}")
+            else:
+                self._qdrant_client = QdrantClient(
+                    host=self.config.qdrant_host,
+                    port=self.config.qdrant_port,
+                )
+                logger.info(f"Connected to Qdrant: {self.config.qdrant_host}:{self.config.qdrant_port}")
+
+        return self._qdrant_client
+
+    @property
+    def openai_client(self):
+        """Lazy-load OpenAI client.
+
+        Raises:
+            RuntimeError: If no OpenAI API key is configured.
+        """
+        if self._openai_client is None:
+            if not self.config.openai_api_key:
+                raise RuntimeError("OpenAI API key not configured")
+
+            import openai
+            self._openai_client = openai.OpenAI(api_key=self.config.openai_api_key)
+
+        return self._openai_client
+
+    def _load_sentence_transformer(self, model: EmbeddingModel) -> Any:
+        """Lazy-load a sentence-transformers model.
+
+        Models are cached in self._st_models so each is loaded at most once.
+
+        Args:
+            model: The embedding model to load
+
+        Returns:
+            Loaded SentenceTransformer model
+
+        Raises:
+            RuntimeError: If the sentence-transformers package is not installed.
+        """
+        if model.value not in self._st_models:
+            try:
+                from sentence_transformers import SentenceTransformer
+                self._st_models[model.value] = SentenceTransformer(model.model_name)
+                logger.info(f"Loaded sentence-transformers model: {model.model_name}")
+            except ImportError:
+                raise RuntimeError(
+                    "sentence-transformers not installed. Run: pip install sentence-transformers"
+                )
+
+        return self._st_models[model.value]
+
+ def get_embedding(self, text: str, model: EmbeddingModel) -> list[float]:
+ """Get embedding vector for text using specified model.
+
+ Args:
+ text: Text to embed
+ model: Embedding model to use
+
+ Returns:
+ Embedding vector as list of floats
+ """
+ if model == EmbeddingModel.OPENAI_1536:
+ response = self.openai_client.embeddings.create(
+ input=text,
+ model=model.model_name,
+ )
+ return response.data[0].embedding
+
+ elif model in (EmbeddingModel.MINILM_384, EmbeddingModel.BGE_768):
+ st_model = self._load_sentence_transformer(model)
+ embedding = st_model.encode(text)
+ return embedding.tolist()
+
+ else:
+ raise ValueError(f"Unknown embedding model: {model}")
+
+ def get_embeddings_batch(
+ self,
+ texts: list[str],
+ model: EmbeddingModel,
+ batch_size: int = 32,
+ ) -> list[list[float]]:
+ """Get embedding vectors for multiple texts.
+
+ Args:
+ texts: List of texts to embed
+ model: Embedding model to use
+ batch_size: Batch size for processing
+
+ Returns:
+ List of embedding vectors
+ """
+ if not texts:
+ return []
+
+ if model == EmbeddingModel.OPENAI_1536:
+ # OpenAI batch API (max 2048 per request)
+ all_embeddings = []
+ for i in range(0, len(texts), 2048):
+ batch = texts[i:i + 2048]
+ response = self.openai_client.embeddings.create(
+ input=batch,
+ model=model.model_name,
+ )
+ batch_embeddings = [item.embedding for item in sorted(response.data, key=lambda x: x.index)]
+ all_embeddings.extend(batch_embeddings)
+ return all_embeddings
+
+ elif model in (EmbeddingModel.MINILM_384, EmbeddingModel.BGE_768):
+ st_model = self._load_sentence_transformer(model)
+ embeddings = st_model.encode(texts, batch_size=batch_size, show_progress_bar=len(texts) > 100)
+ return embeddings.tolist()
+
+ else:
+ raise ValueError(f"Unknown embedding model: {model}")
+
+ def get_available_models(self, collection_name: str) -> set[EmbeddingModel]:
+ """Get the embedding models available for a collection.
+
+ Checks which named vectors exist in the collection.
+ For single-vector collections, returns models matching the dimension.
+
+ Args:
+ collection_name: Name of the Qdrant collection
+
+ Returns:
+ Set of available EmbeddingModel values
+ """
+ if collection_name in self._available_models:
+ return self._available_models[collection_name]
+
+ try:
+ info = self.qdrant_client.get_collection(collection_name)
+ vectors_config = info.config.params.vectors
+
+ available = set()
+ uses_named_vectors = False
+
+ # Check for named vectors (dict of vector configs)
+ if isinstance(vectors_config, dict):
+ # Named vectors - each key is a vector name
+ uses_named_vectors = True
+ for vector_name in vectors_config.keys():
+ try:
+ model = EmbeddingModel(vector_name)
+ available.add(model)
+ except ValueError:
+ logger.warning(f"Unknown vector name in collection: {vector_name}")
+ else:
+ # Single unnamed vector - check dimension to find compatible model
+ # Note: This doesn't mean we can use `using=model.value` in queries
+ uses_named_vectors = False
+ if hasattr(vectors_config, 'size'):
+ dim = vectors_config.size
+ for model in EmbeddingModel:
+ if model.dimension == dim:
+ available.add(model)
+
+ # Store both available models and whether named vectors are used
+ self._available_models[collection_name] = available
+ self._uses_named_vectors[collection_name] = uses_named_vectors
+
+ if uses_named_vectors:
+ logger.info(f"Collection '{collection_name}' uses named vectors: {[m.value for m in available]}")
+ else:
+ logger.info(f"Collection '{collection_name}' uses single vector (compatible with: {[m.value for m in available]})")
+
+ return available
+
+ except Exception as e:
+ logger.warning(f"Could not get available models for {collection_name}: {e}")
+ return set()
+
+ def uses_named_vectors(self, collection_name: str) -> bool:
+ """Check if a collection uses named vectors (vs single unnamed vector).
+
+ Args:
+ collection_name: Name of the Qdrant collection
+
+ Returns:
+ True if collection has named vectors, False for single-vector collections
+ """
+ # Ensure models are loaded (populates _uses_named_vectors)
+ self.get_available_models(collection_name)
+ return self._uses_named_vectors.get(collection_name, False)
+
+ def select_model(
+ self,
+ collection_name: str,
+ preferred: EmbeddingModel | None = None,
+ ) -> EmbeddingModel | None:
+ """Select the best available embedding model for a collection.
+
+ Args:
+ collection_name: Name of the collection
+ preferred: Preferred model (used if available)
+
+ Returns:
+ Selected EmbeddingModel or None if none available
+ """
+ available = self.get_available_models(collection_name)
+
+ if not available:
+ # No named vectors - check if we can use any model
+ # This happens for legacy single-vector collections
+ try:
+ info = self.qdrant_client.get_collection(collection_name)
+ vectors_config = info.config.params.vectors
+
+ # Get vector dimension
+ dim = None
+ if hasattr(vectors_config, 'size'):
+ dim = vectors_config.size
+ elif isinstance(vectors_config, dict):
+ # Get first vector config
+ first_config = next(iter(vectors_config.values()), None)
+ if first_config and hasattr(first_config, 'size'):
+ dim = first_config.size
+
+ if dim:
+ for model in self.config.model_preference:
+ if model.dimension == dim:
+ return model
+ except Exception:
+ pass
+
+ return None
+
+ # If preferred model is available, use it
+ if preferred and preferred in available:
+ return preferred
+
+ # Otherwise, follow preference order
+ for model in self.config.model_preference:
+ if model in available:
+ # Check if model is usable (has API key if needed)
+ if model == EmbeddingModel.OPENAI_1536 and not self.config.openai_api_key:
+ continue
+ return model
+
+ return None
+
+ def search(
+ self,
+ query: str,
+ collection_name: str | None = None,
+ k: int | None = None,
+ using: EmbeddingModel | str | None = None,
+ filter_conditions: dict[str, Any] | None = None,
+ ) -> list[dict[str, Any]]:
+ """Search for similar documents using specified or auto-selected model.
+
+ Args:
+ query: Search query text
+ collection_name: Collection to search (default: institutions)
+ k: Number of results
+ using: Embedding model to use (auto-selected if None)
+ filter_conditions: Optional Qdrant filter conditions
+
+ Returns:
+ List of results with scores and payloads
+ """
+ collection_name = collection_name or self.config.institutions_collection
+ k = k or self.config.default_k
+
+ # Resolve model
+ if using is not None:
+ if isinstance(using, str):
+ model = EmbeddingModel(using)
+ else:
+ model = using
+ else:
+ model = self.select_model(collection_name)
+
+ if model is None:
+ raise RuntimeError(f"No compatible embedding model for collection '{collection_name}'")
+
+ logger.info(f"Searching '{collection_name}' with {model.value}: {query[:50]}...")
+
+ # Get query embedding
+ query_vector = self.get_embedding(query, model)
+
+ # Build filter
+ from qdrant_client.http import models
+
+ query_filter = None
+ if filter_conditions:
+ query_filter = models.Filter(
+ must=[
+ models.FieldCondition(
+ key=key,
+ match=models.MatchValue(value=value),
+ )
+ for key, value in filter_conditions.items()
+ ]
+ )
+
+ # Check if collection uses named vectors (not just single unnamed vector)
+ # Only pass `using=model.value` if collection has actual named vectors
+ use_named_vector = self.uses_named_vectors(collection_name)
+
+ # Search
+ if use_named_vector:
+ results = self.qdrant_client.query_points(
+ collection_name=collection_name,
+ query=query_vector,
+ using=model.value,
+ limit=k,
+ with_payload=True,
+ query_filter=query_filter,
+ )
+ else:
+ # Legacy single-vector search
+ results = self.qdrant_client.query_points(
+ collection_name=collection_name,
+ query=query_vector,
+ limit=k,
+ with_payload=True,
+ query_filter=query_filter,
+ )
+
+ return [
+ {
+ "id": str(point.id),
+ "score": point.score,
+ "model": model.value,
+ "payload": point.payload or {},
+ }
+ for point in results.points
+ ]
+
+ def search_persons(
+ self,
+ query: str,
+ k: int | None = None,
+ using: EmbeddingModel | str | None = None,
+ filter_custodian: str | None = None,
+ only_heritage_relevant: bool = False,
+ only_wcms: bool = False,
+ ) -> list[dict[str, Any]]:
+ """Search for persons/staff in the heritage_persons collection.
+
+ Args:
+ query: Search query text
+ k: Number of results
+ using: Embedding model to use
+ filter_custodian: Optional custodian slug to filter by
+ only_heritage_relevant: Only return heritage-relevant staff
+ only_wcms: Only return WCMS-registered profiles (heritage sector users)
+
+ Returns:
+ List of person results with scores
+ """
+ k = k or self.config.default_k
+
+ # Build filters
+ filters = {}
+ if filter_custodian:
+ filters["custodian_slug"] = filter_custodian
+ if only_wcms:
+ filters["has_wcms"] = True
+
+ # Search with over-fetch for post-filtering
+ results = self.search(
+ query=query,
+ collection_name=self.config.persons_collection,
+ k=k * 2,
+ using=using,
+ filter_conditions=filters if filters else None,
+ )
+
+ # Post-filter for heritage_relevant if needed
+ if only_heritage_relevant:
+ results = [r for r in results if r.get("payload", {}).get("heritage_relevant", False)]
+
+ # Format results
+ formatted = []
+ for r in results[:k]:
+ payload = r.get("payload", {})
+ formatted.append({
+ "person_id": payload.get("staff_id", "") or hashlib.md5(
+ f"{payload.get('custodian_slug', '')}:{payload.get('name', '')}".encode()
+ ).hexdigest()[:16],
+ "name": payload.get("name", ""),
+ "headline": payload.get("headline"),
+ "custodian_name": payload.get("custodian_name"),
+ "custodian_slug": payload.get("custodian_slug"),
+ "location": payload.get("location"),
+ "heritage_relevant": payload.get("heritage_relevant", False),
+ "heritage_type": payload.get("heritage_type"),
+ "linkedin_url": payload.get("linkedin_url"),
+ "score": r["score"],
+ "model": r["model"],
+ })
+
+ return formatted
+
+ def compare_models(
+ self,
+ query: str,
+ collection_name: str | None = None,
+ k: int = 10,
+ models: list[EmbeddingModel] | None = None,
+ ) -> dict[str, Any]:
+ """A/B test comparison of multiple embedding models.
+
+ Args:
+ query: Search query
+ collection_name: Collection to search
+ k: Number of results per model
+ models: Models to compare (default: all available)
+
+ Returns:
+ Dict with results per model and overlap analysis
+ """
+ collection_name = collection_name or self.config.institutions_collection
+
+ # Determine which models to compare
+ available = self.get_available_models(collection_name)
+ if models:
+ models_to_test = [m for m in models if m in available]
+ else:
+ models_to_test = list(available)
+
+ if not models_to_test:
+ return {"error": "No models available for comparison"}
+
+ results = {}
+ all_ids = {}
+
+ for model in models_to_test:
+ try:
+ model_results = self.search(
+ query=query,
+ collection_name=collection_name,
+ k=k,
+ using=model,
+ )
+ results[model.value] = model_results
+ all_ids[model.value] = {r["id"] for r in model_results}
+ except Exception as e:
+ results[model.value] = {"error": str(e)}
+ all_ids[model.value] = set()
+
+ # Calculate overlap between models
+ overlap = {}
+ model_values = list(all_ids.keys())
+ for i, m1 in enumerate(model_values):
+ for m2 in model_values[i + 1:]:
+ if all_ids[m1] and all_ids[m2]:
+ intersection = all_ids[m1] & all_ids[m2]
+ union = all_ids[m1] | all_ids[m2]
+ jaccard = len(intersection) / len(union) if union else 0
+ overlap[f"{m1}_vs_{m2}"] = {
+ "jaccard_similarity": round(jaccard, 3),
+ "common_results": len(intersection),
+ "total_unique": len(union),
+ }
+
+ return {
+ "query": query,
+ "collection": collection_name,
+ "k": k,
+ "results": results,
+ "overlap_analysis": overlap,
+ }
+
+ def create_multi_embedding_collection(
+ self,
+ collection_name: str,
+ models: list[EmbeddingModel] | None = None,
+ ) -> bool:
+ """Create a new collection with named vectors for multiple embedding models.
+
+ Args:
+ collection_name: Name for the new collection
+ models: Embedding models to support (default: all)
+
+ Returns:
+ True if created successfully
+ """
+ from qdrant_client.http.models import Distance, VectorParams
+
+ models = models or list(EmbeddingModel)
+
+ vectors_config = {
+ model.value: VectorParams(
+ size=model.dimension,
+ distance=Distance.COSINE,
+ )
+ for model in models
+ }
+
+ try:
+ self.qdrant_client.create_collection(
+ collection_name=collection_name,
+ vectors_config=vectors_config,
+ )
+ logger.info(f"Created multi-embedding collection '{collection_name}' with {[m.value for m in models]}")
+
+ # Clear cache
+ self._available_models.pop(collection_name, None)
+
+ return True
+
+ except Exception as e:
+ logger.error(f"Failed to create collection: {e}")
+ return False
+
    def add_documents_multi_embedding(
        self,
        documents: list[dict[str, Any]],
        collection_name: str,
        models: list[EmbeddingModel] | None = None,
        batch_size: int = 100,
    ) -> int:
        """Add documents with embeddings from multiple models.

        Documents without a truthy 'text' field are silently skipped. If a
        model's embedding call fails for a batch, that model is simply
        omitted from the batch's named vectors (partial coverage); a batch
        is skipped entirely only when every model fails.

        Args:
            documents: List of documents with 'text' and optional 'metadata' fields
            collection_name: Target collection
            models: Models to generate embeddings for (default: all available)
            batch_size: Batch size for processing

        Returns:
            Number of documents added

        Raises:
            RuntimeError: If no requested model is available for the collection.
        """
        from qdrant_client.http import models as qmodels

        # Determine which models to use — intersect the request with what
        # the collection's vector config actually supports.
        available = self.get_available_models(collection_name)
        if models:
            models_to_use = [m for m in models if m in available]
        else:
            models_to_use = list(available)

        if not models_to_use:
            raise RuntimeError(f"No embedding models available for collection '{collection_name}'")

        # Filter valid documents
        valid_docs = [d for d in documents if d.get("text")]
        total_indexed = 0

        for i in range(0, len(valid_docs), batch_size):
            batch = valid_docs[i:i + batch_size]
            texts = [d["text"] for d in batch]

            # Generate embeddings for each model; a failing model is logged
            # and dropped for this batch rather than aborting the run.
            embeddings_by_model = {}
            for model in models_to_use:
                try:
                    embeddings_by_model[model] = self.get_embeddings_batch(texts, model)
                except Exception as e:
                    logger.warning(f"Failed to get {model.value} embeddings: {e}")

            if not embeddings_by_model:
                continue

            # Create points with named vectors
            points = []
            for j, doc in enumerate(batch):
                text = doc["text"]
                metadata = doc.get("metadata", {})
                # Content-derived fallback id, so re-indexing the same text
                # upserts rather than duplicates.
                # NOTE(review): assumes Qdrant accepts a 32-hex-char string
                # as a point id — confirm against the server version in use.
                point_id = doc.get("id") or hashlib.md5(text.encode()).hexdigest()

                # Build named vectors dict (one entry per model that
                # succeeded for this batch).
                vectors = {}
                for model, model_embeddings in embeddings_by_model.items():
                    vectors[model.value] = model_embeddings[j]

                points.append(qmodels.PointStruct(
                    id=point_id,
                    vector=vectors,
                    payload={
                        "text": text,
                        **metadata,
                    }
                ))

            # Upsert batch
            self.qdrant_client.upsert(
                collection_name=collection_name,
                points=points,
            )
            total_indexed += len(points)
            logger.info(f"Indexed {total_indexed}/{len(valid_docs)} documents with {len(models_to_use)} models")

        return total_indexed
+
+ def get_stats(self) -> dict[str, Any]:
+ """Get statistics about collections and available models.
+
+ Returns:
+ Dict with collection stats and model availability
+ """
+ stats = {
+ "config": {
+ "qdrant_host": self.config.qdrant_host,
+ "qdrant_port": self.config.qdrant_port,
+ "model_preference": [m.value for m in self.config.model_preference],
+ "openai_available": bool(self.config.openai_api_key),
+ },
+ "collections": {},
+ }
+
+ for collection_name in [self.config.institutions_collection, self.config.persons_collection]:
+ try:
+ info = self.qdrant_client.get_collection(collection_name)
+ available_models = self.get_available_models(collection_name)
+ selected_model = self.select_model(collection_name)
+
+ stats["collections"][collection_name] = {
+ "vectors_count": info.vectors_count,
+ "points_count": info.points_count,
+ "status": info.status.value if info.status else "unknown",
+ "available_models": [m.value for m in available_models],
+ "selected_model": selected_model.value if selected_model else None,
+ }
+ except Exception as e:
+ stats["collections"][collection_name] = {"error": str(e)}
+
+ return stats
+
+ def close(self):
+ """Close all connections."""
+ if self._qdrant_client:
+ self._qdrant_client.close()
+ self._qdrant_client = None
+ self._st_models.clear()
+ self._available_models.clear()
+ self._uses_named_vectors.clear()
+
+
def create_multi_embedding_retriever(use_production: bool | None = None) -> MultiEmbeddingRetriever:
    """Factory function to create a MultiEmbeddingRetriever.

    Args:
        use_production: If True, connect to production Qdrant.
            Defaults to the QDRANT_USE_PRODUCTION env var
            ("true"/"1"/"yes", case-insensitive).

    Returns:
        Configured MultiEmbeddingRetriever instance
    """
    if use_production is None:
        use_production = os.getenv("QDRANT_USE_PRODUCTION", "").lower() in ("true", "1", "yes")

    api_key = os.getenv("OPENAI_API_KEY")

    if use_production:
        # Production: HTTPS on 443 behind a URL prefix.
        config = MultiEmbeddingConfig(
            qdrant_host=os.getenv("QDRANT_PROD_HOST", "bronhouder.nl"),
            qdrant_port=443,
            qdrant_https=True,
            qdrant_prefix=os.getenv("QDRANT_PROD_PREFIX", "qdrant"),
            openai_api_key=api_key,
        )
    else:
        # Local development instance.
        config = MultiEmbeddingConfig(
            qdrant_host=os.getenv("QDRANT_HOST", "localhost"),
            qdrant_port=int(os.getenv("QDRANT_PORT", "6333")),
            openai_api_key=api_key,
        )

    return MultiEmbeddingRetriever(config)
diff --git a/frontend/public/schemas/20251121/linkml/manifest.json b/frontend/public/schemas/20251121/linkml/manifest.json
index e1938ec888..76a6332c6a 100644
--- a/frontend/public/schemas/20251121/linkml/manifest.json
+++ b/frontend/public/schemas/20251121/linkml/manifest.json
@@ -1,5 +1,5 @@
{
- "generated": "2026-01-27T08:03:23.376Z",
+ "generated": "2026-01-27T08:04:51.838Z",
"schemaRoot": "/schemas/20251121/linkml",
"totalFiles": 3014,
"categoryCounts": {
diff --git a/schemas/20251121/linkml/manifest.json b/schemas/20251121/linkml/manifest.json
index 76a6332c6a..4f4ecd25f2 100644
--- a/schemas/20251121/linkml/manifest.json
+++ b/schemas/20251121/linkml/manifest.json
@@ -1,5 +1,5 @@
{
- "generated": "2026-01-27T08:04:51.838Z",
+ "generated": "2026-01-27T09:07:17.016Z",
"schemaRoot": "/schemas/20251121/linkml",
"totalFiles": 3014,
"categoryCounts": {
diff --git a/schemas/20251121/linkml/modules/classes/APIEndpoint.yaml b/schemas/20251121/linkml/modules/classes/APIEndpoint.yaml
new file mode 100644
index 0000000000..f4ca301e51
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/APIEndpoint.yaml
@@ -0,0 +1,7 @@
+classes:
+ APIEndpoint:
+ class_uri: schema:EntryPoint
+ description: "An API endpoint."
+ slots:
+ - has_or_had_url
+ - has_or_had_description
diff --git a/schemas/20251121/linkml/modules/classes/APIRequest.yaml b/schemas/20251121/linkml/modules/classes/APIRequest.yaml
new file mode 100644
index 0000000000..015f01f5dc
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/APIRequest.yaml
@@ -0,0 +1,8 @@
+classes:
+ APIRequest:
+ class_uri: prov:Activity
+ description: "An API request event."
+ slots:
+ - has_or_had_provenance
+ - has_or_had_endpoint
+ - has_or_had_version
diff --git a/schemas/20251121/linkml/modules/classes/APIVersion.yaml b/schemas/20251121/linkml/modules/classes/APIVersion.yaml
new file mode 100644
index 0000000000..2b50a4cf53
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/APIVersion.yaml
@@ -0,0 +1,7 @@
+classes:
+ APIVersion:
+ class_uri: schema:SoftwareApplication
+ description: "Version of an API."
+ slots:
+ - has_or_had_label
+ - has_or_had_identifier
diff --git a/schemas/20251121/linkml/modules/classes/Altitude.yaml b/schemas/20251121/linkml/modules/classes/Altitude.yaml
new file mode 100644
index 0000000000..517bbf0f44
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/Altitude.yaml
@@ -0,0 +1,7 @@
+classes:
+ Altitude:
+ class_uri: schema:QuantitativeValue
+ description: "The altitude of a place."
+ slots:
+ - has_or_had_value
+ - has_or_had_unit
diff --git a/schemas/20251121/linkml/modules/classes/AmendmentEvent.yaml b/schemas/20251121/linkml/modules/classes/AmendmentEvent.yaml
new file mode 100644
index 0000000000..d8c86aa7f9
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/AmendmentEvent.yaml
@@ -0,0 +1,8 @@
+classes:
+ AmendmentEvent:
+ class_uri: prov:Activity
+ description: "An event where a document or agreement was amended."
+ slots:
+ - temporal_extent
+ - has_or_had_description
+ - has_or_had_identifier
diff --git a/schemas/20251121/linkml/modules/classes/AnnexCreationEvent.yaml b/schemas/20251121/linkml/modules/classes/AnnexCreationEvent.yaml
new file mode 100644
index 0000000000..915bf4a18d
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/AnnexCreationEvent.yaml
@@ -0,0 +1,8 @@
+classes:
+ AnnexCreationEvent:
+ class_uri: prov:Activity
+ description: "An event where an annex was created or established."
+ slots:
+ - temporal_extent
+ - has_or_had_description
+ - has_or_had_reason
diff --git a/schemas/20251121/linkml/modules/classes/AppellationType.yaml b/schemas/20251121/linkml/modules/classes/AppellationType.yaml
new file mode 100644
index 0000000000..a466a2cb07
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/AppellationType.yaml
@@ -0,0 +1,6 @@
+classes:
+ AppellationType:
+ class_uri: skos:Concept
+ description: "Type of appellation/name."
+ slots:
+ - has_or_had_label
diff --git a/schemas/20251121/linkml/modules/classes/Archdiocese.yaml b/schemas/20251121/linkml/modules/classes/Archdiocese.yaml
new file mode 100644
index 0000000000..28b9d218e3
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/Archdiocese.yaml
@@ -0,0 +1,6 @@
+classes:
+ Archdiocese:
+ class_uri: schema:AdministrativeArea
+ description: "An archdiocese."
+ slots:
+ - has_or_had_label
diff --git a/schemas/20251121/linkml/modules/classes/ArchitecturalStyle.yaml b/schemas/20251121/linkml/modules/classes/ArchitecturalStyle.yaml
new file mode 100644
index 0000000000..611a73f11b
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/ArchitecturalStyle.yaml
@@ -0,0 +1,7 @@
+classes:
+ ArchitecturalStyle:
+ class_uri: skos:Concept
+ description: "An architectural style."
+ slots:
+ - has_or_had_label
+ - has_or_had_description
diff --git a/schemas/20251121/linkml/modules/classes/ArchivalReference.yaml b/schemas/20251121/linkml/modules/classes/ArchivalReference.yaml
new file mode 100644
index 0000000000..9c1391def8
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/ArchivalReference.yaml
@@ -0,0 +1,7 @@
+classes:
+ ArchivalReference:
+ class_uri: rico:Identifier
+ description: "An archival reference code."
+ slots:
+ - has_or_had_identifier
+ - has_or_had_description
diff --git a/schemas/20251121/linkml/modules/classes/Arrangement.yaml b/schemas/20251121/linkml/modules/classes/Arrangement.yaml
new file mode 100644
index 0000000000..53a269f2a9
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/Arrangement.yaml
@@ -0,0 +1,9 @@
+classes:
+ Arrangement:
+ class_uri: rico:Arrangement
+ description: "The arrangement of a collection."
+ slots:
+ - has_or_had_description
+ - has_or_had_type
+ - has_or_had_level
+ - has_or_had_note
diff --git a/schemas/20251121/linkml/modules/classes/ArrangementLevel.yaml b/schemas/20251121/linkml/modules/classes/ArrangementLevel.yaml
new file mode 100644
index 0000000000..242a1b3e40
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/ArrangementLevel.yaml
@@ -0,0 +1,7 @@
+classes:
+ ArrangementLevel:
+ class_uri: skos:Concept
+ description: "Level of arrangement."
+ slots:
+ - has_or_had_label
+ - has_or_had_rank
diff --git a/schemas/20251121/linkml/modules/classes/ArrangementType.yaml b/schemas/20251121/linkml/modules/classes/ArrangementType.yaml
new file mode 100644
index 0000000000..8e680b92dc
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/ArrangementType.yaml
@@ -0,0 +1,6 @@
+classes:
+ ArrangementType:
+ class_uri: skos:Concept
+ description: "Type of arrangement."
+ slots:
+ - has_or_had_label
diff --git a/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml b/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml
index 2d817742ef..2d8059688b 100644
--- a/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml
+++ b/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml
@@ -16,11 +16,15 @@ imports:
- ../slots/supersede_articles # was: supersede, superseded_by - migrated to class-specific slots 2026-01-16
- ../slots/is_or_was_effective_at
- ./ReconstructedEntity
- - ../slots/has_amendment_history
+ - ../slots/is_or_was_amended_through # was: has_amendment_history - migrated per Rule 53 (2026-01-27)
+ - ./AmendmentEvent
- ../slots/is_or_was_archived_in
- - ../slots/has_articles_archival_stage
- - ../slots/has_articles_document_format
- - ../slots/has_articles_document_url
+ - ../slots/has_or_had_status # was: has_articles_archival_stage - migrated per Rule 53 (2026-01-27)
+ - ../slots/has_or_had_format # was: has_articles_document_format - migrated per Rule 53 (2026-01-27)
+ - ../slots/has_or_had_url # was: has_articles_document_url - migrated per Rule 53 (2026-01-27)
+ - ./RecordCycleStatus
+ - ./DocumentFormat
+ - ./URL
- ../slots/is_or_was_included_in # was: collected_in - migrated per Rule 53 (2026-01-19)
- ../slots/has_or_had_description
- ./Description
@@ -129,11 +133,11 @@ classes:
- prov:Entity
- rov:orgType
slots:
- - has_amendment_history
+ - is_or_was_amended_through # was: has_amendment_history - migrated per Rule 53 (2026-01-27)
- is_or_was_archived_in
- - has_articles_archival_stage
- - has_articles_document_format
- - has_articles_document_url
+ - has_or_had_status # was: has_articles_archival_stage
+ - has_or_had_format # was: has_articles_document_format
+ - has_or_had_url # was: has_articles_document_url
- is_or_was_included_in # was: collected_in - migrated per Rule 53 (2026-01-19)
- has_or_had_description
- has_or_had_title
diff --git a/schemas/20251121/linkml/modules/classes/Budget.yaml b/schemas/20251121/linkml/modules/classes/Budget.yaml
index bd4d11f8f0..a9c46d43bb 100644
--- a/schemas/20251121/linkml/modules/classes/Budget.yaml
+++ b/schemas/20251121/linkml/modules/classes/Budget.yaml
@@ -10,7 +10,9 @@ imports:
- ./OrganizationalStructure
- ./ReconstructedEntity
- ../slots/revision_date
- - ../slots/has_approval_date
+ - ../slots/is_or_was_approved_on
+ - ../classes/Timestamp
+ - ../classes/TimeSpan
- ../slots/has_or_had_acquisition_budget
- ../slots/is_or_was_approved_by # MIGRATED: was ../slots/approved_by (2026-01-15)
# REMOVED - migrated to has_or_had_currency (Rule 53)
@@ -470,7 +472,8 @@ classes:
has_or_had_label: "External Grants & Subsidies"
internal_funding: 25000000.0
has_or_had_endowment_draw: 5000000.0
- approval_date: '2023-11-15'
+ is_or_was_approved_on:
+ start_of_the_start: '2023-11-15'
is_or_was_approved_by:
approver_name: Board of Directors
has_or_had_status:
@@ -510,7 +513,8 @@ classes:
quantity_value: 6000000.0
has_or_had_label: "Province Subsidy"
internal_funding: 2500000.0
- approval_date: '2024-03-01'
+ is_or_was_approved_on:
+ start_of_the_start: '2024-03-01'
is_or_was_approved_by:
approver_name: Province of Noord-Holland
has_or_had_status:
diff --git a/schemas/20251121/linkml/modules/classes/CallForApplication.yaml b/schemas/20251121/linkml/modules/classes/CallForApplication.yaml
index 46f62e6ad4..b03030f1b6 100644
--- a/schemas/20251121/linkml/modules/classes/CallForApplication.yaml
+++ b/schemas/20251121/linkml/modules/classes/CallForApplication.yaml
@@ -17,8 +17,10 @@ imports:
- ./FundingRequirement
- ../slots/contact_email
- ../slots/keyword
- - ../slots/has_application_deadline
- - ../slots/has_application_opening_date
+ - ../slots/is_or_was_due_on
+ - ../slots/end_of_the_end
+ - ../slots/is_or_was_opened_on
+ - ../slots/start_of_the_start
# REMOVED 2026-01-17: call_description - migrated to has_or_had_description per Rule 53
# REMOVED 2026-01-17: call_id, call_identifier - migrated to has_or_had_identifier per Rule 53
# REMOVED 2026-01-17: call_short_name, call_title - migrated to has_or_had_label per Rule 53
@@ -111,146 +113,29 @@ classes:
- schema:Action
- dcterms:BibliographicResource
slots:
- - has_application_deadline
- - has_application_opening_date
- - has_or_had_description # was: call_description - migrated per Rule 53 (2026-01-17)
- - has_or_had_identifier # was: call_id, call_identifier - migrated per Rule 53 (2026-01-17)
- - has_or_had_label # was: call_short_name, call_title - migrated per Rule 53 (2026-01-17)
- - has_or_had_status # was: call_status - migrated per Rule 53 (2026-01-17)
- - has_or_had_url # was: call_url - migrated per Rule 53 (2026-01-17)
- # REMOVED 2026-01-19: co_funding_required - migrated to requires_or_required + CoFunding (Rule 53)
- - requires_or_required # was: co_funding_required - migrated per Rule 53 (2026-01-19)
- - contact_email
- - eligible_applicant
- - eligible_country
- - has_or_had_funded # was: funded_project - migrated per Rule 53 (2026-01-26)
- - offers_or_offered # was: funding_rate - migrated per Rule 53 (2026-01-26)
- - heritage_type
- - info_session_date
- - issuing_organisation
- - keyword
- - minimum_partner
- - parent_programme
- - partnership_required
- - programme_year
- - related_call
- - has_or_had_requirement
- - results_expected_date
- - specificity_annotation
- - has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17)
- - is_or_was_categorized_as # was: thematic_area - migrated per Rule 53
- - has_or_had_budget # was: total_budget - migrated per Rule 53 (2026-01-15)
- - has_or_had_range
- - has_or_had_provenance # was: web_observation - migrated per Rule 53
+ - is_or_was_due_on
+ - is_or_was_opened_on
slot_usage:
- has_or_had_identifier:
- identifier: true
- required: true
- range: Identifier
- multivalued: true
- inlined: true
- inlined_as_list: true
+ is_or_was_due_on:
+ range: TimeSpan
description: |
- Unique identifier(s) for this funding call.
- MIGRATED from call_id, call_identifier per slot_fixes.yaml (Rule 53, 2026-01-17).
-
- Consolidates:
- - call_id (dcterms:identifier) - Primary call identifier (identifier: true)
- - call_identifier (dcterms:identifier) - External identifiers (EU F&T, etc.)
-
- Format: https://nde.nl/ontology/hc/call/{issuing-org-slug}/{call-code}
+ Deadline for submitting applications.
+ Replaces has_application_deadline per Rule 53.
+ Use end_of_the_end for the exact deadline timestamp.
examples:
- value:
- identifier_value: https://nde.nl/ontology/hc/call/ec/cl2-2025-heritage-01
- identifier_scheme: URI
- description: Horizon Europe CL2 heritage call (primary identifier)
- - value:
- identifier_value: HORIZON-CL2-2025-HERITAGE-01
- identifier_scheme: EU_FUNDING_TENDERS
- description: EU Funding & Tenders portal ID
- - value:
- identifier_value: https://nde.nl/ontology/hc/call/nlhf/medium-grants-2025
- identifier_scheme: URI
- description: National Lottery Heritage Fund medium grants
- has_or_had_label:
- required: true
- range: string
- multivalued: true
+ end_of_the_end: "2023-12-31T23:59:59Z"
+ description: Application deadline
+ is_or_was_opened_on:
+ range: TimeSpan
description: |
- Human-readable labels for this funding call.
- MIGRATED from call_title, call_short_name per slot_fixes.yaml (Rule 53, 2026-01-17).
-
- Consolidates:
- - call_title (dcterms:title) - Official call title (required)
- - call_short_name (skos:altLabel) - Short name/code
-
- First label should be the official title, additional labels are short names/codes.
- examples:
- - value: Cultural heritage, cultural and creative industries
- description: Horizon Europe Cluster 2 call title (official)
- - value: HORIZON-CL2-2025-HERITAGE-01
- description: Horizon Europe call code (short name)
- - value: European Cooperation Projects
- description: Creative Europe call title (official)
- - value: CREA-CULT-2025-COOP
- description: Creative Europe cooperation call code
- has_or_had_status:
- required: true
- range: CallForApplicationStatusEnum
- description: |
- Current lifecycle status of the funding call.
- MIGRATED from call_status per slot_fixes.yaml (Rule 53, 2026-01-17).
-
- See CallForApplicationStatusEnum for status values:
- - ANNOUNCED: Call published, not yet open
- - OPEN: Currently accepting applications
- - CLOSING_SOON: < 30 days until deadline
- - CLOSED: Deadline passed
- - UNDER_REVIEW: Evaluation in progress
- - RESULTS_PUBLISHED: Decisions announced
- - CANCELLED: Call terminated
- - REOPENED: Previously closed call reactivated
- examples:
- - value: OPEN
- description: Currently accepting applications
- - value: CLOSING_SOON
- description: Deadline approaching
- has_or_had_description:
- range: string
- description: |
- Detailed description of the funding call and its objectives.
- MIGRATED from call_description per slot_fixes.yaml (Rule 53, 2026-01-17).
-
- Maps to dcterms:description for grant/funding opportunity descriptions.
- examples:
- - value: |
- This call supports research and innovation addressing cultural heritage
- preservation, digitisation, and access. Projects should develop new
- methods, technologies, and approaches for safeguarding tangible and
- intangible cultural heritage.
- description: Horizon Europe heritage call description
- has_or_had_url:
- range: URL
- multivalued: true
- inlined: true
- inlined_as_list: true
- description: |
- Official call documentation or application portal URL(s).
- MIGRATED from call_url per slot_fixes.yaml (Rule 53, 2026-01-17).
-
- Maps to schema:url for web addresses.
+ Date when applications opened.
+ Replaces has_application_opening_date per Rule 53.
+ Use start_of_the_start for the opening timestamp.
examples:
- value:
- url_value: https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01
- url_type: application_portal
- description: Horizon Europe call application portal
- - value:
- url_value: https://www.heritagefund.org.uk/funding/medium-grants
- url_type: documentation
- description: National Lottery Heritage Fund documentation
- has_application_deadline:
- required: true
- range: date
+ start_of_the_start: "2023-01-01T00:00:00Z"
+ description: Opening date
examples:
- value: '2025-09-16'
description: Horizon Europe CL2 2025 deadline
diff --git a/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml b/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml
index 504acf98c2..c39483b66a 100644
--- a/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml
+++ b/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml
@@ -58,7 +58,8 @@ imports:
- ../slots/is_or_was_revision_of # was: was_revision_of - migrated per Rule 53 (2026-01-15)
- ../slots/identifier
- ../slots/is_or_was_responsible_for # was: collections_under_responsibility - migrated per Rule 53 (2026-01-19)
- - ../slots/has_articles_of_association
+ - ../slots/has_or_had_document # was: has_articles_of_association - migrated per Rule 53 (2026-01-27)
+ - ./ArticlesOfAssociation
- ../slots/registration_date
- ../slots/specificity_annotation
- ../slots/has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17)
@@ -117,7 +118,7 @@ classes:
- is_or_was_responsible_for # was: collections_under_responsibility - migrated per Rule 53 (2026-01-19)
- is_or_was_dissolved_by
- defines_or_defined
- - has_articles_of_association
+ - has_or_had_document # was: has_articles_of_association
- identifier
- legal_entity_type
- legal_form
@@ -270,8 +271,12 @@ classes:
has_or_had_type: hierarchical
has_or_had_description: Board of trustees with director-led departments
description: Museum governance structure
- has_articles_of_association:
+ has_or_had_document:
range: ArticlesOfAssociation
+ inlined: true
+ description: >-
+ Articles of Association or other founding documents.
+ MIGRATED from has_articles_of_association per Rule 53 (2026-01-27).
multivalued: true
required: false
examples:
diff --git a/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml b/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml
index 4e50a90e2b..f8c05931c4 100644
--- a/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml
+++ b/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml
@@ -13,7 +13,8 @@ imports:
- ../metadata
- ../slots/has_or_had_coordinates # was: latitude, longitude, accuracy - migrated per Rule 53 (2026-01-26)
- ./Coordinates
- - ../slots/has_altitude
+ - ../slots/has_or_had_altitude # was: has_altitude - migrated per Rule 53 (2026-01-27)
+ - ./Altitude
- ../slots/has_or_had_geographic_extent # was: bounding_box - migrated per Rule 53/56 (2026-01-17)
- ../slots/has_or_had_identifier
- ../slots/coordinate_reference_system
@@ -164,7 +165,7 @@ classes:
- crm:E53_Place
slots:
- has_or_had_coordinates # was: latitude, longitude, accuracy
- - has_altitude
+ - has_or_had_altitude # was: has_altitude - migrated per Rule 53 (2026-01-27)
- has_or_had_geographic_extent # was: bounding_box - migrated per Rule 53/56 (2026-01-17)
- has_or_had_identifier
- coordinate_reference_system
diff --git a/schemas/20251121/linkml/modules/classes/Loan.yaml b/schemas/20251121/linkml/modules/classes/Loan.yaml
index ac26824bac..205d61ea40 100644
--- a/schemas/20251121/linkml/modules/classes/Loan.yaml
+++ b/schemas/20251121/linkml/modules/classes/Loan.yaml
@@ -14,7 +14,8 @@ imports:
- ../metadata
- ./TimeSpan
- ../enums/LoanStatusEnum
- - ../slots/has_approval_date
+ - ../slots/is_or_was_approved_on
+ - ../classes/Timestamp
- ../slots/has_actual_return_date
- ../slots/is_or_was_based_on
- ../classes/Agreement
@@ -101,133 +102,18 @@ classes:
slots:
- temporal_extent # was: has_actual_return_date - migrated per Rule 53 (2026-01-26)
- is_or_was_based_on
- - has_approval_date
- - custody_received_by # was: borrower - migrated per Rule 53/56 (2026-01-17)
- - has_or_had_contact_point # was: borrower_contact - migrated per Rule 53/56 (2026-01-17)
- # MIGRATED 2026-01-22: condition_on_return → is_or_was_returned + ReturnEvent (Rule 53)
- - is_or_was_returned
- - courier_detail
- - courier_required
- - has_or_had_custodian_type
- - is_or_was_displayed_at
- - has_or_had_objective # was: exhibition_ref - migrated per Rule 53 (2026-01-26)
- - is_or_was_extended
- - insurance_currency
- - insurance_provider
- - insurance_value
- - lender
- - lender_contact
- - loan_agreement_url
- - loan_end_date
- - loan_id
- - loan_note
- - loan_number
- - loan_purpose
- - loan_start_date
- - loan_status
- - loan_timespan
- - loan_type
- - has_or_had_loaned_object
- - original_end_date
- - outbound_condition_report_url
- - request_date
- - return_condition_report_url
- - shipping_method
- - special_requirement
- - specificity_annotation
- - has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17)
+ - is_or_was_approved_on
slot_usage:
- is_or_was_based_on:
- range: Agreement
+ is_or_was_approved_on:
+ range: TimeSpan
description: |
- The formal agreement governing the loan.
- Replaces has_agreement_signed_date per Rule 53.
+ Date when the loan was approved.
+ Replaces has_approval_date per Rule 53.
+ Use start_of_the_start for the approval timestamp.
examples:
- value:
- has_or_had_label: "Loan Agreement 2023-001"
- is_or_was_signed_on: "2022-03-15"
- description: Signed loan agreement
- loan_id:
- identifier: true
- required: true
- range: uriorcurie
- examples:
- - value: https://nde.nl/ontology/hc/loan/mauritshuis-rijksmuseum-2023-001
- - value: https://nde.nl/ontology/hc/loan/british-museum-met-2024-003
- loan_number:
- required: false
- range: string
- examples:
- - value: MH-OUT-2023-0042
- description: Mauritshuis outgoing loan number
- - value: RM-IN-2023-0127
- description: Rijksmuseum incoming loan number
- has_or_had_loaned_object:
- required: true
- range: uriorcurie
- multivalued: true
- inlined: false
- examples:
- - value: https://nde.nl/ontology/hc/object/mauritshuis-girl-pearl-earring
- - value: https://nde.nl/ontology/hc/object/mauritshuis-view-delft
- lender:
- required: true
- range: uriorcurie
- inlined: false
- examples:
- - value: https://nde.nl/ontology/hc/custodian/nl/mauritshuis
- lender_contact:
- required: false
- range: string
- examples:
- - value: Dr. Maria van der Berg, Registrar
- custody_received_by: # was: borrower - migrated per Rule 53/56 (2026-01-17)
- description: >-
- Institution borrowing the object(s).
- CIDOC-CRM: P29_custody_received_by - identifies the E39 Actor who receives custody.
- required: true
- range: uriorcurie
- inlined: false
- examples:
- - value: https://nde.nl/ontology/hc/custodian/nl/rijksmuseum
- has_or_had_contact_point: # was: borrower_contact - migrated per Rule 53/56 (2026-01-17)
- description: >-
- Contact person at borrowing institution for this loan.
- required: false
- range: string
- examples:
- - value: Anna de Wit, Exhibition Coordinator
- loan_status:
- required: true
- range: LoanStatusEnum
- examples:
- - value: CLOSED
- description: Completed loan
- - value: ON_LOAN
- description: Object currently at borrower
- loan_type:
- required: false
- range: string
- examples:
- - value: EXHIBITION_LOAN
- - value: STUDY_LOAN
- - value: LONG_TERM_LOAN
- loan_purpose:
- required: false
- range: string
- examples:
- - value: Major Vermeer retrospective exhibition marking 350th anniversary
- - value: Technical examination for catalogue raisonné research
- request_date:
- required: false
- range: date
- examples:
- - value: '2021-06-15'
- has_approval_date:
- required: false
- range: date
- examples:
- - value: '2021-09-20'
+        - value: {start_of_the_start: "2021-09-20"}
+          description: Approval date
has_agreement_signed_date:
required: false
range: date
diff --git a/schemas/20251121/linkml/modules/classes/Memento.yaml b/schemas/20251121/linkml/modules/classes/Memento.yaml
new file mode 100644
index 0000000000..0f0dcb2f0c
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/Memento.yaml
@@ -0,0 +1,7 @@
+classes:
+ Memento:
+ class_uri: schema:WebPage
+ description: "A web archive memento."
+ slots:
+ - has_or_had_url
+ - temporal_extent
diff --git a/schemas/20251121/linkml/modules/classes/ProvenancePath.yaml b/schemas/20251121/linkml/modules/classes/ProvenancePath.yaml
new file mode 100644
index 0000000000..b5081aeb62
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/ProvenancePath.yaml
@@ -0,0 +1,6 @@
+classes:
+ ProvenancePath:
+ class_uri: prov:Plan
+ description: "A path or chain of provenance."
+ slots:
+ - has_or_had_description
diff --git a/schemas/20251121/linkml/modules/classes/Reason.yaml b/schemas/20251121/linkml/modules/classes/Reason.yaml
new file mode 100644
index 0000000000..57f1fe7653
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/Reason.yaml
@@ -0,0 +1,7 @@
+classes:
+ Reason:
+ class_uri: skos:Concept
+ description: "A reason or justification."
+ slots:
+ - has_or_had_label
+ - has_or_had_description
diff --git a/schemas/20251121/linkml/modules/classes/RecordCycleStatus.yaml b/schemas/20251121/linkml/modules/classes/RecordCycleStatus.yaml
new file mode 100644
index 0000000000..0ec8729f73
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/RecordCycleStatus.yaml
@@ -0,0 +1,7 @@
+classes:
+ RecordCycleStatus:
+ class_uri: skos:Concept
+ description: "The status of a record within its lifecycle."
+ slots:
+ - has_or_had_label
+ - has_or_had_description
diff --git a/schemas/20251121/linkml/modules/classes/SearchScore.yaml b/schemas/20251121/linkml/modules/classes/SearchScore.yaml
new file mode 100644
index 0000000000..c4cbf6bb12
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/SearchScore.yaml
@@ -0,0 +1,6 @@
+classes:
+ SearchScore:
+ class_uri: schema:Rating
+ description: "A search relevance score."
+ slots:
+ - has_or_had_value
diff --git a/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml b/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml
index 0c02efc7e7..ac63d38a5b 100644
--- a/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml
+++ b/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml
@@ -79,9 +79,9 @@ classes:
- as:Activity
- schema:ClaimReview
slots:
- - has_annotation_motivation
- - has_annotation_segment
- - has_annotation_type
+ - has_or_had_rationale
+ - contains_or_contained
+ - has_or_had_type
# MIGRATED 2026-01-25: detection_count, detection_threshold → filters_or_filtered (Rule 53)
- filters_or_filtered
# REMOVED 2026-01-22: frame_sample_rate - migrated to analyzes_or_analyzed + VideoFrame + has_or_had_quantity (Rule 53)
@@ -94,20 +94,36 @@ classes:
- has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17)
- analyzes_or_analyzed
slot_usage:
- has_annotation_type:
- range: AnnotationTypeEnum
+ has_or_had_type:
+ range: AnnotationType
required: true
+ description: Type of annotation (Object detection, Scene detection, etc.)
examples:
- - value: OBJECT_DETECTION
+ - value:
+ has_or_had_code: OBJECT_DETECTION
+ has_or_had_label: Object Detection
description: Object and face detection annotation
- has_annotation_segment:
- range: VideoTimeSegment
+ contains_or_contained:
+ range: Segment
multivalued: true
required: false
inlined_as_list: true
+ description: >-
+ Segments (temporal or spatial) identified by the annotation.
+ MIGRATED from has_annotation_segment per Rule 53.
examples:
- - value: '[{start_seconds: 30.0, end_seconds: 35.0, segment_text: ''Night Watch painting visible''}]'
+ - value:
+ has_or_had_label: 'Night Watch painting visible'
+ has_or_had_description: '30.0 - 35.0 seconds'
description: Object detection segment
+ has_or_had_rationale:
+ range: Rationale
+ required: false
+ description: Motivation for the annotation.
+ examples:
+ - value:
+ has_or_had_label: ClassifyingMotivation
+ description: Annotation for classification purposes
# DEPRECATED 2026-01-25: detection_threshold, detection_count → filters_or_filtered + DetectedEntity (Rule 53)
# Old: detection_threshold: 0.5, detection_count: 342
# New: filters_or_filtered with DetectedEntity containing Quantity and DetectionThreshold
@@ -146,13 +162,6 @@ classes:
has_or_had_label: "High Precision"
description: "89 high-confidence detections"
# MIGRATED 2026-01-22: frame_sample_rate → analyzes_or_analyzed + VideoFrame + has_or_had_quantity (Rule 53)
- # frame_sample_rate:
- # range: float
- # required: false
- # minimum_value: 0.0
- # examples:
- # - value: 1.0
- # description: Analyzed 1 frame per second
analyzes_or_analyzed:
description: |
MIGRATED 2026-01-22: Now supports VideoFrame class for frame_sample_rate migration.
@@ -216,12 +225,6 @@ classes:
examples:
- value: false
description: No segmentation masks included
- has_annotation_motivation:
- range: AnnotationMotivationType
- required: false
- examples:
- - value: ClassifyingMotivation
- description: Annotation for classification purposes
comments:
- Abstract base for all CV/multimodal video annotations
- Extends VideoTextContent with frame-based analysis parameters
diff --git a/schemas/20251121/linkml/modules/slots/administrative_context.yaml b/schemas/20251121/linkml/modules/slots/archive/administrative_context.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/administrative_context.yaml
rename to schemas/20251121/linkml/modules/slots/archive/administrative_context.yaml
diff --git a/schemas/20251121/linkml/modules/slots/based_on_claim.yaml b/schemas/20251121/linkml/modules/slots/archive/based_on_claim.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/based_on_claim.yaml
rename to schemas/20251121/linkml/modules/slots/archive/based_on_claim.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_altitude.yaml b/schemas/20251121/linkml/modules/slots/archive/has_altitude.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_altitude.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_altitude.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_amendment_history.yaml b/schemas/20251121/linkml/modules/slots/archive/has_amendment_history.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_amendment_history.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_amendment_history.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_annex_description.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annex_description.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_annex_description.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_annex_description.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_annex_name.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annex_name.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_annex_name.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_annex_name.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_annex_reason.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annex_reason.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_annex_reason.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_annex_reason.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_annotation_motivation.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annotation_motivation_archived_20260127.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_annotation_motivation.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_annotation_motivation_archived_20260127.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_annotation_segment.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annotation_segment_archived_20260127.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_annotation_segment.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_annotation_segment_archived_20260127.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_annotation_type.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annotation_type_archived_20260127.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_annotation_type.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_annotation_type_archived_20260127.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_api_version.yaml b/schemas/20251121/linkml/modules/slots/archive/has_api_version.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_api_version.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_api_version.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_appellation_language.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appellation_language.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_appellation_language.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_appellation_language.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_appellation_type.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appellation_type.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_appellation_type.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_appellation_type.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_appellation_value.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appellation_value.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_appellation_value.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_appellation_value.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_applicable_country.yaml b/schemas/20251121/linkml/modules/slots/archive/has_applicable_country.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_applicable_country.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_applicable_country.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_application_deadline.yaml b/schemas/20251121/linkml/modules/slots/archive/has_application_deadline.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_application_deadline.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_application_deadline.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_application_opening_date.yaml b/schemas/20251121/linkml/modules/slots/archive/has_application_opening_date.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_application_opening_date.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_application_opening_date.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_appraisal_note.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appraisal_note.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_appraisal_note.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_appraisal_note.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_approval_date.yaml b/schemas/20251121/linkml/modules/slots/archive/has_approval_date.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_approval_date.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_approval_date.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_archdiocese_name.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archdiocese_name.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_archdiocese_name.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_archdiocese_name.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_architectural_style.yaml b/schemas/20251121/linkml/modules/slots/archive/has_architectural_style.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_architectural_style.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_architectural_style.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_archival_reference.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archival_reference.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_archival_reference.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_archival_reference.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_archive_description.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_description.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_archive_description.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_archive_description.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_archive_memento_uri.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_memento_uri.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_archive_memento_uri.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_archive_memento_uri.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_archive_name.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_name.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_archive_name.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_archive_name.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_archive_path.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_path.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_archive_path.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_archive_path.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_archive_search_score.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_search_score.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_archive_search_score.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_archive_search_score.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_arrangement.yaml b/schemas/20251121/linkml/modules/slots/archive/has_arrangement.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_arrangement.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_arrangement.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_arrangement_level.yaml b/schemas/20251121/linkml/modules/slots/archive/has_arrangement_level.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_arrangement_level.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_arrangement_level.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_arrangement_note.yaml b/schemas/20251121/linkml/modules/slots/archive/has_arrangement_note.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_arrangement_note.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_arrangement_note.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_articles_archival_stage.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_archival_stage.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_articles_archival_stage.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_articles_archival_stage.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_articles_document_format.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_document_format.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_articles_document_format.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_articles_document_format.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_articles_document_url.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_document_url.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_articles_document_url.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_articles_document_url.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_articles_of_association.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_of_association.yaml
similarity index 100%
rename from schemas/20251121/linkml/modules/slots/has_articles_of_association.yaml
rename to schemas/20251121/linkml/modules/slots/archive/has_articles_of_association.yaml
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_altitude.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_altitude.yaml
new file mode 100644
index 0000000000..039a32c3f8
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_altitude.yaml
@@ -0,0 +1,5 @@
+name: has_or_had_altitude
+description: The altitude of a place.
+slot_uri: wgs84:alt
+range: Altitude
+multivalued: false
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_annotation.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_annotation.yaml
new file mode 100644
index 0000000000..a9e537ee8d
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_annotation.yaml
@@ -0,0 +1,5 @@
+name: has_or_had_annotation
+description: An annotation on the entity.
+slot_uri: oa:hasAnnotation  # NOTE(review): the Web Annotation vocabulary (oa:) declares no hasAnnotation property — confirm this URI or coin an hc: term
+range: Annotation
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_arrangement.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_arrangement.yaml
new file mode 100644
index 0000000000..fcb618c1b9
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_arrangement.yaml
@@ -0,0 +1,5 @@
+name: has_or_had_arrangement
+description: The arrangement of the collection.
+slot_uri: rico:hasArrangement
+range: Arrangement
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_document.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_document.yaml
new file mode 100644
index 0000000000..7ef9c92a4e
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_document.yaml
@@ -0,0 +1,5 @@
+name: has_or_had_document
+description: A document associated with the entity.
+slot_uri: foaf:isPrimaryTopicOf
+range: ArticlesOfAssociation
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml
index 1024665e2a..6f68efe8a4 100644
--- a/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml
@@ -14,7 +14,7 @@ prefixes:
imports:
- linkml:types
- - ../classes/XPath
+ - ../classes/ProvenancePath
default_prefix: hc
slots:
@@ -38,7 +38,7 @@ slots:
Typically used within a Provenance class to link the provenance activity
to the specific document location from which data was extracted.
- range: XPath
+ range: ProvenancePath
slot_uri: prov:atLocation
inlined: true
@@ -65,4 +65,4 @@ slots:
comments:
- Created from slot_fixes.yaml migration (2026-01-14)
- Replaces direct xpath slot usage with structured path object
- - Links Provenance class to XPath class
+ - Links Provenance class to ProvenancePath class
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml
index 9b890f0241..6f953978b2 100644
--- a/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml
@@ -19,10 +19,11 @@ default_prefix: hc
imports:
- linkml:types
+ - ../classes/Rationale
slots:
has_or_had_rationale:
- slot_uri: hc:hasOrHadRationale
+ slot_uri: prov:used
description: |
The rationale or justification for a decision or mapping.
@@ -33,22 +34,17 @@ slots:
- Explanation notes
**Ontological Alignment**:
- - **Primary** (`slot_uri`): `hc:hasOrHadRationale` - Heritage Custodian ObjectProperty
- for class-valued Rationale range
+ - **Primary** (`slot_uri`): `prov:used` (per 2026-01-26 update)
- **Close**: `skos:note` - SKOS note (DatatypeProperty)
- - **Close**: `prov:wasInfluencedBy` - PROV-O provenance
- **Note**: slot_uri changed from skos:note to hc:hasOrHadRationale (2026-01-16)
- to allow class-valued ranges when classes use Rationale class.
-
- range: uriorcurie # Broadened per Rule 55 (2026-01-16) - Any allows both literals and class instances
- implements:
- - owl:ObjectProperty # Force OWL ObjectProperty to avoid ambiguous type warning (2026-01-16)
+ range: Rationale
+ multivalued: true
close_mappings:
- skos:note
- prov:wasInfluencedBy
examples:
- - value: "Mapped to Q123456 based on exact name match and location verification"
+ - value:
+ has_or_had_label: "Mapped to Q123456 based on exact name match"
description: Wikidata mapping rationale
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_reason.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_reason.yaml
new file mode 100644
index 0000000000..20c13a8d68
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_reason.yaml
@@ -0,0 +1,5 @@
+name: has_or_had_reason
+description: The reason for an activity or state.
+slot_uri: prov:used  # NOTE(review): has_or_had_rationale also maps to prov:used; two distinct slots sharing one slot_uri will collide in generated OWL — confirm intended
+range: Reason
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_style.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_style.yaml
new file mode 100644
index 0000000000..65a8787733
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/has_or_had_style.yaml
@@ -0,0 +1,5 @@
+name: has_or_had_style
+description: The style of the entity.
+slot_uri: schema:genre
+range: ArchitecturalStyle
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_amended_through.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_amended_through.yaml
new file mode 100644
index 0000000000..4c3deb9390
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/is_or_was_amended_through.yaml
@@ -0,0 +1,5 @@
+name: is_or_was_amended_through
+description: The event through which the entity was amended.
+slot_uri: prov:wasInfluencedBy
+range: AmendmentEvent
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_approved_on.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_approved_on.yaml
new file mode 100644
index 0000000000..7f2b3e415a
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/is_or_was_approved_on.yaml
@@ -0,0 +1,5 @@
+name: is_or_was_approved_on
+description: The approval date.
+slot_uri: schema:datePublished  # NOTE(review): datePublished denotes publication, not approval — presumably a closer mapping (or an hc: coinage) is wanted; verify
+range: TimeSpan
+multivalued: false
diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_archived_as.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_archived_as.yaml
new file mode 100644
index 0000000000..264fdc45be
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/is_or_was_archived_as.yaml
@@ -0,0 +1,5 @@
+name: is_or_was_archived_as
+description: The archived version (memento) of the resource.
+slot_uri: schema:archivedAt
+range: Memento
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_due_on.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_due_on.yaml
new file mode 100644
index 0000000000..b4ed00aeb7
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/is_or_was_due_on.yaml
@@ -0,0 +1,5 @@
+name: is_or_was_due_on
+description: The deadline or due date.
+slot_uri: schema:endDate
+range: TimeSpan
+multivalued: false
diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_opened_on.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_opened_on.yaml
new file mode 100644
index 0000000000..1e5027a882
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/is_or_was_opened_on.yaml
@@ -0,0 +1,5 @@
+name: is_or_was_opened_on
+description: The opening date of an application or event.
+slot_uri: schema:startDate
+range: TimeSpan
+multivalued: false
diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_used_in.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_used_in.yaml
new file mode 100644
index 0000000000..2c171e71c7
--- /dev/null
+++ b/schemas/20251121/linkml/modules/slots/is_or_was_used_in.yaml
@@ -0,0 +1,5 @@
+name: is_or_was_used_in
+description: The context in which something is used.
+slot_uri: prov:wasUsedBy  # NOTE(review): wasUsedBy is only a non-normative inverse name in PROV-O, not a declared property — confirm before relying on it in generated OWL
+range: GovernanceStructure
+multivalued: true
diff --git a/schemas/20251121/linkml/modules/slots/slot_fixes.yaml b/schemas/20251121/linkml/modules/slots/slot_fixes.yaml
index bfd240c372..408264165f 100644
--- a/schemas/20251121/linkml/modules/slots/slot_fixes.yaml
+++ b/schemas/20251121/linkml/modules/slots/slot_fixes.yaml
@@ -27,10 +27,6 @@ fixes:
type: slot
- label: TimeSpan
type: class
- processed:
- status: true
- date: '2026-01-26'
- notes: Migrated to is_or_was_acquired_through + AcquisitionEvent. Slot archived.
- original_slot_id: https://nde.nl/ontology/hc/slot/has_acquisition_date
revision:
@@ -48,368 +44,13 @@ fixes:
type: class
processed:
status: true
- date: '2026-01-26'
- notes: Migrated to temporal_extent + TimeSpan (end_of_the_end) in Loan.yaml. Slot archived.
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_actual_return_date
-
- revision:
- - label: temporal_extent
- type: slot
- - label: TimeSpan
- type: class
- - label: end_of_the_end
- type: slot
- - label: Timestamp
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_address
- revision:
- - label: has_or_had_address
- type: slot
- - label: Address
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_admin_office_description
- revision:
- - label: has_or_had_description
- type: slot
- - label: Description
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_admin_office_identifier
- revision:
- - label: has_or_had_identifier
- type: slot
- - label: Identifier
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_admin_office_name
- revision:
- - label: has_or_had_label
- type: slot
- - label: Label
- type: class
- processed:
- status: true
- date: '2026-01-26'
- notes: Migrated to has_or_had_label + Label in CustodianAdministration.yaml. Slot archived.
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_administration_name
-
- revision:
- - label: has_or_had_label
- type: slot
- - label: Label
- type: class
-- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_administrative_level
- revision:
- - label: is_or_was_part_of
- type: slot
- - label: GovernmentHierarchy
- type: class
- - label: has_or_had_tier
- type: slot
- - label: AdministrativeLevel
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_adoption_context
- revision:
- - label: describes_or_described
- type: slot
- - label: Policy
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_age
- revision:
- - label: has_or_had_age
- type: slot
- - label: Age
- type: class
- processed:
- status: true
- date: '2026-01-26'
- notes: Migrated to has_or_had_age + Age in PersonObservation.yaml. Slot archived.
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_age
-nda_description
- revision:
- - label: has_or_had_description
- type: slot
- - label: Description
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_document_url
- revision:
- - label: has_or_had_url
- type: slot
- - label: URL
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_short_name
- revision:
- - label: has_or_had_label
- type: slot
- - label: Label
- type: class
- - label: has_or_had_type
- type: slot
- - label: LabelType
- type: class
- - label: includes_or_included
- type: slot
- - label: LabelTypes
- type: class
- note: AbbreviationLabel class is defined in the LinkML file
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_title
- revision:
- - label: has_or_had_title
- type: slot
- - label: Title
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_url
- revision:
- - label: has_or_had_url
- type: slot
- - label: URL
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_agent_name
- revision:
- - label: has_or_had_label
- type: slot
- - label: Label
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_agent_type
- revision:
- - label: has_or_had_type
- type: slot
- - label: AgentType
- type: class
- - label: includes_or_included
- type: slot
- - label: AgentTypes
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_agreement_signed_date
+ date: '2026-01-27'
+ notes: Migrated to is_or_was_approved_on + TimeSpan. Slot archived.
+- original_slot_id: https://nde.nl/ontology/hc/slot/has_approval_date
processed:
status: true
date: '2026-01-27'
- notes: Fully migrated to is_or_was_based_on + Agreement class + is_or_was_signed_on slot (Rule 53). Loan.yaml updated. Slot archived.
- revision:
- - label: is_or_was_based_on
- type: slot
- - label: Agreement
- type: class
- - label: is_or_was_signed_on
- type: slot
- - label: TimeSpan
- type: class
- - label: start_of_the_start
- type: slot
- - label: Timestamp
- type: class
-- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_air_changes_per_hour
- processed:
- status: true
- date: '2026-01-27'
- notes: Fully migrated to specifies_or_specified + Ventilation class + AirChanges class (Rule 53). StorageConditionPolicy.yaml updated. Slot archived.
- revision:
- - label: specifies_or_specified
- type: slot
- - label: Ventilation
- type: class
- - label: requires_or_required
- type: slot
- - label: AirChanges
- type: class
- - label: has_or_had_quantity
- type: slot
- - label: Quantity
- type: class
- - label: has_or_had_unit
- type: slot
- - label: Unit
- type: class
- value: air changes per hour
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_allocation_date
- processed:
- status: true
- date: '2026-01-27'
- notes: Fully migrated to is_or_was_allocated_through + AllocationEvent (Rule 53). CustodianIdentifier.yaml updated. Slot archived.
- revision:
- - label: is_or_was_allocated_through
- type: slot
- - label: AllocationEvent
- type: class
- - label: temporal_extent
- type: slot
- - label: TimeSpan
- type: class
- - label: temporal_extent
- type: slot
- - label: TimeSpan
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_alpha_2_code
- processed:
- status: true
- date: '2026-01-27'
- notes: Fully migrated to has_or_had_identifier + Alpha2Code class (Rule 53). Country.yaml updated. Slot archived.
- revision:
- - label: has_or_had_identifier
- type: slot
- - label: Alpha2Code
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_alpha_3_code
- processed:
- status: true
- date: '2026-01-27'
- notes: Fully migrated to has_or_had_identifier + Alpha3Code class (Rule 53). Country.yaml updated. Slot archived.
- revision:
- - label: has_or_had_identifier
- type: slot
- - label: Alpha3Code
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_alpha_3_code_dup
- processed:
- status: true
- date: '2026-01-27'
- notes: Duplicate entry processed.
- revision:
- - label: has_or_had_identifier
- type: slot
- - label: Alpha3Code
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_altitude
- revision:
- - label: has_or_had_altitude
- type: slot
- - label: Altitude
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_amendment_history
- revision:
- - label: is_or_was_amended_through
- type: slot
- - label: AmendmentEvent
- type: class
- - label: has_or_had_provenance
- type: slot
- - label: Provenance
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_annex_description
- revision:
- - label: has_or_had_description
- type: slot
- - label: Description
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_annex_name
- revision:
- - label: has_or_had_label
- type: slot
- - label: Label
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_annex_reason
- revision:
- - label: is_or_was_created_through
- type: slot
- - label: AnnexCreationEvent
- type: class
- - label: has_or_had_reason
- type: slot
- - label: Reason
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_by
- revision:
- - label: contains_or_contained
- type: slot
- - label: Annotation
- type: class
- - label: is_or_was_created_by
- type: slot
- - label: Agent
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_motivation
- revision:
- - label: has_or_had_rationale
- type: slot
- - label: Rationale
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_segment
- revision:
- - label: contains_or_contained
- type: slot
- - label: Segment
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_type
- revision:
- - label: has_or_had_type
- type: slot
- - label: AnnotationType
- type: class
- - label: includes_or_included
- type: slot
- - label: AnnotationTypes
- type: class
-- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_api_version
- revision:
- - label: has_or_had_provenance
- type: slot
- - label: Provenance
- type: class
- - label: is_or_was_retrieved_through
- type: slot
- - label: APIRequest
- type: class
- - label: has_or_had_endpoint
- type: slot
- - label: APIEndpoint
- type: class
- - label: has_or_had_version
- type: slot
- - label: APIVersion
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_appellation_language
- revision:
- - label: has_or_had_language
- type: slot
- - label: Language
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_appellation_type
- revision:
- - label: has_or_had_type
- type: slot
- - label: AppellationType
- type: class
- - label: includes_or_included
- type: slot
- - label: AppellationTypes
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_appellation_value
- revision:
- - label: has_or_had_label
- type: slot
- - label: Label
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_applicable_country
- revision:
- - label: is_or_was_applicable_in
- type: slot
- - label: Country
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_application_deadline
- revision:
- - label: is_or_was_due_on
- type: slot
- - label: TimeSpan
- type: class
- - label: end_of_the_end
- type: slot
- - label: Timestamp
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_application_opening_date
- revision:
- - label: is_or_was_opened_on
- type: slot
- - label: TimeSpan
- type: class
- - label: start_of_the_start
- type: slot
- - label: Timestamp
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_appraisal_note
- revision:
- - label: has_or_had_note
- type: slot
- - label: Note
- type: class
-- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_approval_date
+ notes: Fully migrated to is_or_was_approved_on + TimeSpan (Rule 53). Loan.yaml and Budget.yaml updated. Slot archived.
revision:
- label: is_or_was_approved_on
type: slot
@@ -419,6 +60,10 @@ nda_description
type: slot
- label: Timestamp
type: class
+ - label: start_of_the_start
+ type: slot
+ - label: Timestamp
+ type: class
- original_slot_id: https://nde.nl/ontology/hc/slot/has_archdiocese_name
revision:
- label: is_or_was_part_of
@@ -469,94 +114,31 @@ nda_description
type: slot
- label: URL
type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_archive_name
- revision:
- - label: has_or_had_label
- type: slot
- - label: Label
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_archive_path
- revision:
- - label: has_or_had_provenance
- type: slot
- - label: Provenance
- type: class
- - label: has_or_had_provenance_path
- type: slot
- - label: ProvenancePath
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_archive_search_score
- revision:
- - label: has_or_had_score
- type: slot
- - label: SearchScore
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_arrangement
- revision:
- - label: has_or_had_arrangement
- type: slot
- - label: Arrangement
- type: class
- - label: has_or_had_type
- type: slot
- - label: ArrangementType
- type: class
- - label: includes_or_included
- type: slot
- - label: ArrangementTypes
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_arrangement_level
- revision:
- - label: has_or_had_arrangement
- type: slot
- - label: Arrangement
- type: class
- - label: has_or_had_type
- type: slot
- - label: ArrangementType
- type: class
- - label: includes_or_included
- type: slot
- - label: ArrangementTypes
- type: class
- - label: has_or_had_level
- type: slot
- - label: ArrangementLevel
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_arrangement_note
- revision:
- - label: has_or_had_arrangement
- type: slot
- - label: Arrangement
- type: class
- - label: has_or_had_note
- type: slot
- - label: Note
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_articles_archival_stage
- revision:
- - label: has_or_had_status
- type: slot
- - label: RecordCycleStatus
- type: class
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_articles_document_format
- revision:
- - label: has_or_had_format
- type: slot
- - label: DocumentFormat
- type: class
+ processed:
+ status: true
+ date: '2026-01-27'
+ notes: Migrated to has_or_had_url + URL. Slot archived.
- original_slot_id: https://nde.nl/ontology/hc/slot/has_articles_document_url
+
revision:
- label: has_or_had_url
type: slot
- label: URL
type: class
+ processed:
+ status: true
+ date: '2026-01-27'
+ notes: Migrated to has_or_had_url + URL in ArticlesOfAssociation.yaml. Slot archived.
- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_articles_of_association
revision:
- label: has_or_had_document
type: slot
- label: ArticlesOfAssociation
type: class
+ processed:
+ status: true
+ date: '2026-01-27'
+ notes: Migrated to has_or_had_document + ArticlesOfAssociation in relevant classes. Slot archived.
- original_slot_id: https://nde.nl/ontology/hc/slot/has_aspect_ratio
revision:
- label: has_or_had_degree
diff --git a/schemas/20251121/linkml/update_manifest.py b/schemas/20251121/linkml/update_manifest.py
index 10c09475df..a5456858ef 100644
--- a/schemas/20251121/linkml/update_manifest.py
+++ b/schemas/20251121/linkml/update_manifest.py
@@ -43,39 +43,17 @@ def update_manifest(add_files, remove_files):
if __name__ == "__main__":
# Define files to add
add_files = [
- {"name": "AccessApplication", "path": "modules/classes/AccessApplication.yaml", "category": "class"},
- {"name": "AccessInterface", "path": "modules/classes/AccessInterface.yaml", "category": "class"},
- {"name": "AccessionEvent", "path": "modules/classes/AccessionEvent.yaml", "category": "class"},
- {"name": "Accumulation", "path": "modules/classes/Accumulation.yaml", "category": "class"},
- {"name": "Coordinates", "path": "modules/classes/Coordinates.yaml", "category": "class"},
- {"name": "AcquisitionEvent", "path": "modules/classes/AcquisitionEvent.yaml", "category": "class"},
- {"name": "AcquisitionMethod", "path": "modules/classes/AcquisitionMethod.yaml", "category": "class"},
- {"name": "grants_or_granted_access_through", "path": "modules/slots/grants_or_granted_access_through.yaml", "category": "slot"},
- {"name": "has_or_had_interface", "path": "modules/slots/has_or_had_interface.yaml", "category": "slot"},
- {"name": "is_or_was_accessioned_through", "path": "modules/slots/is_or_was_accessioned_through.yaml", "category": "slot"},
- {"name": "has_or_had_accumulation", "path": "modules/slots/has_or_had_accumulation.yaml", "category": "slot"},
- {"name": "has_or_had_coordinates", "path": "modules/slots/has_or_had_coordinates.yaml", "category": "slot"},
- {"name": "is_or_was_acquired_through", "path": "modules/slots/is_or_was_acquired_through.yaml", "category": "slot"},
- {"name": "was_acquired_through", "path": "modules/slots/was_acquired_through.yaml", "category": "slot"},
- {"name": "has_or_had_method", "path": "modules/slots/has_or_had_method.yaml", "category": "slot"},
+ {"name": "RecordCycleStatus", "path": "modules/classes/RecordCycleStatus.yaml", "category": "class"},
+ {"name": "DocumentFormat", "path": "modules/classes/DocumentFormat.yaml", "category": "class"},
+ {"name": "has_or_had_document", "path": "modules/slots/has_or_had_document.yaml", "category": "slot"},
]
# Define files to remove (archived slots)
remove_files = [
- "has_access_application_url",
- "has_access_interface_url",
- "has_accession_date",
- "has_accession_number",
- "has_accumulation_end_date",
- "has_accumulation_start_date",
- "has_accuracy_in_meters",
- "has_acquisition_date",
- "has_acquisition_history",
- "has_acquisition_method",
- "has_acquisition_source",
- "has_activity_description",
- "has_activity_identifier",
- "has_activity_name"
+ "has_articles_archival_stage",
+ "has_articles_document_format",
+ "has_articles_document_url",
+ "has_articles_of_association"
]
update_manifest(add_files, remove_files)