"""
Hybrid Retriever: Vector Search + Knowledge Graph Expansion
Combines Qdrant vector similarity search with Oxigraph SPARQL graph expansion
to provide semantically-aware and structurally-enriched retrieval.
Architecture:
1. Vector Search (Qdrant) - Find semantically similar institutions AND persons
2. Graph Expansion (Oxigraph) - Expand via relationships:
- Same city/region
- Same institution type
- Related collections
- Organizational relationships
3. Re-ranking - Combine scores for final ranking
4. Query Routing - Detect if query is about institutions or persons
Collections:
- heritage_custodians: Institution data (27K+ records)
- heritage_persons: Staff/person data (10K+ records)
Example usage:
retriever = HybridRetriever(
qdrant_host="localhost",
qdrant_port=6333,
sparql_endpoint="http://localhost:7878/query"
)
# Institution search
results = retriever.search("museums with Dutch colonial history")
# Person search (auto-detected or explicit)
results = retriever.search("Who works at the Nationaal Archief?")
results = retriever.search_persons("archivist at Rijksmuseum")
"""
import hashlib
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from typing import Any, TYPE_CHECKING
import httpx
# Polygon filter for geographic containment testing (Dutch provinces)
from glam_extractor.geocoding.polygon_filter import (
get_polygon_filter,
ProvincePolygonFilter,
)
if TYPE_CHECKING:
from qdrant_client import QdrantClient
from openai import OpenAI
from sentence_transformers import SentenceTransformer
# Forward reference as string to avoid circular imports
MultiEmbeddingRetriever = Any # Actually from glam_extractor.api.multi_embedding_retriever
EmbeddingModel = Any # Actually from glam_extractor.api.multi_embedding_retriever
logger = logging.getLogger(__name__)
# SPARQL endpoint configuration
DEFAULT_SPARQL_ENDPOINT = os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query")
DEFAULT_SPARQL_TIMEOUT = 30.0

# Ontology prefixes prepended to every SPARQL query sent to Oxigraph.
# NOTE(review): the prefix IRIs were missing entirely (e.g. "PREFIX hc:" with
# no <...> target), which makes every generated query syntactically invalid.
# Standard vocabularies are restored below; the project-local hc/hcc/ghc
# namespaces are derived from ONTOLOGY_BASE ("https://nde.nl/ontology/hc") —
# TODO confirm the exact hcc/ghc IRIs against the data loaded in Oxigraph.
SPARQL_PREFIXES = """
PREFIX hc: <https://nde.nl/ontology/hc/>
PREFIX hcc: <https://nde.nl/ontology/hcc/>
PREFIX ghc: <https://nde.nl/ontology/ghc/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX schema: <http://schema.org/>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
"""
@dataclass
class RetrievedInstitution:
"""A retrieved heritage institution with combined scores."""
ghcid: str
name: str
uri: str
vector_score: float = 0.0
graph_score: float = 0.0
combined_score: float = 0.0
# Metadata from vector search
institution_type: str | None = None
country: str | None = None
city: str | None = None
description: str | None = None
# Geographic coordinates
latitude: float | None = None
longitude: float | None = None
# Graph expansion data
related_institutions: list[str] = field(default_factory=list)
expansion_reason: str | None = None # e.g., "same_city", "same_type", "related_collection"
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for API responses."""
return {
"ghcid": self.ghcid,
"name": self.name,
"uri": self.uri,
"scores": {
"vector": round(self.vector_score, 4),
"graph": round(self.graph_score, 4),
"combined": round(self.combined_score, 4),
},
"metadata": {
"institution_type": self.institution_type,
"country": self.country,
"city": self.city,
"description": self.description,
"latitude": self.latitude,
"longitude": self.longitude,
},
"graph_expansion": {
"related_institutions": self.related_institutions,
"expansion_reason": self.expansion_reason,
}
}
# ===================================================================
# Linked Data URI Generation Utilities
# ===================================================================
# Generate stable ontology-aligned URIs for Person and PersonObservation
# following the LinkML schema at schemas/20251121/linkml/
# Namespace: https://nde.nl/ontology/hc/
# ===================================================================
# NOTE(review): these imports are mid-file (PEP 8 prefers top-of-file);
# kept in place to avoid reordering module initialization.
import re
import unicodedata
# Ontology namespaces — base prefixes for all generated URIs below.
ONTOLOGY_BASE = "https://nde.nl/ontology/hc"
PERSON_HUB_PREFIX = f"{ONTOLOGY_BASE}/person"        # abstract person identity
PERSON_OBS_PREFIX = f"{ONTOLOGY_BASE}/person-obs"    # per-role observation of a person
CUSTODIAN_PREFIX = f"{ONTOLOGY_BASE}/custodian"      # heritage institution
# JSON-LD context for person search responses: maps the short keys emitted
# by RetrievedPerson.to_dict() onto schema.org / PiCo / PROV / FOAF terms.
PERSON_JSONLD_CONTEXT = {
    "@vocab": f"{ONTOLOGY_BASE}/",
    "schema": "http://schema.org/",
    "pico": "https://personsincontext.org/model#",
    "prov": "http://www.w3.org/ns/prov#",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "name": "schema:name",
    "jobTitle": "schema:jobTitle",
    "affiliation": "schema:affiliation",
    "sameAs": "schema:sameAs",
    "refers_to_person": "pico:observationOf",
    "observation_source": "prov:hadPrimarySource",
}
def generate_slug(text: str) -> str:
    """Build a URL-safe slug: ASCII-fold, lowercase, hyphen-separate.

    Examples:
        "Kitty Bogte" -> "kitty-bogte"
        "Dr. Jane Smith" -> "dr-jane-smith"
        "" -> "unknown"
    """
    if not text:
        return "unknown"
    # Strip diacritics: NFD decomposition, then drop combining marks (Mn).
    decomposed = unicodedata.normalize('NFD', text)
    folded = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Lowercase, turn every run of non-[a-z0-9] into a hyphen, trim edges.
    hyphenated = re.sub(r'[^a-z0-9]+', '-', folded.lower())
    cleaned = re.sub(r'-+', '-', hyphenated).strip('-')
    return cleaned or "unknown"
def generate_role_slug(headline: str | None) -> str:
"""Generate role slug from job title/headline.
Examples:
"Programmer/curator" → "programmer-curator"
"Senior Archivist" → "senior-archivist"
None → "staff"
"""
if not headline:
return "staff"
return generate_slug(headline)
def generate_person_hub_uri(name: str, linkedin_slug: str | None = None) -> str:
    """Build the abstract Person hub URI.

    Format: https://nde.nl/ontology/hc/person/{person-slug}

    The LinkedIn slug is preferred (stable across name changes); without one
    the slug is derived from the display name.

    Examples:
        generate_person_hub_uri("Kitty Bogte", "kittybogte")
        -> "https://nde.nl/ontology/hc/person/kittybogte"
    """
    slug = linkedin_slug or generate_slug(name)
    return f"{PERSON_HUB_PREFIX}/{slug}"
def generate_observation_uri(
    custodian_slug: str | None,
    person_name: str,
    role_slug: str | None = None,
    linkedin_slug: str | None = None
) -> str:
    """Build a PersonObservation URI.

    Format: https://nde.nl/ontology/hc/person-obs/{custodian-slug}/{person-slug}/{role-slug}

    Missing parts fall back to "unknown-custodian", a name-derived slug,
    and "staff" respectively.
    """
    segments = (
        custodian_slug or "unknown-custodian",
        linkedin_slug or generate_slug(person_name),
        role_slug or "staff",
    )
    return "/".join((PERSON_OBS_PREFIX,) + segments)
def generate_custodian_uri(custodian_slug: str | None, ghcid: str | None = None) -> str | None:
"""Generate Custodian URI.
Format: https://nde.nl/ontology/hc/custodian/{ghcid-or-slug}
"""
if ghcid:
return f"{CUSTODIAN_PREFIX}/{ghcid}"
elif custodian_slug:
return f"{CUSTODIAN_PREFIX}/{custodian_slug}"
return None
def extract_linkedin_slug(linkedin_url: str | None) -> str | None:
"""Extract slug from LinkedIn URL.
Examples:
"https://www.linkedin.com/in/kittybogte" → "kittybogte"
"https://linkedin.com/in/jane-smith-12345" → "jane-smith-12345"
"""
if not linkedin_url:
return None
match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url)
return match.group(1) if match else None
@dataclass
class RetrievedPerson:
    """A retrieved person/staff member with search scores and linked data URIs.

    Populated from the heritage_persons Qdrant collection payload; the
    *_uri properties derive stable linked-data identifiers on demand.
    """

    person_id: str
    name: str
    vector_score: float = 0.0
    combined_score: float = 0.0
    richness_score: float = 0.0  # Metadata richness score (0-1)
    # Metadata from vector search
    headline: str | None = None  # Job title/role
    custodian_name: str | None = None  # Organization they work at
    custodian_slug: str | None = None
    location: str | None = None
    heritage_relevant: bool = False
    heritage_type: str | None = None  # GLAMORCUBESFIXPHDNT code
    source_type: str | None = None  # "staff_list" or "entity_profile"
    linkedin_url: str | None = None
    has_wcms: bool = False  # WCMS-registered profile (heritage sector user)
    # WCMS-specific fields for display on review page
    wcms_user_id: str | None = None
    wcms_abs_id: str | None = None  # NAN identifier
    wcms_crm_id: str | None = None
    wcms_username: str | None = None
    wcms_username_url: str | None = None
    wcms_status: str | None = None  # "Active" or "Blocked"
    wcms_roles: list[str] | None = None
    wcms_registered_since: str | None = None
    wcms_last_access: str | None = None
    # Contact details
    email: str | None = None
    email_domain: str | None = None
    # Linked data fields (generated)
    linkedin_profile_path: str | None = None  # Path to entity JSON file

    @property
    def linkedin_slug(self) -> str | None:
        """Extract LinkedIn slug from URL (None when no URL is set)."""
        return extract_linkedin_slug(self.linkedin_url)

    @property
    def person_hub_uri(self) -> str:
        """Generate Person hub URI (abstract identity)."""
        return generate_person_hub_uri(self.name, self.linkedin_slug)

    @property
    def observation_uri(self) -> str:
        """Generate PersonObservation URI from custodian, name and role."""
        role_slug = generate_role_slug(self.headline)
        return generate_observation_uri(
            self.custodian_slug,
            self.name,
            role_slug,
            self.linkedin_slug
        )

    @property
    def custodian_uri(self) -> str | None:
        """Generate Custodian URI (None when no custodian slug is known)."""
        return generate_custodian_uri(self.custodian_slug)

    def to_dict(self, include_jsonld: bool = True) -> dict[str, Any]:
        """Convert to dictionary for API responses.

        Args:
            include_jsonld: If True, include JSON-LD linked data fields
                (@id, @type, refers_to_person, etc.)
        """
        result = {
            "person_id": self.person_id,
            "name": self.name,
            "scores": {
                "vector": round(self.vector_score, 4),
                "combined": round(self.combined_score, 4),
                "richness": round(self.richness_score, 4),
            },
            "metadata": {
                "headline": self.headline,
                "custodian_name": self.custodian_name,
                "custodian_slug": self.custodian_slug,
                "location": self.location,
                "heritage_relevant": self.heritage_relevant,
                "heritage_type": self.heritage_type,
                "source_type": self.source_type,
                "linkedin_url": self.linkedin_url,
                "has_wcms": self.has_wcms,
                # WCMS fields for review page
                "wcms_user_id": self.wcms_user_id,
                "wcms_abs_id": self.wcms_abs_id,
                "wcms_crm_id": self.wcms_crm_id,
                "wcms_username": self.wcms_username,
                "wcms_username_url": self.wcms_username_url,
                "wcms_status": self.wcms_status,
                "wcms_roles": self.wcms_roles,
                "wcms_registered_since": self.wcms_registered_since,
                "wcms_last_access": self.wcms_last_access,
                # Contact details
                "email": self.email,
                "email_domain": self.email_domain,
            }
        }
        if include_jsonld:
            # Add JSON-LD linked data fields (see PERSON_JSONLD_CONTEXT)
            result["@id"] = self.observation_uri
            result["@type"] = "pico:PersonObservation"
            result["refers_to_person"] = self.person_hub_uri
            # Add custodian affiliation if available
            if self.custodian_uri:
                result["unit_affiliation"] = self.custodian_uri
            # Add schema:sameAs for LinkedIn URL
            if self.linkedin_url:
                result["schema:sameAs"] = self.linkedin_url
            # Add linkedin_profile_path if available
            if self.linkedin_profile_path:
                result["linkedin_profile_path"] = self.linkedin_profile_path
        return result
# Query type detection patterns: case-insensitive substrings that, when
# present in a query, route it to person search (see detect_query_type).
PERSON_QUERY_PATTERNS = [
    # Dutch
    "wie werkt", "wie werk", "werken in", "werken bij", "medewerker", "personeel",
    "staff", "werknemer", "expert", "experts", "specialist", "specialisten",
    "directeur", "curator", "archivaris", "bibliothecaris", "conservator",
    "team", "collega", "collegas", "mensen bij", "werkzaam",
    # English
    "who works", "staff at", "employees", "team at", "people at", "work at",
    "director of", "curator at", "archivist", "librarian", "works at",
    "experts at", "specialists", "professionals at",
    # Generic
    "linkedin", "person", "professional",
]
# ===================================================================
# Dutch Province/Subdivision Code Mapping (ISO 3166-2:NL)
# ===================================================================
# Maps province names (lowercase, various spellings) to ISO 3166-2 codes
# Used for filtering Qdrant queries by region
# Qdrant payload field: "region" (stores short codes like "NH", "ZH")
# Keys MUST be lowercase: get_province_code() lowercases before lookup.
# ===================================================================
DUTCH_PROVINCE_CODES: dict[str, str] = {
    # Noord-Holland
    "noord-holland": "NH",
    "noordholland": "NH",
    "north holland": "NH",
    "north-holland": "NH",
    # Zuid-Holland
    "zuid-holland": "ZH",
    "zuidholland": "ZH",
    "south holland": "ZH",
    "south-holland": "ZH",
    # Utrecht
    "utrecht": "UT",
    # Gelderland
    "gelderland": "GE",
    # Noord-Brabant
    "noord-brabant": "NB",
    "noordbrabant": "NB",
    "brabant": "NB",
    "north brabant": "NB",
    # Limburg
    "limburg": "LI",
    # Overijssel
    "overijssel": "OV",
    # Friesland / Fryslân
    "friesland": "FR",
    "fryslân": "FR",
    "fryslan": "FR",
    # Groningen
    "groningen": "GR",
    # Drenthe
    "drenthe": "DR",
    # Flevoland
    "flevoland": "FL",
    # Zeeland
    "zeeland": "ZE",
}
def get_province_code(province_name: str | None) -> str | None:
"""Convert Dutch province name to ISO 3166-2 subdivision code (without country prefix).
Args:
province_name: Province name in Dutch or English (case-insensitive)
Returns:
Two-letter province code (e.g., "NH", "ZH") or None if not found
Example:
>>> get_province_code("Noord-Holland")
'NH'
>>> get_province_code("south holland")
'ZH'
>>> get_province_code("Bavaria")
None
"""
if not province_name:
return None
return DUTCH_PROVINCE_CODES.get(province_name.lower().strip())
def looks_like_person_name(query: str) -> bool:
    """Detect if query looks like a person's name for name-boosted search.

    A query looks like a person name if it:
    - Contains 2-4 mostly-capitalized words (first/last name pattern)
    - Does NOT contain question words or role/context indicators

    Args:
        query: Search query string

    Returns:
        True if query appears to be a person name

    Examples:
        >>> looks_like_person_name("Kitty Bogte")
        True
        >>> looks_like_person_name("Who works at the Rijksmuseum?")
        False
        >>> looks_like_person_name("archivist at Nationaal Archief")
        False
    """
    # Substrings indicating a question/role search rather than a bare name.
    # NOTE: Dutch name particles (" van ", " de ", " het ") are deliberately
    # NOT listed — the capitalization check below explicitly tolerates one
    # lowercase particle, and listing them would have rejected names like
    # "Jan van Dam" before that allowance could ever apply (dead code bug).
    non_name_indicators = [
        # Question words
        "who", "what", "where", "which", "how", "why",
        "wie", "wat", "waar", "welk", "hoe", "waarom",
        # Role/job indicators
        "works at", "working at", "werkt bij", "werkzaam",
        "archivist", "curator", "director", "librarian",
        "archivaris", "directeur", "bibliothecaris",
        # Prepositions indicating context
        " at ", " in ", " of ", " for ", " the ",
        " bij ", " voor ",
        # Punctuation that indicates non-name queries
        "?", "!",
    ]
    query_lower = query.lower()
    if any(indicator in query_lower for indicator in non_name_indicators):
        return False
    # Names are typically 2-4 words ("First Last", "First van Last", ...).
    words = query.strip().split()
    if not 2 <= len(words) <= 4:
        return False

    def _is_name_word(word: str) -> bool:
        # Capitalized, letters plus punctuation common in names: the previous
        # str.isalpha() check wrongly rejected "Dr.", "Jean-Pierre", "O'Brien".
        return word[0].isupper() and all(c.isalpha() or c in "'-." for c in word)

    capitalized_count = sum(1 for w in words if _is_name_word(w))
    # Most name words should be capitalized; allow one lowercase particle
    # (e.g. "van", "de").
    return capitalized_count >= len(words) - 1
def calculate_name_match_boost(query: str, name: str) -> float:
    """Score multiplier for how well the query matches a person's name.

    Case-insensitive matching, strongest to weakest:
      * 3.0 — exact match
      * 2.5 — one string contains the other
      * 1.0 + ratio — some words shared (ratio of shared words to the
        larger word set, so values fall in (1.0, 2.0])
      * 1.0 — no overlap (no boost)

    Args:
        query: Search query (potential name)
        name: Person's name from search result

    Returns:
        Boost factor (1.0 = no boost, >1.0 = boosted)
    """
    q = query.lower().strip()
    n = name.lower().strip()
    if q == n:
        return 3.0
    if q in n or n in q:
        return 2.5
    # Word-level overlap between query and name.
    q_words = set(q.split())
    n_words = set(n.split())
    shared = q_words & n_words
    if not shared:
        return 1.0
    ratio = len(shared) / max(len(q_words), len(n_words))
    return 1.0 + ratio
def detect_query_type(query: str, dspy_entity_type: str | None = None) -> str:
"""Detect if query is about institutions or persons.
Uses DSPy LLM classification if provided, falls back to keyword heuristics.
Args:
query: Search query string
dspy_entity_type: Optional entity_type from DSPy HeritageQueryRouter
("person", "institution", or "both")
Returns:
"person" or "institution"
"""
# Prefer DSPy semantic classification when available
if dspy_entity_type:
if dspy_entity_type in ("person", "both"):
return "person"
if dspy_entity_type == "institution":
return "institution"
# Fallback to keyword heuristics
query_lower = query.lower()
for pattern in PERSON_QUERY_PATTERNS:
if pattern in query_lower:
return "person"
return "institution"
# ===================================================================
# Schema-Aware Filter Mapping for DSPy Heritage Query Router
# ===================================================================
#
# These mappings are now loaded DYNAMICALLY from the LinkML schema files
# via the ontology_mapping module. This ensures:
# 1. Schema is the single source of truth (no hardcoded values)
# 2. Multilingual support (Dutch, German, French, Spanish, etc.)
# 3. Automatic updates when schema changes
#
# The ontology_mapping module extracts synonyms from YAML comments
# and provides fuzzy matching for natural language queries.
# ===================================================================
def _get_custodian_type_mapping() -> dict[str, str]:
    """Load the custodian-type → heritage-code mapping from the LinkML schema.

    Tries the ontology_mapping module via two import paths, then falls back
    to a minimal hardcoded GLAMORCUBESFIXPHDNT table.

    Returns:
        Dict mapping custodian type (e.g., "MUSEUM") to heritage code (e.g., "M")
    """
    try:
        # Preferred: backend.rag package on the Python path.
        from backend.rag.ontology_mapping import get_custodian_type_mapping
        schema_mapping = get_custodian_type_mapping()
        if schema_mapping:
            return schema_mapping
    except ImportError:
        try:
            # Alternative: ontology_mapping importable directly via sys.path.
            from ontology_mapping import get_custodian_type_mapping  # type: ignore[import-not-found]
            schema_mapping = get_custodian_type_mapping()
            if schema_mapping:
                return schema_mapping
        except ImportError:
            logger.warning("ontology_mapping not available, using fallback mapping")
    except Exception as e:
        logger.warning(f"Failed to load custodian type mapping from schema: {e}")
    # Last resort: minimal GLAMORCUBESFIXPHDNT mapping.
    return {
        "GALLERY": "G", "LIBRARY": "L", "ARCHIVE": "A", "MUSEUM": "M",
        "OFFICIAL_INSTITUTION": "O", "RESEARCH_CENTER": "R", "CORPORATION": "C",
        "UNKNOWN": "U", "BIO_CUSTODIAN": "B", "EDUCATION_PROVIDER": "E",
        "COLLECTING_SOCIETY": "S", "FEATURE": "F", "INTANGIBLE_HERITAGE_GROUP": "I",
        "MIXED": "X", "PERSONAL_COLLECTION": "P", "HOLY_SITE": "H",
        "DIGITAL_PLATFORM": "D", "NGO": "N", "TASTE_SMELL_HERITAGE": "T",
    }
def _get_role_category_keywords() -> dict[str, list[str]]:
    """Get role category keywords from schema.

    Dynamically loads from RoleCategoryEnum in LinkML schema.
    Falls back to hardcoded keywords if schema unavailable.

    Returns:
        Dict mapping role category (e.g., "CURATORIAL") to keywords list
        (mixed Dutch/English; matched case-insensitively against headlines)
    """
    try:
        # Try backend.rag path first (when backend is in Python path)
        from backend.rag.ontology_mapping import get_role_keywords
        keywords = get_role_keywords()
        if keywords:
            return keywords
    except ImportError:
        try:
            # Fallback: try direct import (when ontology_mapping is in sys.path)
            from ontology_mapping import get_role_keywords  # type: ignore[import-not-found]
            keywords = get_role_keywords()
            if keywords:
                return keywords
        except ImportError:
            logger.warning("ontology_mapping not available, using fallback role keywords")
    except Exception as e:
        logger.warning(f"Failed to load role keywords from schema: {e}")
    # Fallback: essential role category keywords (hardcoded)
    return {
        "CURATORIAL": [
            "curator", "curatorial", "collectie", "collection", "tentoonstellingen",
            "exhibitions", "acquisitions", "registrar", "museum professional"
        ],
        "CONSERVATION": [
            "conservator", "conservation", "restaurator", "restoration", "preservatie",
            "preservation", "materiaal", "material", "preventive"
        ],
        "ARCHIVAL": [
            "archivist", "archivaris", "archief", "archive", "records", "documentalist",
            "erfgoed", "heritage records", "acquisitie", "beschrijving"
        ],
        "LIBRARY": [
            "bibliothecaris", "librarian", "bibliotheek", "library", "catalogus",
            "cataloging", "metadata", "special collections", "reference"
        ],
        "DIGITAL": [
            "digital", "digitaal", "developer", "data", "software", "IT", "tech",
            "engineer", "digitalisering", "digitization", "web", "database"
        ],
        "EDUCATION": [
            "educatie", "education", "learning", "museum educator", "outreach",
            "public programs", "docent", "teacher", "rondleiding", "guide"
        ],
        "GOVERNANCE": [
            "bestuur", "board", "governance", "trustee", "raad", "council",
            "advisory", "commissie", "committee"
        ],
        "LEADERSHIP": [
            "director", "directeur", "manager", "head of", "hoofd", "chief",
            "CEO", "president", "leider", "leadership"
        ],
        "RESEARCH": [
            "onderzoek", "research", "researcher", "wetenschapper", "scientist",
            "academic", "scholar", "fellow", "postdoc", "PhD"
        ],
        "TECHNICAL": [
            "technical", "technisch", "facilities", "installation", "AV",
            "audiovisual", "lighting", "security", "beveiliging"
        ],
        "SUPPORT": [
            "support", "admin", "administratie", "office", "HR", "finance",
            "marketing", "communications", "front desk", "visitor services"
        ],
        "CREATIVE": [
            "design", "ontwerp", "creative", "graphic", "exhibition design",
            "multimedia", "artist", "kunstenaar", "visual"
        ],
        "EXTERNAL": [
            "volunteer", "vrijwilliger", "intern", "stagiair", "consultant",
            "advisor", "external", "contractor", "freelance"
        ],
    }
# Lazy-loaded module-level caches (populated on first access via the get_*
# accessors below; None means "not yet loaded from schema").
_CUSTODIAN_TYPE_MAPPING: dict[str, str] | None = None
_ROLE_CATEGORY_KEYWORDS: dict[str, list[str]] | None = None
def get_custodian_type_to_heritage_code() -> dict[str, str]:
    """Return the custodian-type → heritage-code mapping, loading it once."""
    global _CUSTODIAN_TYPE_MAPPING
    if _CUSTODIAN_TYPE_MAPPING is not None:
        return _CUSTODIAN_TYPE_MAPPING
    _CUSTODIAN_TYPE_MAPPING = _get_custodian_type_mapping()
    return _CUSTODIAN_TYPE_MAPPING
def get_role_category_keywords() -> dict[str, list[str]]:
    """Return the role-category → keywords mapping, loading it once."""
    global _ROLE_CATEGORY_KEYWORDS
    if _ROLE_CATEGORY_KEYWORDS is not None:
        return _ROLE_CATEGORY_KEYWORDS
    _ROLE_CATEGORY_KEYWORDS = _get_role_category_keywords()
    return _ROLE_CATEGORY_KEYWORDS
def build_schema_aware_person_filter(
heritage_type_code: str | None = None,
heritage_relevant_only: bool = False,
custodian_slug: str | None = None,
only_wcms: bool = False,
) -> dict[str, Any] | None:
"""Build Qdrant filter conditions for schema-aware person search.
Args:
heritage_type_code: Single-letter heritage type code (M, A, L, etc.)
heritage_relevant_only: Only return heritage-relevant staff
custodian_slug: Filter by specific custodian
only_wcms: Only return WCMS-registered profiles (heritage sector users)
Returns:
Dict of filter conditions for Qdrant, or None if no filters
"""
filters: dict[str, Any] = {}
if heritage_type_code and heritage_type_code not in ("U", "UNKNOWN", "UNSPECIFIED"):
filters["heritage_type"] = heritage_type_code
if heritage_relevant_only:
filters["heritage_relevant"] = True
if custodian_slug:
filters["custodian_slug"] = custodian_slug
if only_wcms:
filters["has_wcms"] = True
return filters if filters else None
def filter_by_role_category_keywords(
results: list["RetrievedPerson"],
role_category: str | None,
) -> list["RetrievedPerson"]:
"""Post-filter search results by role category using headline keywords.
Since role_category is not indexed in Qdrant, we use headline keyword matching
to filter results after vector search.
Args:
results: List of RetrievedPerson from vector search
role_category: Target role category (CURATORIAL, ARCHIVAL, etc.)
Returns:
Filtered list of RetrievedPerson matching the role category
"""
if not role_category or role_category in ("UNKNOWN", "UNSPECIFIED"):
return results
keywords = get_role_category_keywords().get(role_category, [])
if not keywords:
return results
filtered = []
for person in results:
headline = (person.headline or "").lower()
# Check if any keyword matches the headline
if any(kw.lower() in headline for kw in keywords):
filtered.append(person)
# If filtering removed all results, return original (don't be too strict)
if not filtered:
logger.info(f"Role category filter '{role_category}' removed all results, returning unfiltered")
return results
logger.info(f"Role category filter '{role_category}' reduced results from {len(results)} to {len(filtered)}")
return filtered
def get_heritage_type_code(custodian_type: str | None) -> str | None:
"""Convert CustodianPrimaryTypeEnum value to single-letter heritage code.
Args:
custodian_type: Custodian type from DSPy router (e.g., "MUSEUM", "ARCHIVE")
Returns:
Single-letter heritage code (e.g., "M", "A") or None if not mappable
"""
if not custodian_type or custodian_type in ("UNKNOWN", "UNSPECIFIED"):
return None
return get_custodian_type_to_heritage_code().get(custodian_type)
class SPARQLClient:
    """Thin HTTP client for querying an Oxigraph SPARQL endpoint."""

    def __init__(
        self,
        endpoint: str = DEFAULT_SPARQL_ENDPOINT,
        timeout: float = DEFAULT_SPARQL_TIMEOUT,
        max_connections: int = 20  # Allow concurrent connections for parallel queries
    ):
        self.endpoint = endpoint
        self.timeout = timeout
        self.max_connections = max_connections
        self._client: httpx.Client | None = None

    @property
    def client(self) -> httpx.Client:
        """HTTP client, created on first use with a pooled connection limit."""
        if self._client is not None:
            return self._client
        # Pool sized for parallel SPARQL queries; keep-alives get reused.
        pool_limits = httpx.Limits(
            max_keepalive_connections=self.max_connections,
            max_connections=self.max_connections,
            keepalive_expiry=30.0  # Keep connections alive for reuse
        )
        self._client = httpx.Client(
            timeout=self.timeout,
            limits=pool_limits,
            http2=False  # HTTP/1.1 is often faster for small queries
        )
        return self._client

    def query(self, sparql: str, log_timing: bool = False) -> list[dict[str, Any]]:
        """Execute a SPARQL query (prefixes prepended) and return flat rows.

        Args:
            sparql: SPARQL query string (without prefix declarations)
            log_timing: Whether to log query execution time

        Returns:
            One dict per result binding, mapping variable name to its string
            value. Returns [] on any HTTP or parse failure (errors are logged).
        """
        full_query = SPARQL_PREFIXES + sparql
        start_time = time.time() if log_timing else 0
        try:
            response = self.client.post(
                self.endpoint,
                data={"query": full_query},
                headers={"Accept": "application/sparql-results+json"}
            )
            response.raise_for_status()
            payload = response.json()
            bindings = payload.get("results", {}).get("bindings", [])
            # Flatten {"var": {"type": ..., "value": ...}} to {"var": value}.
            results = [
                {key: cell.get("value", "") for key, cell in binding.items()}
                for binding in bindings
            ]
            if log_timing:
                duration_ms = (time.time() - start_time) * 1000
                logger.debug(f"SPARQL query completed: {len(results)} results in {duration_ms:.0f}ms")
            return results
        except httpx.HTTPError as e:
            logger.error(f"SPARQL query failed: {e}")
            return []
        except Exception as e:
            logger.error(f"Unexpected error in SPARQL query: {e}")
            return []

    def close(self) -> None:
        """Dispose of the pooled HTTP client, if one was created."""
        if self._client:
            self._client.close()
            self._client = None
class HybridRetriever:
    """Hybrid retriever combining vector search with knowledge graph expansion.

    The retrieval process:
    1. Vector search finds semantically similar institutions
    2. For each result, SPARQL expands to find related institutions:
       - Institutions in the same city
       - Institutions of the same type
       - Institutions with related collections
    3. Results are re-ranked based on combined vector + graph scores

    Embedding Models:
    - If OpenAI API key is available AND collection uses 1536-dim vectors: use OpenAI
    - Otherwise: use sentence-transformers (all-MiniLM-L6-v2, 384-dim)

    Multi-Embedding Support:
    Set use_multi_embedding=True to enable support for multiple embedding models
    via Qdrant's named vectors feature. This allows:
    - A/B testing different embedding models
    - Seamless migration between models
    - Specifying which model to use per query

    Args:
        qdrant_host: Qdrant server hostname
        qdrant_port: Qdrant REST API port
        sparql_endpoint: Oxigraph SPARQL endpoint URL
        vector_weight: Weight for vector similarity scores (0-1)
        graph_weight: Weight for graph expansion scores (0-1)
        collection_name: Qdrant collection name
        embedding_model: Embedding model name (auto-detected if not specified)
        k_vector: Number of initial vector search results
        k_expand: Number of graph expansion results per seed
        k_final: Final number of results to return
        use_multi_embedding: Enable multi-embedding mode with named vectors
        preferred_embedding_model: Preferred model for multi-embedding mode
    """

    # Class-level type annotations for instance attributes (set in __init__)
    qdrant_host: str
    qdrant_port: int
    sparql_endpoint: str
    vector_weight: float
    graph_weight: float
    collection_name: str
    k_vector: int
    k_expand: int
    k_final: int
    openai_api_key: str | None
    use_production_qdrant: bool
    use_multi_embedding: bool
    preferred_embedding_model: str | None
    sparql_client: "SPARQLClient"
    embedding_model: str
    # Private attributes with lazy initialization (see the properties below)
    _qdrant_client: "QdrantClient | None"
    _openai_client: "OpenAI | None"
    _st_model: "SentenceTransformer | None"
    _use_sentence_transformers: bool
    _collection_vector_size: int | None
    _multi_retriever: "MultiEmbeddingRetriever | None"
    _selected_multi_model: "EmbeddingModel | None"
    def __init__(
        self,
        qdrant_host: str = "localhost",
        qdrant_port: int = 6333,
        sparql_endpoint: str = DEFAULT_SPARQL_ENDPOINT,
        vector_weight: float = 0.7,
        graph_weight: float = 0.3,
        collection_name: str = "heritage_custodians",
        embedding_model: str | None = None,  # Auto-detect if None
        k_vector: int = 10,
        k_expand: int = 5,
        k_final: int = 10,
        openai_api_key: str | None = None,
        use_production_qdrant: bool = False,
        use_multi_embedding: bool = False,
        preferred_embedding_model: str | None = None,
    ):
        """Store configuration and set up lazy-loaded clients.

        Only the SPARQL client is constructed eagerly; Qdrant, OpenAI and
        sentence-transformers are created on first use via properties.
        See the class docstring for parameter documentation.
        """
        self.qdrant_host = qdrant_host
        self.qdrant_port = qdrant_port
        self.sparql_endpoint = sparql_endpoint
        self.vector_weight = vector_weight
        self.graph_weight = graph_weight
        self.collection_name = collection_name
        self.k_vector = k_vector
        self.k_expand = k_expand
        self.k_final = k_final
        # Explicit key argument wins over the OPENAI_API_KEY environment variable
        self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        self.use_production_qdrant = use_production_qdrant
        self.use_multi_embedding = use_multi_embedding
        self.preferred_embedding_model = preferred_embedding_model
        # Initialize SPARQL client
        self.sparql_client = SPARQLClient(endpoint=sparql_endpoint)
        # Lazy-load Qdrant, OpenAI, and sentence-transformers clients
        self._qdrant_client = None
        self._openai_client = None
        self._st_model = None
        self._use_sentence_transformers = False
        self._collection_vector_size: int | None = None
        # Multi-embedding retriever (lazy-loaded)
        self._multi_retriever = None
        # Currently selected multi-embedding model (for multi-embedding mode)
        self._selected_multi_model = None
        # Determine embedding model to use (may query Qdrant for vector size)
        self.embedding_model = embedding_model or self._auto_detect_embedding_model()
        logger.info(
            f"Initialized HybridRetriever: "
            f"Qdrant={qdrant_host}:{qdrant_port}, "
            f"SPARQL={sparql_endpoint}, "
            f"embedding_model={self.embedding_model}, "
            f"multi_embedding={use_multi_embedding}, "
            f"weights=vector:{vector_weight}/graph:{graph_weight}"
        )
@property
def qdrant_client(self) -> "QdrantClient":
"""Lazy-load Qdrant client."""
if self._qdrant_client is None:
from qdrant_client import QdrantClient
if self.use_production_qdrant:
# Connect via HTTPS to production
self._qdrant_client = QdrantClient(
host="bronhouder.nl",
port=443,
https=True,
prefix="qdrant",
prefer_grpc=False,
timeout=30
)
else:
self._qdrant_client = QdrantClient(
host=self.qdrant_host,
port=self.qdrant_port
)
return self._qdrant_client
@property
def openai_client(self) -> "OpenAI":
"""Lazy-load OpenAI client."""
if self._openai_client is None:
if not self.openai_api_key:
raise RuntimeError(
"OpenAI API key not available. Set OPENAI_API_KEY or use sentence-transformers."
)
import openai
self._openai_client = openai.OpenAI(api_key=self.openai_api_key)
return self._openai_client
def _get_collection_vector_size(self) -> int | None:
"""Get the vector size of the Qdrant collection."""
try:
info = self.qdrant_client.get_collection(self.collection_name)
if hasattr(info.config.params, 'vectors'):
vectors_config = info.config.params.vectors
if isinstance(vectors_config, dict):
# Named vectors
first_config = next(iter(vectors_config.values()), None)
return first_config.size if first_config else None
elif vectors_config is not None:
# Single vector config
return vectors_config.size
return None
except Exception as e:
logger.warning(f"Could not get collection vector size: {e}")
return None
def _auto_detect_embedding_model(self) -> str:
"""Auto-detect which embedding model to use based on collection and available APIs.
Detection priority:
1. Check main collection (heritage_custodians) vector size
2. If main collection doesn't exist, check heritage_persons collection
3. If OpenAI key available and collection uses 1536-dim, use OpenAI
4. Otherwise use sentence-transformers (384-dim, all-MiniLM-L6-v2)
"""
# Check main collection vector size first
vector_size = self._get_collection_vector_size()
self._collection_vector_size = vector_size
# If main collection doesn't exist, try heritage_persons collection
if vector_size is None:
logger.info(f"Collection '{self.collection_name}' not found, checking heritage_persons")
person_vector_size = self._get_person_collection_vector_size()
if person_vector_size:
vector_size = person_vector_size
logger.info(f"Using heritage_persons collection vector size: {vector_size}")
if vector_size == 384:
# Collection uses sentence-transformers dimensions
self._use_sentence_transformers = True
logger.info("Auto-detected 384-dim vectors, using sentence-transformers")
return "all-MiniLM-L6-v2"
elif vector_size == 1536 and self.openai_api_key:
# Collection uses OpenAI dimensions and we have API key
self._use_sentence_transformers = False
logger.info("Auto-detected 1536-dim vectors with OpenAI key, using OpenAI")
return "text-embedding-3-small"
elif self.openai_api_key:
# Default to OpenAI if we have key
self._use_sentence_transformers = False
return "text-embedding-3-small"
else:
# Fallback to sentence-transformers
self._use_sentence_transformers = True
logger.info("No OpenAI key, falling back to sentence-transformers")
return "all-MiniLM-L6-v2"
def _load_sentence_transformer(self) -> "SentenceTransformer":
"""Lazy-load sentence-transformers model."""
if self._st_model is None:
try:
from sentence_transformers import SentenceTransformer
self._st_model = SentenceTransformer(self.embedding_model)
logger.info(f"Loaded sentence-transformers model: {self.embedding_model}")
except ImportError:
raise RuntimeError("sentence-transformers not installed. Run: pip install sentence-transformers")
return self._st_model
@property
def multi_retriever(self) -> "MultiEmbeddingRetriever | None":
"""Lazy-load MultiEmbeddingRetriever when multi-embedding mode is enabled.
Returns:
MultiEmbeddingRetriever instance or None if not in multi-embedding mode
"""
if not self.use_multi_embedding:
return None
if self._multi_retriever is None:
from glam_extractor.api.multi_embedding_retriever import (
MultiEmbeddingRetriever,
MultiEmbeddingConfig,
EmbeddingModel,
)
# Create config matching current settings
config = MultiEmbeddingConfig(
qdrant_host=self.qdrant_host,
qdrant_port=self.qdrant_port,
qdrant_https=self.use_production_qdrant,
qdrant_prefix="qdrant" if self.use_production_qdrant else None,
openai_api_key=self.openai_api_key,
institutions_collection=self.collection_name,
)
self._multi_retriever = MultiEmbeddingRetriever(config)
# Auto-select model if not specified
if self.preferred_embedding_model:
try:
self._selected_multi_model = EmbeddingModel(self.preferred_embedding_model)
except ValueError:
logger.warning(f"Unknown embedding model: {self.preferred_embedding_model}")
assert self._multi_retriever is not None # Set above
self._selected_multi_model = self._multi_retriever.select_model(self.collection_name)
else:
assert self._multi_retriever is not None # Set above
self._selected_multi_model = self._multi_retriever.select_model(self.collection_name)
logger.info(f"MultiEmbeddingRetriever initialized, selected model: {self._selected_multi_model}")
return self._multi_retriever
def _get_embedding(self, text: str, using: str | None = None) -> list[float]:
"""Get embedding vector for text using the appropriate model.
Args:
text: Text to embed
using: Optional embedding model name (for multi-embedding mode)
Returns:
Embedding vector as list of floats
"""
# If multi-embedding mode, delegate to MultiEmbeddingRetriever
if self.use_multi_embedding and self.multi_retriever:
from glam_extractor.api.multi_embedding_retriever import EmbeddingModel
# Determine which model to use
if using:
try:
model = EmbeddingModel(using)
except ValueError:
logger.warning(f"Unknown model '{using}', using default")
model = self._selected_multi_model
else:
model = self._selected_multi_model
if model:
return self.multi_retriever.get_embedding(text, model)
else:
# Fallback to legacy mode
logger.warning("No multi-embedding model available, falling back to legacy")
# Legacy single-model embedding
if self._use_sentence_transformers:
model = self._load_sentence_transformer()
embedding = model.encode(text)
return embedding.tolist()
else:
response = self.openai_client.embeddings.create(
input=text,
model=self.embedding_model
)
return response.data[0].embedding
    def _vector_search(
        self,
        query: str,
        k: int,
        using: str | None = None,
        region_codes: list[str] | None = None,
        cities: list[str] | None = None,
        institution_types: list[str] | None = None,
        use_polygon_filter: bool = True,
    ) -> list[RetrievedInstitution]:
        """Perform vector similarity search in Qdrant.

        Pipeline: embed the query, build optional metadata filters
        (region/city/type), run `query_points`, convert payloads to
        RetrievedInstitution, then optionally tighten the result set with
        polygon-based geographic filtering.

        Args:
            query: Search query text
            k: Number of results to retrieve
            using: Optional embedding model name (for multi-embedding mode)
            region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
            cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"])
            institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"])
            use_polygon_filter: If True, apply polygon-based geographic filtering
                using actual province boundaries (default: True)

        Returns:
            List of RetrievedInstitution with vector scores
        """
        query_vector = self._get_embedding(query, using=using)
        # When polygon filtering is enabled and regions are specified,
        # over-fetch to ensure we have enough results after polygon filtering
        # discards points outside the requested provinces.
        effective_limit = k
        if use_polygon_filter and region_codes:
            effective_limit = k * 3  # Over-fetch 3x for polygon filtering
            logger.debug(f"Over-fetching {effective_limit} results for polygon filtering")
        # Build query parameters
        search_params = {
            "collection_name": self.collection_name,
            "query": query_vector,
            "limit": effective_limit,
            "with_payload": True,
        }
        # Build geographic/type filter if any criteria provided
        # NOTE: Always apply region metadata filter to Qdrant first to get relevant results.
        # The polygon filter (if enabled) is an additional precision filter applied afterward.
        # Previously we disabled metadata region filter when polygon filter was enabled,
        # but this caused vector search to return results from wrong regions.
        if region_codes or cities or institution_types:
            from glam_extractor.ontology.qdrant_filters import QdrantFilterBuilder
            # Convert institution types from full names (LIBRARY, MUSEUM) to single-letter codes (L, M)
            # because Qdrant stores institution_type as single-letter codes per GLAMORCUBESFIXPHDNT
            type_codes = None
            if institution_types:
                type_mapping = get_custodian_type_to_heritage_code()
                type_codes = [type_mapping.get(t, t) for t in institution_types]
                # Filter out any that didn't map (keep original if 1 char already)
                type_codes = [c for c in type_codes if c and len(c) == 1]
                logger.debug(f"Converted institution types: {institution_types} -> {type_codes}")
            builder = QdrantFilterBuilder()
            filter_dict = builder.combined_filter(
                primary_types=type_codes,  # Use single-letter codes
                region_codes=region_codes,  # Always apply region filter to get relevant results
                cities=cities,
                combine_mode="must",
            )
            if filter_dict:
                query_filter = QdrantFilterBuilder.to_qdrant_models(filter_dict)
                search_params["query_filter"] = query_filter
                logger.info(
                    f"Applied Qdrant filter: types={type_codes}, "
                    f"regions={region_codes}, cities={cities}"
                )
        # Add named vector 'using' ONLY if collection actually has named vectors
        # Single-vector collections will error with "Not existing vector name" otherwise
        if self.use_multi_embedding and self.multi_retriever:
            uses_named = self.multi_retriever.uses_named_vectors(self.collection_name)
            if uses_named:
                if using:
                    search_params["using"] = using
                elif self._selected_multi_model:
                    search_params["using"] = self._selected_multi_model.value
            # else: single-vector collection, don't add 'using' parameter
        results = self.qdrant_client.query_points(**search_params)
        institutions = []
        for point in results.points:
            payload = point.payload or {}
            # Payload fields are optional; missing values become "" / None.
            inst = RetrievedInstitution(
                ghcid=payload.get("ghcid", ""),
                name=payload.get("name", ""),
                # Fall back to a canonical URI derived from the GHCID.
                uri=payload.get("uri", f"https://nde.nl/ontology/hc/custodian/{payload.get('ghcid', '')}"),
                vector_score=point.score,
                institution_type=payload.get("institution_type"),
                country=payload.get("country"),
                city=payload.get("city"),
                # Truncate stored text to a short description snippet.
                description=payload.get("text", "")[:200] if payload.get("text") else None,
                latitude=payload.get("latitude"),
                longitude=payload.get("longitude"),
            )
            institutions.append(inst)
        # Apply polygon-based geographic filtering if enabled and regions specified
        if use_polygon_filter and region_codes and institutions:
            institutions = self._apply_polygon_filter(institutions, region_codes, k)
        return institutions
    def _apply_polygon_filter(
        self,
        institutions: list[RetrievedInstitution],
        region_codes: list[str],
        k: int,
    ) -> list[RetrievedInstitution]:
        """Filter institutions by polygon containment in specified regions.

        Uses actual province boundary polygons to ensure results are
        geographically within the requested regions, not just metadata matching.
        Institutions without coordinates are handled via a GHCID-based
        metadata fallback (Dutch institutions only); if neither check
        applies they are dropped.

        Args:
            institutions: List of retrieved institutions with lat/lon
            region_codes: List of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
            k: Maximum number of results to return

        Returns:
            Filtered list of institutions within the specified regions
        """
        polygon_filter = get_polygon_filter()
        # Handle case where polygon filter module is not available or not loaded:
        # degrade gracefully to the (already region-filtered) input, truncated to k.
        if polygon_filter is None:
            logger.warning("Polygon filter not available, skipping geographic filtering")
            return institutions[:k]
        if not polygon_filter.is_loaded:
            logger.warning("Polygon filter not loaded, skipping geographic filtering")
            return institutions[:k]
        filtered = []
        for inst in institutions:
            if inst.latitude is None or inst.longitude is None:
                # No coordinates, check if metadata region matches
                if inst.country == "NL":
                    # For Dutch institutions without coords, fallback to metadata
                    # Extract region from GHCID (format: NL-{REGION}-...)
                    if inst.ghcid and len(inst.ghcid) > 3:
                        ghcid_region = inst.ghcid.split("-")[1] if "-" in inst.ghcid else None
                        if ghcid_region and ghcid_region.upper() in [r.upper() for r in region_codes]:
                            filtered.append(inst)
                # Coordinates missing: polygon test is impossible, move on.
                continue
            # Check if point is within any of the requested regions
            for region_code in region_codes:
                if polygon_filter.point_in_province(inst.latitude, inst.longitude, region_code):
                    filtered.append(inst)
                    break  # Don't add same institution multiple times
        logger.info(
            f"Polygon filter: {len(filtered)}/{len(institutions)} institutions "
            f"in regions {region_codes}"
        )
        # Return up to k results
        return filtered[:k]
    def _build_batched_expansion_query(
        self,
        seed_institutions: list[RetrievedInstitution],
        exclude_ghcids: set[str],
        limit_per_expansion: int = 5
    ) -> tuple[str, dict[str, dict]]:
        """Build a single SPARQL query with UNION clauses for all expansions.

        DEDUPLICATES by city code and type+country to avoid redundant query patterns.
        For example, if 5 seeds are all from Amsterdam with type MUSEUM, we only
        create ONE city expansion (for AMS) and ONE type expansion (for NL + M),
        not 10 redundant UNIONs.

        Each UNION branch tags its rows with a synthetic ?expansion_key so the
        caller can map rows back to the seed that produced the branch.

        Args:
            seed_institutions: Seed institutions to expand from (only the top
                5 are used)
            exclude_ghcids: GHCIDs to exclude from results (inflates the
                per-branch LIMIT so exclusions don't starve the result count)
            limit_per_expansion: Max results per expansion type

        Returns:
            Tuple of (SPARQL query string, expansion_metadata dict)
            expansion_metadata maps expansion_key -> {seed, type, city/type_code}
            Returns ("", {}) when no seed yields an expansion pattern.
        """
        unions = []
        expansion_metadata = {}
        # Track unique patterns to avoid duplicate queries
        seen_city_codes: set[str] = set()
        seen_type_patterns: set[str] = set()  # "country-type_code" pattern
        seeds_to_expand = seed_institutions[:5]
        city_idx = 0
        type_idx = 0
        for seed in seeds_to_expand:
            # City expansion - deduplicate by city code
            if seed.city:
                # GHCIDs embed a 3-letter city code (format: CC-RR-CCC-T-ABBR).
                city_code = seed.city[:3].upper()
                if city_code not in seen_city_codes:
                    seen_city_codes.add(city_code)
                    expansion_key = f"city_{city_idx}"
                    city_idx += 1
                    unions.append(f"""
                    {{
                        SELECT ?s ?name ?ghcid ?type ("{expansion_key}" AS ?expansion_key) WHERE {{
                            ?s a hcc:Custodian ;
                               skos:prefLabel ?name ;
                               hc:ghcid ?ghcid .
                            FILTER(CONTAINS(?ghcid, "-{city_code}-"))
                            OPTIONAL {{ ?s hc:institutionType ?type }}
                        }}
                        LIMIT {limit_per_expansion + len(exclude_ghcids)}
                    }}
                    """)
                    expansion_metadata[expansion_key] = {
                        "seed": seed,
                        "type": "city",
                        "city": seed.city,
                        "city_code": city_code
                    }
            # Type expansion - deduplicate by country + type_code pattern
            if seed.institution_type and seed.country:
                type_code = get_custodian_type_to_heritage_code().get(seed.institution_type, "")
                if type_code:
                    pattern_key = f"{seed.country}-{type_code}"
                    if pattern_key not in seen_type_patterns:
                        seen_type_patterns.add(pattern_key)
                        expansion_key = f"type_{type_idx}"
                        type_idx += 1
                        unions.append(f"""
                        {{
                            SELECT ?s ?name ?ghcid ?city ("{expansion_key}" AS ?expansion_key) WHERE {{
                                ?s a hcc:Custodian ;
                                   skos:prefLabel ?name ;
                                   hc:ghcid ?ghcid .
                                FILTER(STRSTARTS(?ghcid, "{seed.country}-"))
                                FILTER(CONTAINS(?ghcid, "-{type_code}-"))
                                OPTIONAL {{ ?s schema:location ?city }}
                            }}
                            LIMIT {limit_per_expansion + len(exclude_ghcids)}
                        }}
                        """)
                        expansion_metadata[expansion_key] = {
                            "seed": seed,
                            "type": "type",
                            "institution_type": seed.institution_type,
                            "type_code": type_code,
                            "country": seed.country
                        }
        if not unions:
            return "", {}
        # Log deduplication stats
        logger.info(f"Batched SPARQL: {len(unions)} UNIONs (deduplicated from max {len(seeds_to_expand) * 2}). "
                    f"Unique cities: {seen_city_codes}, Unique types: {seen_type_patterns}")
        # Combine all unions into a single query
        query = f"""
        SELECT ?s ?name ?ghcid ?type ?city ?expansion_key WHERE {{
            {" UNION ".join(unions)}
        }}
        """
        return query, expansion_metadata
    def _graph_expand_batched(
        self,
        seed_institutions: list[RetrievedInstitution]
    ) -> list[RetrievedInstitution]:
        """Expand seed results using a SINGLE batched SPARQL query.

        This is a significant optimization over the parallel ThreadPoolExecutor
        approach. Instead of 10 HTTP requests (even in parallel), we execute
        ONE SPARQL query with UNION clauses. Raw rows are routed back to their
        originating expansion via the ?expansion_key column, deduplicated
        against the seeds and each other, and capped at k_expand per expansion.

        Performance comparison:
        - Sequential: 10 queries × ~100ms = 4+ seconds
        - Parallel (ThreadPool): ~500ms-1s (limited by GIL/connection pool)
        - Batched (this method): ONE query ~150-300ms

        Args:
            seed_institutions: Initial vector search results

        Returns:
            Additional institutions found via graph expansion (empty when no
            expansion pattern could be built)
        """
        start_time = time.time()
        exclude_ghcids = {inst.ghcid for inst in seed_institutions}
        expanded = []
        # Seeds are excluded from expansion output; seen_ghcids also prevents
        # the same institution appearing under two expansion branches.
        seen_ghcids = set(exclude_ghcids)
        # Build batched query
        query, expansion_metadata = self._build_batched_expansion_query(
            seed_institutions, exclude_ghcids, limit_per_expansion=self.k_expand
        )
        if not query:
            logger.debug("No graph expansion tasks to execute")
            return expanded
        # Execute single batched query
        query_start = time.time()
        results = self.sparql_client.query(query)
        query_duration = (time.time() - query_start) * 1000
        logger.debug(f"Batched SPARQL query: {len(results)} raw results in {query_duration:.0f}ms")
        # Group results by expansion_key
        results_by_expansion: dict[str, list[dict]] = {}
        for row in results:
            exp_key = row.get("expansion_key", "")
            if exp_key:
                if exp_key not in results_by_expansion:
                    results_by_expansion[exp_key] = []
                results_by_expansion[exp_key].append(row)
        # Process results, filtering and creating RetrievedInstitution objects
        for exp_key, rows in results_by_expansion.items():
            if exp_key not in expansion_metadata:
                # Unknown key: query returned a branch we didn't build; skip.
                continue
            meta = expansion_metadata[exp_key]
            seed = meta["seed"]
            exp_type = meta["type"]
            count = 0
            for row in rows:
                ghcid = row.get("ghcid", "")
                if not ghcid or ghcid in seen_ghcids:
                    continue
                if count >= self.k_expand:
                    break
                seen_ghcids.add(ghcid)
                count += 1
                if exp_type == "city":
                    expanded.append(RetrievedInstitution(
                        ghcid=ghcid,
                        name=row.get("name", ""),
                        uri=row.get("s", ""),
                        graph_score=0.8,  # High score for same city
                        institution_type=row.get("type"),
                        expansion_reason="same_city",
                        related_institutions=[seed.ghcid]
                    ))
                elif exp_type == "type":
                    expanded.append(RetrievedInstitution(
                        ghcid=ghcid,
                        name=row.get("name", ""),
                        uri=row.get("s", ""),
                        graph_score=0.5,  # Medium score for same type
                        institution_type=seed.institution_type,
                        city=row.get("city"),
                        expansion_reason="same_type",
                        related_institutions=[seed.ghcid]
                    ))
            logger.debug(f"Expansion {exp_key}: {count} results for {seed.ghcid}")
        total_time = (time.time() - start_time) * 1000
        logger.info(f"Graph expansion (batched): 1 query, {len(results)} raw results, "
                    f"{len(expanded)} expanded in {total_time:.0f}ms")
        return expanded
def _expand_by_city(self, city: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]:
"""Find other institutions in the same city via SPARQL.
Note: This method is kept for backwards compatibility and direct calls.
For batch operations, use _graph_expand_batched() instead.
Args:
city: City name to search for
exclude_ghcids: GHCIDs to exclude from results
limit: Maximum number of results
Returns:
List of institution data dicts
"""
if not city:
return []
query = f"""
SELECT ?s ?name ?ghcid ?type WHERE {{
?s a hcc:Custodian ;
skos:prefLabel ?name ;
hc:ghcid ?ghcid .
# Match city in GHCID (format: CC-RR-CCC-T-ABBR)
FILTER(CONTAINS(?ghcid, "-{city[:3].upper()}-"))
OPTIONAL {{ ?s hc:institutionType ?type }}
}}
LIMIT {limit + len(exclude_ghcids)}
"""
results = self.sparql_client.query(query)
# Filter out excluded GHCIDs
filtered = []
for row in results:
ghcid = row.get("ghcid", "")
if ghcid not in exclude_ghcids:
filtered.append(row)
if len(filtered) >= limit:
break
return filtered
def _expand_by_type(self, institution_type: str, country: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]:
"""Find other institutions of the same type in the same country.
Args:
institution_type: Institution type (MUSEUM, LIBRARY, etc.)
country: Country code (ISO 3166-1 alpha-2)
exclude_ghcids: GHCIDs to exclude
limit: Maximum number of results
Returns:
List of institution data dicts
"""
if not institution_type:
return []
# Map institution type to GHCID type code using dynamic schema mapping
type_code = get_custodian_type_to_heritage_code().get(institution_type, "")
if not type_code or not country:
return []
query = f"""
SELECT ?s ?name ?ghcid ?city WHERE {{
?s a hcc:Custodian ;
skos:prefLabel ?name ;
hc:ghcid ?ghcid .
# Match country and type in GHCID
FILTER(STRSTARTS(?ghcid, "{country}-"))
FILTER(CONTAINS(?ghcid, "-{type_code}-"))
OPTIONAL {{ ?s schema:location ?city }}
}}
LIMIT {limit + len(exclude_ghcids)}
"""
results = self.sparql_client.query(query)
filtered = []
for row in results:
ghcid = row.get("ghcid", "")
if ghcid not in exclude_ghcids:
filtered.append(row)
if len(filtered) >= limit:
break
return filtered
def _expand_by_wikidata_country(self, wikidata_country: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]:
"""Find institutions in the same country using Wikidata P17 property.
Args:
wikidata_country: Wikidata entity ID for country (e.g., Q55 for Netherlands)
exclude_ghcids: GHCIDs to exclude
limit: Maximum number of results
Returns:
List of institution data dicts
"""
if not wikidata_country:
return []
query = f"""
SELECT ?s ?name ?ghcid ?type WHERE {{
?s a hcc:Custodian ;
skos:prefLabel ?name ;
hc:ghcid ?ghcid ;
wdt:P17 wd:{wikidata_country} .
OPTIONAL {{ ?s hc:institutionType ?type }}
}}
LIMIT {limit + len(exclude_ghcids)}
"""
results = self.sparql_client.query(query)
filtered = []
for row in results:
ghcid = row.get("ghcid", "")
if ghcid not in exclude_ghcids:
filtered.append(row)
if len(filtered) >= limit:
break
return filtered
def _graph_expand(
self,
seed_institutions: list[RetrievedInstitution],
use_batched: bool = True
) -> list[RetrievedInstitution]:
"""Expand seed results using knowledge graph relationships.
By default uses batched SPARQL (single query with UNION) for best performance.
Falls back to parallel ThreadPoolExecutor if batched fails.
Performance comparison:
- Sequential: 10 queries × ~100ms = 4+ seconds
- Parallel (ThreadPool): ~500ms-3s (limited by GIL/connection pool)
- Batched (UNION query): ONE query ~150-300ms ← DEFAULT
Args:
seed_institutions: Initial vector search results
use_batched: If True (default), use batched SPARQL query.
If False, use parallel ThreadPoolExecutor.
Returns:
Additional institutions found via graph expansion
"""
if use_batched:
try:
return self._graph_expand_batched(seed_institutions)
except Exception as e:
logger.warning(f"Batched graph expansion failed, falling back to parallel: {e}")
# Fall through to parallel implementation
return self._graph_expand_parallel(seed_institutions)
    def _graph_expand_parallel(
        self,
        seed_institutions: list[RetrievedInstitution]
    ) -> list[RetrievedInstitution]:
        """Expand seed results using parallel SPARQL queries (fallback method).

        Uses ThreadPoolExecutor to parallelize SPARQL queries. This is slower than
        the batched approach but serves as a fallback. Expansion tasks are built
        for the top 5 seeds (city expansion when a seed has a city, type
        expansion when it has both type and country); results are deduplicated
        against the seeds and across tasks as futures complete.

        Args:
            seed_institutions: Initial vector search results

        Returns:
            Additional institutions found via graph expansion
        """
        start_time = time.time()
        exclude_ghcids = {inst.ghcid for inst in seed_institutions}
        expanded = []
        # seen_ghcids starts as the seed set so expansion never re-returns a
        # seed; it grows as results arrive to dedupe across tasks.
        seen_ghcids = set(exclude_ghcids)
        # Prepare all expansion tasks
        # Each task is a tuple: (task_type, seed, query_params)
        tasks = []
        seeds_to_expand = seed_institutions[:5]  # Expand top 5 seeds
        for seed in seeds_to_expand:
            # City expansion task
            if seed.city:
                tasks.append(("city", seed, {"city": seed.city}))
            # Type expansion task
            if seed.institution_type and seed.country:
                tasks.append(("type", seed, {
                    "institution_type": seed.institution_type,
                    "country": seed.country
                }))
        if not tasks:
            logger.debug("No graph expansion tasks to execute")
            return expanded
        # Execute SPARQL queries in parallel
        # Use min(10, len(tasks)) workers to avoid over-parallelization
        max_workers = min(10, len(tasks))

        def execute_expansion(task):
            """Execute a single expansion task and return results with metadata.

            Never raises: failures are converted into an empty-result dict
            carrying the error string, so one bad query cannot sink the batch.
            """
            task_type, seed, params = task
            task_start = time.time()
            try:
                if task_type == "city":
                    results = self._expand_by_city(
                        params["city"], exclude_ghcids, limit=self.k_expand
                    )
                    return {
                        "task_type": task_type,
                        "seed": seed,
                        "results": results,
                        "duration_ms": (time.time() - task_start) * 1000
                    }
                elif task_type == "type":
                    results = self._expand_by_type(
                        params["institution_type"],
                        params["country"],
                        exclude_ghcids,
                        limit=self.k_expand
                    )
                    return {
                        "task_type": task_type,
                        "seed": seed,
                        "results": results,
                        "duration_ms": (time.time() - task_start) * 1000
                    }
            except Exception as e:
                logger.warning(f"Graph expansion task failed: {task_type} for {seed.ghcid}: {e}")
                return {
                    "task_type": task_type,
                    "seed": seed,
                    "results": [],
                    "duration_ms": (time.time() - task_start) * 1000,
                    "error": str(e)
                }

        # Run all tasks in parallel; results are merged in completion order,
        # so dedup (seen_ghcids) decides ties between overlapping tasks.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(execute_expansion, task): task for task in tasks}
            for future in as_completed(futures):
                result = future.result()
                if result is None:
                    continue
                task_type = result["task_type"]
                seed = result["seed"]
                rows = result["results"]
                duration = result.get("duration_ms", 0)
                logger.debug(f"Graph expansion {task_type} for {seed.ghcid}: "
                             f"{len(rows)} results in {duration:.0f}ms")
                # Process results based on task type
                if task_type == "city":
                    for row in rows:
                        ghcid = row.get("ghcid", "")
                        if ghcid and ghcid not in seen_ghcids:
                            seen_ghcids.add(ghcid)
                            expanded.append(RetrievedInstitution(
                                ghcid=ghcid,
                                name=row.get("name", ""),
                                uri=row.get("s", ""),
                                graph_score=0.8,  # High score for same city
                                institution_type=row.get("type"),
                                expansion_reason="same_city",
                                related_institutions=[seed.ghcid]
                            ))
                elif task_type == "type":
                    for row in rows:
                        ghcid = row.get("ghcid", "")
                        if ghcid and ghcid not in seen_ghcids:
                            seen_ghcids.add(ghcid)
                            expanded.append(RetrievedInstitution(
                                ghcid=ghcid,
                                name=row.get("name", ""),
                                uri=row.get("s", ""),
                                graph_score=0.5,  # Medium score for same type
                                institution_type=seed.institution_type,
                                city=row.get("city"),
                                expansion_reason="same_type",
                                related_institutions=[seed.ghcid]
                            ))
        total_time = (time.time() - start_time) * 1000
        logger.info(f"Graph expansion completed: {len(tasks)} queries, "
                    f"{len(expanded)} results in {total_time:.0f}ms (parallel)")
        return expanded
    def _combine_and_rank(
        self,
        vector_results: list[RetrievedInstitution],
        graph_results: list[RetrievedInstitution],
        k: int
    ) -> list[RetrievedInstitution]:
        """Combine vector and graph results with weighted scoring and graph inheritance.

        This method implements a hybrid scoring approach:
        1. Direct merge: If a graph result matches a vector result (same GHCID),
           the graph_score is directly applied (max of the two scores wins)
        2. Graph inheritance: Vector results inherit a portion of graph scores from
           related institutions found via graph expansion (same city/type)

        NOTE: this mutates the input RetrievedInstitution objects in place
        (graph_score, related_institutions, expansion_reason, combined_score).

        Args:
            vector_results: Results from vector search
            graph_results: Results from graph expansion
            k: Number of final results

        Returns:
            Combined and ranked results, sorted by combined_score descending
        """
        # Debug logging for investigation
        logger.debug(f"Combining {len(vector_results)} vector + {len(graph_results)} graph results")
        # Create lookup by GHCID for merging
        results_by_ghcid: dict[str, RetrievedInstitution] = {}
        # Track which vector GHCIDs we have for inheritance
        vector_ghcids = set()
        # Add vector results (entries without a GHCID cannot be merged and are dropped)
        for inst in vector_results:
            if inst.ghcid:
                results_by_ghcid[inst.ghcid] = inst
                vector_ghcids.add(inst.ghcid)
                logger.debug(f"  Vector: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) "
                             f"v={inst.vector_score:.3f} g={inst.graph_score:.3f}")
        # Track direct merges and inheritance candidates
        direct_merges = 0
        inheritance_boosts = []
        # Merge graph results and build inheritance map
        # inheritance_map: vector_ghcid -> list of (related_ghcid, graph_score, reason)
        inheritance_map: dict[str, list[tuple[str, float, str]]] = {g: [] for g in vector_ghcids}
        for inst in graph_results:
            logger.debug(f"  Graph: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) "
                         f"g={inst.graph_score:.3f} reason={inst.expansion_reason} "
                         f"related_to={inst.related_institutions}")
            if inst.ghcid in results_by_ghcid:
                # Direct merge: graph result matches existing vector result
                existing = results_by_ghcid[inst.ghcid]
                old_graph_score = existing.graph_score
                existing.graph_score = max(existing.graph_score, inst.graph_score)
                existing.related_institutions.extend(inst.related_institutions)
                if inst.expansion_reason:
                    existing.expansion_reason = inst.expansion_reason
                direct_merges += 1
                logger.debug(f"    -> Direct merge! {inst.ghcid} graph_score: {old_graph_score:.3f} -> {existing.graph_score:.3f}")
            else:
                # New institution from graph expansion
                results_by_ghcid[inst.ghcid] = inst
            # Build inheritance: this graph result was expanded FROM a vector result
            # The related_institutions field contains the seed GHCID(s) it was expanded from
            for seed_ghcid in inst.related_institutions:
                if seed_ghcid in inheritance_map:
                    inheritance_map[seed_ghcid].append(
                        (inst.ghcid, inst.graph_score, inst.expansion_reason or "related")
                    )
        logger.debug(f"Direct merges: {direct_merges}")
        # Apply graph score inheritance to vector results
        # Vector results inherit a portion of graph scores from their related institutions
        INHERITANCE_FACTOR = 0.5  # Inherit 50% of related institutions' graph scores
        for vector_ghcid, related_list in inheritance_map.items():
            if related_list and vector_ghcid in results_by_ghcid:
                inst = results_by_ghcid[vector_ghcid]
                # Calculate inherited score: average of related graph scores * inheritance factor
                related_scores = [score for _, score, _ in related_list]
                inherited_score = (sum(related_scores) / len(related_scores)) * INHERITANCE_FACTOR
                old_graph_score = inst.graph_score
                # Inherit: take max of current graph_score and inherited score
                inst.graph_score = max(inst.graph_score, inherited_score)
                if inst.graph_score > old_graph_score:
                    # Track related institutions for context
                    related_ghcids = [ghcid for ghcid, _, _ in related_list]
                    inst.related_institutions.extend(related_ghcids[:3])  # Add up to 3 related
                    inheritance_boosts.append({
                        "ghcid": vector_ghcid,
                        "name": inst.name,
                        "old_graph": old_graph_score,
                        "new_graph": inst.graph_score,
                        "inherited_from": len(related_list),
                        "reasons": list(set(r for _, _, r in related_list))
                    })
                    logger.debug(f"  Inheritance: {vector_ghcid} graph_score: {old_graph_score:.3f} -> "
                                 f"{inst.graph_score:.3f} (from {len(related_list)} related institutions)")
        if inheritance_boosts:
            logger.info(f"Graph inheritance applied to {len(inheritance_boosts)} vector results: "
                        f"{[b['ghcid'] for b in inheritance_boosts[:3]]}...")
        # Calculate combined scores as the configured weighted sum
        for inst in results_by_ghcid.values():
            inst.combined_score = (
                self.vector_weight * inst.vector_score +
                self.graph_weight * inst.graph_score
            )
        # Sort by combined score
        ranked = sorted(
            results_by_ghcid.values(),
            key=lambda x: x.combined_score,
            reverse=True
        )
        # Log top results for debugging
        logger.debug(f"Top {min(5, len(ranked))} combined results:")
        for i, inst in enumerate(ranked[:5]):
            logger.debug(f"  {i+1}. {inst.ghcid} ({inst.name[:25] if inst.name else '?'}...) "
                         f"combined={inst.combined_score:.3f} (v={inst.vector_score:.3f}, g={inst.graph_score:.3f})")
        return ranked[:k]
def _get_person_collection_vector_size(self) -> int | None:
"""Get the vector size of the person collection."""
try:
info = self.qdrant_client.get_collection("heritage_persons")
if hasattr(info.config.params, 'vectors'):
vectors_config = info.config.params.vectors
if isinstance(vectors_config, dict):
first_config = next(iter(vectors_config.values()), None)
return first_config.size if first_config else None
elif vectors_config is not None:
return vectors_config.size # type: ignore[union-attr]
return None
except Exception as e:
logger.warning(f"Could not get person collection vector size: {e}")
return None
def _person_vector_search(
    self,
    query: str,
    k: int,
    using: str | None = None,
    filter_conditions: dict[str, Any] | None = None,
) -> list[RetrievedPerson]:
    """Perform vector similarity search in Qdrant heritage_persons collection.

    Results are scored as vector_score adjusted by a richness boost and,
    for name-like queries, a name-match boost, then re-sorted.

    Args:
        query: Search query text
        k: Number of results to retrieve
        using: Optional embedding model name (for multi-embedding mode)
        filter_conditions: Optional dict of field->value filters for Qdrant

    Returns:
        List of RetrievedPerson with vector scores, sorted by combined score
    """
    from qdrant_client.http import models

    # The person collection may have been indexed with a different embedding
    # model than the institution collection; unless the caller chose a model
    # explicitly, pick the one matching the collection's vector size.
    person_model = using
    if not using:
        size_to_model = {
            384: "minilm_384",   # person collection indexed with MiniLM
            1536: "openai_1536",
            768: "bge_768",
        }
        person_vector_size = self._get_person_collection_vector_size()
        person_model = size_to_model.get(person_vector_size)
        if person_vector_size == 384:
            logger.info("Person collection uses 384-dim vectors, using MiniLM model")
    query_vector = self._get_embedding(query, using=person_model)
    try:
        # Build query parameters
        search_params: dict[str, Any] = {
            "collection_name": "heritage_persons",
            "query": query_vector,
            "limit": k,
            "with_payload": True,
        }
        # Add named vector 'using' ONLY if collection actually has named vectors.
        # Single-vector collections will error with "Not existing vector name" otherwise.
        if self.use_multi_embedding and self.multi_retriever:
            if self.multi_retriever.uses_named_vectors("heritage_persons"):
                if using:
                    search_params["using"] = using
                elif self._selected_multi_model:
                    search_params["using"] = self._selected_multi_model.value
        # Translate schema-aware filters into Qdrant filter conditions
        if filter_conditions:
            filter_list = []
            for key, value in filter_conditions.items():
                if isinstance(value, dict) and "match" in value:
                    # Advanced match filter (e.g. {"email": {"match": {"text": "nos"}}})
                    filter_list.append(
                        models.FieldCondition(
                            key=key,
                            match=models.MatchText(**value["match"])
                        )
                    )
                else:
                    # Standard exact match value
                    filter_list.append(
                        models.FieldCondition(
                            key=key,
                            match=models.MatchValue(value=value),
                        )
                    )
            search_params["query_filter"] = models.Filter(must=filter_list)
            logger.info(f"[Qdrant] Applied person filters: {filter_conditions}")
        logger.info(f"[Qdrant] Searching '{search_params['collection_name']}' with params: query_filter={filter_conditions}, limit={k}")
        results = self.qdrant_client.query_points(**search_params)
    except Exception as e:
        logger.warning(f"Person collection search failed: {e}")
        return []
    persons = []
    # Hoisted out of the loop: the query does not change per point
    is_name_query = looks_like_person_name(query)
    for point in results.points:
        payload = point.payload or {}
        # Richness score comes pre-indexed (see index_persons_qdrant.py)
        richness_score = payload.get("richness_score", 0.0)
        person = RetrievedPerson(
            # Fall back to a stable hash of slug+name when staff_id is absent
            person_id=payload.get("staff_id", "") or hashlib.md5(
                f"{payload.get('custodian_slug', '')}:{payload.get('name', '')}".encode()
            ).hexdigest()[:16],
            name=payload.get("name", ""),
            vector_score=point.score,
            richness_score=richness_score,
            headline=payload.get("headline"),
            custodian_name=payload.get("custodian_name"),
            custodian_slug=payload.get("custodian_slug"),
            location=payload.get("location"),
            heritage_relevant=payload.get("heritage_relevant", False),
            heritage_type=payload.get("heritage_type"),
            source_type=payload.get("source_type"),
            linkedin_url=payload.get("linkedin_url"),
            has_wcms=payload.get("has_wcms", False),
            # WCMS-specific fields
            wcms_user_id=payload.get("wcms_user_id"),
            wcms_abs_id=payload.get("wcms_abs_id"),
            wcms_crm_id=payload.get("wcms_crm_id"),
            wcms_username=payload.get("wcms_username"),
            wcms_username_url=payload.get("wcms_username_url"),
            wcms_status=payload.get("wcms_status"),
            wcms_roles=payload.get("wcms_roles"),
            wcms_registered_since=payload.get("wcms_registered_since"),
            wcms_last_access=payload.get("wcms_last_access"),
            # Contact details
            email=payload.get("email"),
            email_domain=payload.get("email_domain"),
        )
        # Richness boost: combined_score = vector_score * (0.7 + 0.3 * richness)
        # - richness 0 keeps 70% of the vector score
        # - richness 1 keeps 100%
        # so rich profiles rank above sparse ones at similar similarity.
        person.combined_score = person.vector_score * (0.7 + 0.3 * richness_score)
        # Name-match boost: for name-like queries, promote exact/partial name
        # matches so that searching "Kitty Bogte" returns Kitty Bogte first
        # even if vector similarity ranks other Dutch names higher.
        if is_name_query and person.name:
            name_boost = calculate_name_match_boost(query, person.name)
            if name_boost > 1.0:
                logger.debug(f"Name match boost {name_boost}x for '{person.name}' (query: '{query}')")
                person.combined_score *= name_boost
        persons.append(person)
    # Re-sort by combined score: boosts can reorder the raw vector ranking
    persons.sort(key=lambda p: p.combined_score, reverse=True)
    return persons
def search_persons(
    self,
    query: str,
    k: int | None = None,
    filter_custodian: str | None = None,
    only_heritage_relevant: bool = False,
    only_wcms: bool = False,
    using: str | None = None,
    # Schema-aware filter parameters (from DSPy HeritageQueryRouter)
    target_role_category: str | None = None,
    target_custodian_type: str | None = None,
    # Extra filters for robust domain search (e.g. email substring)
    extra_filters: dict[str, Any] | None = None,
) -> list[RetrievedPerson]:
    """Search for persons/staff in the heritage_persons collection.

    Args:
        query: Natural language search query
        k: Number of results to return (default: k_final)
        filter_custodian: Optional custodian slug to filter by
        only_heritage_relevant: Only return heritage-relevant staff
        only_wcms: Only return WCMS-registered profiles (heritage sector users)
        using: Optional embedding model name (for multi-embedding mode).
            One of: "openai_1536", "minilm_384", "bge_768"
        target_role_category: Role category from DSPy router (CURATORIAL,
            ARCHIVAL, etc.). Applied as headline-based post-filtering since
            it is not indexed in Qdrant.
        target_custodian_type: Custodian type from DSPy router (MUSEUM,
            ARCHIVE, etc.). Converted to heritage_type code for Qdrant filtering.
        extra_filters: Optional extra Qdrant filters
            (e.g. {"email": {"match": {"text": "nos"}}})

    Returns:
        List of RetrievedPerson with scores
    """
    if not k:
        k = self.k_final
    # Build Qdrant filter conditions from the schema-aware parameters,
    # then merge any caller-supplied extras (e.g. email match) on top.
    merged_filters: dict[str, Any] | None = build_schema_aware_person_filter(
        heritage_type_code=get_heritage_type_code(target_custodian_type),
        heritage_relevant_only=only_heritage_relevant,
        custodian_slug=filter_custodian,
        only_wcms=only_wcms,
    ) or {}
    if extra_filters:
        merged_filters.update(extra_filters)
    if not merged_filters:
        merged_filters = None
    logger.info(f"Person search for: {query[:50]}... (model: {using or 'auto'}, role_category: {target_role_category}, custodian_type: {target_custodian_type}, extras: {extra_filters})")
    # Over-fetch to allow for post-filtering and name boosting:
    # - role-category filter needs 3x candidates for keyword filtering,
    #   otherwise 2x is enough
    # - name-like queries fetch at least 100 so the name boost can surface
    #   exact matches that vector similarity ranked below similar names
    name_like = looks_like_person_name(query)
    multiplier = 3 if target_role_category else 2
    fetch_count = max(k * multiplier, 100 if name_like else 0)
    candidates = self._person_vector_search(
        query, fetch_count, using=using, filter_conditions=merged_filters
    )
    logger.info(f"Found {len(candidates)} person results after Qdrant filtering")
    # Role category is not indexed in Qdrant, so filter by headline keywords
    if target_role_category:
        candidates = filter_by_role_category_keywords(candidates, target_role_category)
    candidates.sort(key=lambda p: p.combined_score, reverse=True)
    return candidates[:k]
def search(
    self,
    query: str,
    k: int | None = None,
    expand_graph: bool = True,
    filter_conditions: dict[str, Any] | None = None,
    auto_route: bool = True,
    using: str | None = None,
    region_codes: list[str] | None = None,
    cities: list[str] | None = None,
    institution_types: list[str] | None = None,
) -> list[RetrievedInstitution] | list[RetrievedPerson]:
    """Perform hybrid vector + graph search with automatic query routing.

    When auto_route is True, queries that look like person queries
    (e.g., "Who works at Rijksmuseum?") are redirected to person search.

    Args:
        query: Natural language search query
        k: Number of results to return (default: k_final)
        expand_graph: Whether to perform graph expansion (institution search only)
        filter_conditions: Optional Qdrant filter conditions (legacy, prefer new params)
        auto_route: Automatically detect and route person queries
        using: Optional embedding model name (for multi-embedding mode).
            One of: "openai_1536", "minilm_384", "bge_768"
        region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
            for filtering by province/subdivision
        cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"])
        institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"])

    Returns:
        List of RetrievedInstitution or RetrievedPerson with combined scores
    """
    if not k:
        k = self.k_final
    # Guard clause: hand person-style queries off to the person pipeline
    if auto_route and detect_query_type(query) == "person":
        logger.info(f"Auto-routing to person search for: {query[:50]}...")
        return self.search_persons(query, k=k, using=using)
    # Institution search (original behavior)
    active_filters = [
        text
        for text, value in (
            (f"regions={region_codes}", region_codes),
            (f"cities={cities}", cities),
            (f"types={institution_types}", institution_types),
        )
        if value
    ]
    filter_str = f" [{', '.join(active_filters)}]" if active_filters else ""
    logger.info(f"Vector search for: {query[:50]}...{filter_str} (model: {using or 'auto'})")
    # Step 1: Vector similarity search
    vector_hits = self._vector_search(
        query,
        self.k_vector,
        using=using,
        region_codes=region_codes,
        cities=cities,
        institution_types=institution_types,
    )
    logger.info(f"Found {len(vector_hits)} vector results")
    # Step 2: Graph expansion (if enabled)
    graph_hits = []
    if expand_graph and vector_hits:
        logger.info("Expanding via knowledge graph...")
        graph_hits = self._graph_expand(vector_hits)
        logger.info(f"Found {len(graph_hits)} graph expansion results")
    # Step 3: Combine and rank
    ranked = self._combine_and_rank(vector_hits, graph_hits, k)
    logger.info(f"Returning {len(ranked)} combined results")
    return ranked
def search_institutions(
    self,
    query: str,
    k: int | None = None,
    expand_graph: bool = True,
    filter_conditions: dict[str, Any] | None = None,
    using: str | None = None,
    region_codes: list[str] | None = None,
    cities: list[str] | None = None,
    institution_types: list[str] | None = None,
) -> list[RetrievedInstitution]:
    """Explicit institution search (bypasses auto-routing).

    Args:
        query: Natural language search query
        k: Number of results to return (default: k_final)
        expand_graph: Whether to perform graph expansion
        filter_conditions: Optional Qdrant filter conditions (legacy, prefer new params)
        using: Optional embedding model name (for multi-embedding mode).
            One of: "openai_1536", "minilm_384", "bge_768"
        region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"])
            for filtering by province/subdivision
        cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"])
        institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"])

    Returns:
        List of RetrievedInstitution with combined scores
    """
    # Delegate with auto_route disabled so the result type is guaranteed
    # to be RetrievedInstitution, never RetrievedPerson.
    return self.search(  # type: ignore[return-value]
        query,
        k=k,
        expand_graph=expand_graph,
        filter_conditions=filter_conditions,
        auto_route=False,
        using=using,
        region_codes=region_codes,
        cities=cities,
        institution_types=institution_types,
    )
def __call__(self, query: str, k: int | None = None) -> list[str]:
    """DSPy-compatible interface returning passage texts.

    Supports both institution and person queries with auto-routing.

    Args:
        query: Search query
        k: Number of results

    Returns:
        List of passage texts (institution/person descriptions)
    """
    passages: list[str] = []
    for result in self.search(query, k=k):
        if isinstance(result, RetrievedPerson):
            # Person passage: "Name (role at organization)"
            role = result.headline or "Unknown role"
            org = f" at {result.custodian_name}" if result.custodian_name else ""
            passages.append(f"{result.name} ({role}{org})")
        else:
            # Institution passage: "Name (type) - description"
            passages.append(
                f"{result.name} ({result.institution_type or 'Unknown type'})"
                f" - {result.description or 'No description'}"
            )
    return passages
def get_stats(self) -> dict[str, Any]:
    """Get retriever statistics.

    Each backend is queried best-effort: failures are recorded as an
    "error" entry rather than raised.

    Returns:
        Dict with Qdrant and Oxigraph stats
    """
    stats: dict[str, Any] = {
        "qdrant": {
            "institutions": {},
            "persons": {},
        },
        "oxigraph": {},
        "config": {
            "vector_weight": self.vector_weight,
            "graph_weight": self.graph_weight,
            "k_vector": self.k_vector,
            "k_expand": self.k_expand,
            "k_final": self.k_final
        }
    }
    # Qdrant collection stats (institutions and persons share the same shape)
    for slot, collection in (
        ("institutions", self.collection_name),
        ("persons", "heritage_persons"),
    ):
        try:
            info = self.qdrant_client.get_collection(collection)
            stats["qdrant"][slot] = {
                "collection": collection,
                "points_count": info.points_count,
                "status": info.status.value if info.status else "unknown",
            }
        except Exception as e:
            stats["qdrant"][slot]["error"] = str(e)
    # Oxigraph stats
    try:
        rows = self.sparql_client.query(
            "SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s a hcc:Custodian }"
        )
        if rows:
            stats["oxigraph"]["custodian_count"] = int(rows[0].get("count", 0))
    except Exception as e:
        stats["oxigraph"]["error"] = str(e)
    return stats
def close(self):
    """Release the SPARQL client and, if one was created, the Qdrant client."""
    self.sparql_client.close()
    qdrant = self._qdrant_client
    if qdrant:
        qdrant.close()
def create_hybrid_retriever(
    use_production: bool = False,
    **kwargs
) -> HybridRetriever:
    """Factory function to create a hybrid retriever.

    Args:
        use_production: If True, connect to production endpoints
        **kwargs: Additional arguments for HybridRetriever

    Returns:
        Configured HybridRetriever instance
    """
    if use_production:
        # Fixed production endpoints (TLS, port 443)
        endpoint_config: dict[str, Any] = {
            "qdrant_host": "bronhouder.nl",
            "qdrant_port": 443,
            "sparql_endpoint": "https://bronhouder.nl/sparql",
            "use_production_qdrant": True,
        }
    else:
        # Local/dev endpoints, overridable via environment variables
        endpoint_config = {
            "qdrant_host": os.getenv("QDRANT_HOST", "localhost"),
            "qdrant_port": int(os.getenv("QDRANT_PORT", "6333")),
            "sparql_endpoint": os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query"),
        }
    return HybridRetriever(**endpoint_config, **kwargs)