From 80eb3d969c0726912dc9bf98bda1f4248fc8d38a Mon Sep 17 00:00:00 2001 From: kempersc Date: Tue, 27 Jan 2026 10:07:16 +0100 Subject: [PATCH] Add new slots for heritage custodian ontology - Introduced `has_api_version`, `has_appellation_language`, `has_appellation_type`, `has_appellation_value`, `has_applicable_country`, `has_application_deadline`, `has_application_opening_date`, `has_appraisal_note`, `has_approval_date`, `has_archdiocese_name`, `has_architectural_style`, `has_archival_reference`, `has_archive_description`, `has_archive_memento_uri`, `has_archive_name`, `has_archive_path`, `has_archive_search_score`, `has_arrangement`, `has_arrangement_level`, `has_arrangement_note`, `has_articles_archival_stage`, `has_articles_document_format`, `has_articles_document_url`, `has_articles_of_association`, `has_or_had_altitude`, `has_or_had_annotation`, `has_or_had_arrangement`, `has_or_had_document`, `has_or_had_reason`, `has_or_had_style`, `is_or_was_amended_through`, `is_or_was_approved_on`, `is_or_was_archived_as`, `is_or_was_due_on`, `is_or_was_opened_on`, and `is_or_was_used_in` slots. - Each slot includes detailed descriptions, range specifications, and appropriate mappings to existing ontologies. 
--- .../rules/no-deletion-from-slot-fixes.md | 17 + ...reserve-bespoke-slots-until-refactoring.md | 32 + backend/rag/hybrid_retriever.py | 2534 +++++++++++++++++ backend/rag/main.py | 56 +- backend/rag/multi_embedding_retriever.py | 846 ++++++ .../schemas/20251121/linkml/manifest.json | 2 +- schemas/20251121/linkml/manifest.json | 2 +- .../linkml/modules/classes/APIEndpoint.yaml | 7 + .../linkml/modules/classes/APIRequest.yaml | 8 + .../linkml/modules/classes/APIVersion.yaml | 7 + .../linkml/modules/classes/Altitude.yaml | 7 + .../modules/classes/AmendmentEvent.yaml | 8 + .../modules/classes/AnnexCreationEvent.yaml | 8 + .../modules/classes/AppellationType.yaml | 6 + .../linkml/modules/classes/Archdiocese.yaml | 6 + .../modules/classes/ArchitecturalStyle.yaml | 7 + .../modules/classes/ArchivalReference.yaml | 7 + .../linkml/modules/classes/Arrangement.yaml | 9 + .../modules/classes/ArrangementLevel.yaml | 7 + .../modules/classes/ArrangementType.yaml | 6 + .../classes/ArticlesOfAssociation.yaml | 20 +- .../linkml/modules/classes/Budget.yaml | 10 +- .../modules/classes/CallForApplication.yaml | 155 +- .../modules/classes/CustodianLegalStatus.yaml | 11 +- .../modules/classes/GeoSpatialPlace.yaml | 5 +- .../20251121/linkml/modules/classes/Loan.yaml | 134 +- .../linkml/modules/classes/Memento.yaml | 7 + .../modules/classes/ProvenancePath.yaml | 6 + .../linkml/modules/classes/Reason.yaml | 7 + .../modules/classes/RecordCycleStatus.yaml | 7 + .../linkml/modules/classes/SearchScore.yaml | 6 + .../modules/classes/VideoAnnotation.yaml | 47 +- .../{ => archive}/administrative_context.yaml | 0 .../slots/{ => archive}/based_on_claim.yaml | 0 .../slots/{ => archive}/has_altitude.yaml | 0 .../{ => archive}/has_amendment_history.yaml | 0 .../{ => archive}/has_annex_description.yaml | 0 .../slots/{ => archive}/has_annex_name.yaml | 0 .../slots/{ => archive}/has_annex_reason.yaml | 0 ...otation_motivation_archived_20260127.yaml} | 0 ...annotation_segment_archived_20260127.yaml} | 0 
...as_annotation_type_archived_20260127.yaml} | 0 .../slots/{ => archive}/has_api_version.yaml | 0 .../has_appellation_language.yaml | 0 .../{ => archive}/has_appellation_type.yaml | 0 .../{ => archive}/has_appellation_value.yaml | 0 .../{ => archive}/has_applicable_country.yaml | 0 .../has_application_deadline.yaml | 0 .../has_application_opening_date.yaml | 0 .../{ => archive}/has_appraisal_note.yaml | 0 .../{ => archive}/has_approval_date.yaml | 0 .../{ => archive}/has_archdiocese_name.yaml | 0 .../has_architectural_style.yaml | 0 .../{ => archive}/has_archival_reference.yaml | 0 .../has_archive_description.yaml | 0 .../has_archive_memento_uri.yaml | 0 .../slots/{ => archive}/has_archive_name.yaml | 0 .../slots/{ => archive}/has_archive_path.yaml | 0 .../has_archive_search_score.yaml | 0 .../slots/{ => archive}/has_arrangement.yaml | 0 .../{ => archive}/has_arrangement_level.yaml | 0 .../{ => archive}/has_arrangement_note.yaml | 0 .../has_articles_archival_stage.yaml | 0 .../has_articles_document_format.yaml | 0 .../has_articles_document_url.yaml | 0 .../has_articles_of_association.yaml | 0 .../modules/slots/has_or_had_altitude.yaml | 5 + .../modules/slots/has_or_had_annotation.yaml | 5 + .../modules/slots/has_or_had_arrangement.yaml | 5 + .../modules/slots/has_or_had_document.yaml | 5 + .../slots/has_or_had_provenance_path.yaml | 6 +- .../modules/slots/has_or_had_rationale.yaml | 18 +- .../modules/slots/has_or_had_reason.yaml | 5 + .../modules/slots/has_or_had_style.yaml | 5 + .../slots/is_or_was_amended_through.yaml | 5 + .../modules/slots/is_or_was_approved_on.yaml | 5 + .../modules/slots/is_or_was_archived_as.yaml | 5 + .../modules/slots/is_or_was_due_on.yaml | 5 + .../modules/slots/is_or_was_opened_on.yaml | 5 + .../modules/slots/is_or_was_used_in.yaml | 5 + .../linkml/modules/slots/slot_fixes.yaml | 460 +-- schemas/20251121/linkml/update_manifest.py | 36 +- 82 files changed, 3786 insertions(+), 791 deletions(-) create mode 100644 
.opencode/rules/no-deletion-from-slot-fixes.md create mode 100644 .opencode/rules/preserve-bespoke-slots-until-refactoring.md create mode 100644 backend/rag/hybrid_retriever.py create mode 100644 backend/rag/multi_embedding_retriever.py create mode 100644 schemas/20251121/linkml/modules/classes/APIEndpoint.yaml create mode 100644 schemas/20251121/linkml/modules/classes/APIRequest.yaml create mode 100644 schemas/20251121/linkml/modules/classes/APIVersion.yaml create mode 100644 schemas/20251121/linkml/modules/classes/Altitude.yaml create mode 100644 schemas/20251121/linkml/modules/classes/AmendmentEvent.yaml create mode 100644 schemas/20251121/linkml/modules/classes/AnnexCreationEvent.yaml create mode 100644 schemas/20251121/linkml/modules/classes/AppellationType.yaml create mode 100644 schemas/20251121/linkml/modules/classes/Archdiocese.yaml create mode 100644 schemas/20251121/linkml/modules/classes/ArchitecturalStyle.yaml create mode 100644 schemas/20251121/linkml/modules/classes/ArchivalReference.yaml create mode 100644 schemas/20251121/linkml/modules/classes/Arrangement.yaml create mode 100644 schemas/20251121/linkml/modules/classes/ArrangementLevel.yaml create mode 100644 schemas/20251121/linkml/modules/classes/ArrangementType.yaml create mode 100644 schemas/20251121/linkml/modules/classes/Memento.yaml create mode 100644 schemas/20251121/linkml/modules/classes/ProvenancePath.yaml create mode 100644 schemas/20251121/linkml/modules/classes/Reason.yaml create mode 100644 schemas/20251121/linkml/modules/classes/RecordCycleStatus.yaml create mode 100644 schemas/20251121/linkml/modules/classes/SearchScore.yaml rename schemas/20251121/linkml/modules/slots/{ => archive}/administrative_context.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/based_on_claim.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_altitude.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_amendment_history.yaml (100%) rename 
schemas/20251121/linkml/modules/slots/{ => archive}/has_annex_description.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_annex_name.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_annex_reason.yaml (100%) rename schemas/20251121/linkml/modules/slots/{has_annotation_motivation.yaml => archive/has_annotation_motivation_archived_20260127.yaml} (100%) rename schemas/20251121/linkml/modules/slots/{has_annotation_segment.yaml => archive/has_annotation_segment_archived_20260127.yaml} (100%) rename schemas/20251121/linkml/modules/slots/{has_annotation_type.yaml => archive/has_annotation_type_archived_20260127.yaml} (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_api_version.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_appellation_language.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_appellation_type.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_appellation_value.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_applicable_country.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_application_deadline.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_application_opening_date.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_appraisal_note.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_approval_date.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_archdiocese_name.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_architectural_style.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_archival_reference.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_archive_description.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_archive_memento_uri.yaml (100%) rename 
schemas/20251121/linkml/modules/slots/{ => archive}/has_archive_name.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_archive_path.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_archive_search_score.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_arrangement.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_arrangement_level.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_arrangement_note.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_articles_archival_stage.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_articles_document_format.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_articles_document_url.yaml (100%) rename schemas/20251121/linkml/modules/slots/{ => archive}/has_articles_of_association.yaml (100%) create mode 100644 schemas/20251121/linkml/modules/slots/has_or_had_altitude.yaml create mode 100644 schemas/20251121/linkml/modules/slots/has_or_had_annotation.yaml create mode 100644 schemas/20251121/linkml/modules/slots/has_or_had_arrangement.yaml create mode 100644 schemas/20251121/linkml/modules/slots/has_or_had_document.yaml create mode 100644 schemas/20251121/linkml/modules/slots/has_or_had_reason.yaml create mode 100644 schemas/20251121/linkml/modules/slots/has_or_had_style.yaml create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_amended_through.yaml create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_approved_on.yaml create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_archived_as.yaml create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_due_on.yaml create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_opened_on.yaml create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_used_in.yaml diff --git a/.opencode/rules/no-deletion-from-slot-fixes.md 
b/.opencode/rules/no-deletion-from-slot-fixes.md new file mode 100644 index 0000000000..ae5a6ac855 --- /dev/null +++ b/.opencode/rules/no-deletion-from-slot-fixes.md @@ -0,0 +1,17 @@ +# Rule: Do Not Delete Entries from slot_fixes.yaml + +**CRITICAL**: Entries in `schemas/20251121/linkml/modules/slots/slot_fixes.yaml` MUST NEVER be deleted. + +This file serves as a persistent audit log and migration tracking registry. + +**Protocol**: +1. **Process** the migration specified in the `revision` section. +2. **Update** the `processed` section: + * Set `status: true`. + * Add a `notes` field describing the action taken (e.g., "Migrated to has_or_had_name + PersonName class. Slot archived."). + * Add a `date` field (YYYY-MM-DD). +3. **Keep** the original entry intact. + +**Forbidden**: +* ❌ Deleting a processed block. +* ❌ Removing an entry because the slot file doesn't exist (mark as processed with note "Slot file not found, skipped"). diff --git a/.opencode/rules/preserve-bespoke-slots-until-refactoring.md b/.opencode/rules/preserve-bespoke-slots-until-refactoring.md new file mode 100644 index 0000000000..4df0810fe3 --- /dev/null +++ b/.opencode/rules/preserve-bespoke-slots-until-refactoring.md @@ -0,0 +1,32 @@ +# Rule: Preserve Bespoke Slots Until Refactoring + +**Identifier**: `preserve-bespoke-slots-until-refactoring` +**Severity**: **CRITICAL** + +## Core Directive + +**DO NOT remove or migrate "additional" bespoke slots during generic migration passes unless they are the specific target of the current task.** + +## Context + +When migrating a specific slot (e.g., `has_approval_date`), you may encounter other bespoke or legacy slots in the same class file (e.g., `innovation_budget`, `operating_budget`). + +**YOU MUST**: +* ✅ Migrate ONLY the specific slot you were instructed to work on. +* ✅ Leave other bespoke slots exactly as they are. +* ✅ Focus strictly on the current migration target. 
+ +**YOU MUST NOT**: +* ❌ Proactively migrate "nearby" slots just because they look like they need refactoring. +* ❌ Remove slots that seem unused or redundant without specific instruction. +* ❌ "Clean up" the class file by removing legacy attributes. + +## Rationale + +Refactoring is a separate, planned phase. Mixing opportunistic refactoring with systematic slot migration increases the risk of regression and makes changes harder to review. "We will refactor those later." + +## Workflow + +1. **Identify Target**: Identify the specific slot(s) assigned for migration (from `slot_fixes.yaml` or user prompt). +2. **Execute Migration**: Apply changes ONLY for those slots. +3. **Ignore Others**: Do not touch other slots in the file, even if they violate other rules (like Rule 39 or Rule 53). Those will be handled in their own dedicated tasks. diff --git a/backend/rag/hybrid_retriever.py b/backend/rag/hybrid_retriever.py new file mode 100644 index 0000000000..946eef891d --- /dev/null +++ b/backend/rag/hybrid_retriever.py @@ -0,0 +1,2534 @@ +""" +Hybrid Retriever: Vector Search + Knowledge Graph Expansion + +Combines Qdrant vector similarity search with Oxigraph SPARQL graph expansion +to provide semantically-aware and structurally-enriched retrieval. + +Architecture: + 1. Vector Search (Qdrant) - Find semantically similar institutions AND persons + 2. Graph Expansion (Oxigraph) - Expand via relationships: + - Same city/region + - Same institution type + - Related collections + - Organizational relationships + 3. Re-ranking - Combine scores for final ranking + 4. 
Query Routing - Detect if query is about institutions or persons + +Collections: + - heritage_custodians: Institution data (27K+ records) + - heritage_persons: Staff/person data (10K+ records) + +Example usage: + retriever = HybridRetriever( + qdrant_host="localhost", + qdrant_port=6333, + sparql_endpoint="http://localhost:7878/query" + ) + + # Institution search + results = retriever.search("museums with Dutch colonial history") + + # Person search (auto-detected or explicit) + results = retriever.search("Who works at the Nationaal Archief?") + results = retriever.search_persons("archivist at Rijksmuseum") +""" + +import hashlib +import logging +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from typing import Any, TYPE_CHECKING + +import httpx + +# Polygon filter for geographic containment testing (Dutch provinces) +from glam_extractor.geocoding.polygon_filter import ( + get_polygon_filter, + ProvincePolygonFilter, +) + +if TYPE_CHECKING: + from qdrant_client import QdrantClient + from openai import OpenAI + from sentence_transformers import SentenceTransformer + # Forward reference as string to avoid circular imports + MultiEmbeddingRetriever = Any # Actually from glam_extractor.api.multi_embedding_retriever + EmbeddingModel = Any # Actually from glam_extractor.api.multi_embedding_retriever + +logger = logging.getLogger(__name__) + + +# SPARQL endpoint configuration +DEFAULT_SPARQL_ENDPOINT = os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query") +DEFAULT_SPARQL_TIMEOUT = 30.0 + +# Ontology prefixes used in Oxigraph +SPARQL_PREFIXES = """ +PREFIX hc: +PREFIX hcc: +PREFIX ghc: +PREFIX skos: +PREFIX wdt: +PREFIX wd: +PREFIX schema: +PREFIX geo: +PREFIX rdfs: +PREFIX rdf: +""" + + +@dataclass +class RetrievedInstitution: + """A retrieved heritage institution with combined scores.""" + + ghcid: str + name: str + uri: str + vector_score: float = 0.0 + graph_score: float = 0.0 + 
combined_score: float = 0.0 + + # Metadata from vector search + institution_type: str | None = None + country: str | None = None + city: str | None = None + description: str | None = None + + # Geographic coordinates + latitude: float | None = None + longitude: float | None = None + + # Graph expansion data + related_institutions: list[str] = field(default_factory=list) + expansion_reason: str | None = None # e.g., "same_city", "same_type", "related_collection" + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for API responses.""" + return { + "ghcid": self.ghcid, + "name": self.name, + "uri": self.uri, + "scores": { + "vector": round(self.vector_score, 4), + "graph": round(self.graph_score, 4), + "combined": round(self.combined_score, 4), + }, + "metadata": { + "institution_type": self.institution_type, + "country": self.country, + "city": self.city, + "description": self.description, + "latitude": self.latitude, + "longitude": self.longitude, + }, + "graph_expansion": { + "related_institutions": self.related_institutions, + "expansion_reason": self.expansion_reason, + } + } + + +# =================================================================== +# Linked Data URI Generation Utilities +# =================================================================== +# Generate stable ontology-aligned URIs for Person and PersonObservation +# following the LinkML schema at schemas/20251121/linkml/ +# Namespace: https://nde.nl/ontology/hc/ +# =================================================================== + +import re +import unicodedata + +# Ontology namespaces +ONTOLOGY_BASE = "https://nde.nl/ontology/hc" +PERSON_HUB_PREFIX = f"{ONTOLOGY_BASE}/person" +PERSON_OBS_PREFIX = f"{ONTOLOGY_BASE}/person-obs" +CUSTODIAN_PREFIX = f"{ONTOLOGY_BASE}/custodian" + +# JSON-LD context for person search responses +PERSON_JSONLD_CONTEXT = { + "@vocab": f"{ONTOLOGY_BASE}/", + "schema": "http://schema.org/", + "pico": "https://personsincontext.org/model#", + "prov": 
"http://www.w3.org/ns/prov#", + "foaf": "http://xmlns.com/foaf/0.1/", + "name": "schema:name", + "jobTitle": "schema:jobTitle", + "affiliation": "schema:affiliation", + "sameAs": "schema:sameAs", + "refers_to_person": "pico:observationOf", + "observation_source": "prov:hadPrimarySource", +} + + +def generate_slug(text: str) -> str: + """Generate URL-safe slug from text. + + Examples: + "Kitty Bogte" → "kitty-bogte" + "Dr. Jane Smith" → "dr-jane-smith" + "Taco Dibbits" → "taco-dibbits" + """ + if not text: + return "unknown" + + # Normalize unicode (NFD decomposition) and remove diacritics + normalized = unicodedata.normalize('NFD', text) + ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') + + # Convert to lowercase + lowercase = ascii_text.lower() + + # Replace non-alphanumeric with hyphens + slug = re.sub(r'[^a-z0-9]+', '-', lowercase) + + # Collapse multiple hyphens and strip leading/trailing + slug = re.sub(r'-+', '-', slug).strip('-') + + return slug or "unknown" + + +def generate_role_slug(headline: str | None) -> str: + """Generate role slug from job title/headline. + + Examples: + "Programmer/curator" → "programmer-curator" + "Senior Archivist" → "senior-archivist" + None → "staff" + """ + if not headline: + return "staff" + return generate_slug(headline) + + +def generate_person_hub_uri(name: str, linkedin_slug: str | None = None) -> str: + """Generate Person hub URI (abstract identity). + + Format: https://nde.nl/ontology/hc/person/{person-slug} + + Uses LinkedIn slug if available for stability, otherwise derives from name. + + Examples: + generate_person_hub_uri("Kitty Bogte", "kittybogte") + → "https://nde.nl/ontology/hc/person/kittybogte" + generate_person_hub_uri("Dr. 
Jane Smith") + → "https://nde.nl/ontology/hc/person/dr-jane-smith" + """ + if linkedin_slug: + slug = linkedin_slug + else: + slug = generate_slug(name) + + return f"{PERSON_HUB_PREFIX}/{slug}" + + +def generate_observation_uri( + custodian_slug: str | None, + person_name: str, + role_slug: str | None = None, + linkedin_slug: str | None = None +) -> str: + """Generate PersonObservation URI. + + Format: https://nde.nl/ontology/hc/person-obs/{custodian-slug}/{person-slug}/{role-slug} + + Examples: + generate_observation_uri("nl-ga-nationaal-archief", "Kitty Bogte", "programmer-curator") + → "https://nde.nl/ontology/hc/person-obs/nl-ga-nationaal-archief/kitty-bogte/programmer-curator" + """ + custodian = custodian_slug or "unknown-custodian" + person = linkedin_slug or generate_slug(person_name) + role = role_slug or "staff" + + return f"{PERSON_OBS_PREFIX}/{custodian}/{person}/{role}" + + +def generate_custodian_uri(custodian_slug: str | None, ghcid: str | None = None) -> str | None: + """Generate Custodian URI. + + Format: https://nde.nl/ontology/hc/custodian/{ghcid-or-slug} + """ + if ghcid: + return f"{CUSTODIAN_PREFIX}/{ghcid}" + elif custodian_slug: + return f"{CUSTODIAN_PREFIX}/{custodian_slug}" + return None + + +def extract_linkedin_slug(linkedin_url: str | None) -> str | None: + """Extract slug from LinkedIn URL. 
+ + Examples: + "https://www.linkedin.com/in/kittybogte" → "kittybogte" + "https://linkedin.com/in/jane-smith-12345" → "jane-smith-12345" + """ + if not linkedin_url: + return None + + match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url) + return match.group(1) if match else None + + +@dataclass +class RetrievedPerson: + """A retrieved person/staff member with search scores and linked data URIs.""" + + person_id: str + name: str + vector_score: float = 0.0 + combined_score: float = 0.0 + richness_score: float = 0.0 # Metadata richness score (0-1) + + # Metadata from vector search + headline: str | None = None # Job title/role + custodian_name: str | None = None # Organization they work at + custodian_slug: str | None = None + location: str | None = None + heritage_relevant: bool = False + heritage_type: str | None = None # GLAMORCUBESFIXPHDNT code + source_type: str | None = None # "staff_list" or "entity_profile" + linkedin_url: str | None = None + has_wcms: bool = False # WCMS-registered profile (heritage sector user) + + # WCMS-specific fields for display on review page + wcms_user_id: str | None = None + wcms_abs_id: str | None = None # NAN identifier + wcms_crm_id: str | None = None + wcms_username: str | None = None + wcms_username_url: str | None = None + wcms_status: str | None = None # "Active" or "Blocked" + wcms_roles: list[str] | None = None + wcms_registered_since: str | None = None + wcms_last_access: str | None = None + + # Contact details + email: str | None = None + email_domain: str | None = None + + # Linked data fields (generated) + linkedin_profile_path: str | None = None # Path to entity JSON file + + @property + def linkedin_slug(self) -> str | None: + """Extract LinkedIn slug from URL.""" + return extract_linkedin_slug(self.linkedin_url) + + @property + def person_hub_uri(self) -> str: + """Generate Person hub URI (abstract identity).""" + return generate_person_hub_uri(self.name, self.linkedin_slug) + + @property + def 
observation_uri(self) -> str: + """Generate PersonObservation URI.""" + role_slug = generate_role_slug(self.headline) + return generate_observation_uri( + self.custodian_slug, + self.name, + role_slug, + self.linkedin_slug + ) + + @property + def custodian_uri(self) -> str | None: + """Generate Custodian URI.""" + return generate_custodian_uri(self.custodian_slug) + + def to_dict(self, include_jsonld: bool = True) -> dict[str, Any]: + """Convert to dictionary for API responses. + + Args: + include_jsonld: If True, include JSON-LD linked data fields (@id, @type, etc.) + """ + result = { + "person_id": self.person_id, + "name": self.name, + "scores": { + "vector": round(self.vector_score, 4), + "combined": round(self.combined_score, 4), + "richness": round(self.richness_score, 4), + }, + "metadata": { + "headline": self.headline, + "custodian_name": self.custodian_name, + "custodian_slug": self.custodian_slug, + "location": self.location, + "heritage_relevant": self.heritage_relevant, + "heritage_type": self.heritage_type, + "source_type": self.source_type, + "linkedin_url": self.linkedin_url, + "has_wcms": self.has_wcms, + # WCMS fields for review page + "wcms_user_id": self.wcms_user_id, + "wcms_abs_id": self.wcms_abs_id, + "wcms_crm_id": self.wcms_crm_id, + "wcms_username": self.wcms_username, + "wcms_username_url": self.wcms_username_url, + "wcms_status": self.wcms_status, + "wcms_roles": self.wcms_roles, + "wcms_registered_since": self.wcms_registered_since, + "wcms_last_access": self.wcms_last_access, + # Contact details + "email": self.email, + "email_domain": self.email_domain, + } + } + + if include_jsonld: + # Add JSON-LD linked data fields + result["@id"] = self.observation_uri + result["@type"] = "pico:PersonObservation" + result["refers_to_person"] = self.person_hub_uri + + # Add custodian affiliation if available + if self.custodian_uri: + result["unit_affiliation"] = self.custodian_uri + + # Add schema:sameAs for LinkedIn URL + if self.linkedin_url: + 
result["schema:sameAs"] = self.linkedin_url + + # Add linkedin_profile_path if available + if self.linkedin_profile_path: + result["linkedin_profile_path"] = self.linkedin_profile_path + + return result + + +# Query type detection patterns +PERSON_QUERY_PATTERNS = [ + # Dutch + "wie werkt", "wie werk", "werken in", "werken bij", "medewerker", "personeel", + "staff", "werknemer", "expert", "experts", "specialist", "specialisten", + "directeur", "curator", "archivaris", "bibliothecaris", "conservator", + "team", "collega", "collegas", "mensen bij", "werkzaam", + # English + "who works", "staff at", "employees", "team at", "people at", "work at", + "director of", "curator at", "archivist", "librarian", "works at", + "experts at", "specialists", "professionals at", + # Generic + "linkedin", "person", "professional", +] + +# =================================================================== +# Dutch Province/Subdivision Code Mapping (ISO 3166-2:NL) +# =================================================================== +# Maps province names (lowercase, various spellings) to ISO 3166-2 codes +# Used for filtering Qdrant queries by region +# Qdrant payload field: "region" (stores short codes like "NH", "ZH") +# =================================================================== + +DUTCH_PROVINCE_CODES: dict[str, str] = { + # Noord-Holland + "noord-holland": "NH", + "noordholland": "NH", + "north holland": "NH", + "north-holland": "NH", + # Zuid-Holland + "zuid-holland": "ZH", + "zuidholland": "ZH", + "south holland": "ZH", + "south-holland": "ZH", + # Utrecht + "utrecht": "UT", + # Gelderland + "gelderland": "GE", + # Noord-Brabant + "noord-brabant": "NB", + "noordbrabant": "NB", + "brabant": "NB", + "north brabant": "NB", + # Limburg + "limburg": "LI", + # Overijssel + "overijssel": "OV", + # Friesland / Fryslân + "friesland": "FR", + "fryslân": "FR", + "fryslan": "FR", + # Groningen + "groningen": "GR", + # Drenthe + "drenthe": "DR", + # Flevoland + "flevoland": "FL", 
+ # Zeeland + "zeeland": "ZE", +} + + +def get_province_code(province_name: str | None) -> str | None: + """Convert Dutch province name to ISO 3166-2 subdivision code (without country prefix). + + Args: + province_name: Province name in Dutch or English (case-insensitive) + + Returns: + Two-letter province code (e.g., "NH", "ZH") or None if not found + + Example: + >>> get_province_code("Noord-Holland") + 'NH' + >>> get_province_code("south holland") + 'ZH' + >>> get_province_code("Bavaria") + None + """ + if not province_name: + return None + return DUTCH_PROVINCE_CODES.get(province_name.lower().strip()) + +def looks_like_person_name(query: str) -> bool: + """Detect if query looks like a person's name for name-boosted search. + + A query looks like a person name if it: + - Contains 2-4 capitalized words (first/last name pattern) + - Does NOT contain common non-name words (institutions, locations, etc.) + - Does NOT contain question words (who, what, where, etc.) + + Args: + query: Search query string + + Returns: + True if query appears to be a person name + + Examples: + >>> looks_like_person_name("Kitty Bogte") + True + >>> looks_like_person_name("Who works at the Rijksmuseum?") + False + >>> looks_like_person_name("archivist at Nationaal Archief") + False + """ + # Skip if query contains question words or common phrases + non_name_indicators = [ + # Question words + "who", "what", "where", "which", "how", "why", + "wie", "wat", "waar", "welk", "hoe", "waarom", + # Role/job indicators + "works at", "working at", "werkt bij", "werkzaam", + "archivist", "curator", "director", "librarian", + "archivaris", "directeur", "bibliothecaris", + # Prepositions indicating context + " at ", " in ", " of ", " for ", " the ", + " bij ", " in ", " van ", " voor ", " de ", " het ", + # Punctuation that indicates non-name queries + "?", "!", + ] + + query_lower = query.lower() + for indicator in non_name_indicators: + if indicator in query_lower: + return False + + # Check for 
capitalized word pattern (typical of names) + words = query.strip().split() + if len(words) < 2 or len(words) > 4: + return False + + # Check if words look like name components (capitalized or all letters) + capitalized_count = sum(1 for w in words if w[0].isupper() and w.isalpha()) + + # Most name words should be capitalized + return capitalized_count >= len(words) - 1 # Allow one lowercase (e.g., "van", "de") + + +def calculate_name_match_boost(query: str, name: str) -> float: + """Calculate a score boost for name matching. + + Uses case-insensitive substring matching to boost results where + the query matches part or all of the person's name. + + Args: + query: Search query (potential name) + name: Person's name from search result + + Returns: + Boost factor (1.0 = no boost, >1.0 = boosted) + - 3.0: Exact match (case-insensitive) + - 2.5: Query contains full name or name contains full query + - 2.0: Partial match (first or last name matches) + - 1.0: No match + """ + query_lower = query.lower().strip() + name_lower = name.lower().strip() + + # Exact match + if query_lower == name_lower: + return 3.0 + + # Query is substring of name or vice versa + if query_lower in name_lower or name_lower in query_lower: + return 2.5 + + # Check for partial matches (first or last name) + query_parts = set(query_lower.split()) + name_parts = set(name_lower.split()) + + # How many query parts match name parts? + matching_parts = query_parts & name_parts + if matching_parts: + # More matching parts = higher boost + match_ratio = len(matching_parts) / max(len(query_parts), len(name_parts)) + return 1.0 + match_ratio # 1.5-2.0 range for partial matches + + return 1.0 # No boost + + +def detect_query_type(query: str, dspy_entity_type: str | None = None) -> str: + """Detect if query is about institutions or persons. + + Uses DSPy LLM classification if provided, falls back to keyword heuristics. 
+ + Args: + query: Search query string + dspy_entity_type: Optional entity_type from DSPy HeritageQueryRouter + ("person", "institution", or "both") + + Returns: + "person" or "institution" + """ + # Prefer DSPy semantic classification when available + if dspy_entity_type: + if dspy_entity_type in ("person", "both"): + return "person" + if dspy_entity_type == "institution": + return "institution" + + # Fallback to keyword heuristics + query_lower = query.lower() + + for pattern in PERSON_QUERY_PATTERNS: + if pattern in query_lower: + return "person" + + return "institution" + + +# =================================================================== +# Schema-Aware Filter Mapping for DSPy Heritage Query Router +# =================================================================== +# +# These mappings are now loaded DYNAMICALLY from the LinkML schema files +# via the ontology_mapping module. This ensures: +# 1. Schema is the single source of truth (no hardcoded values) +# 2. Multilingual support (Dutch, German, French, Spanish, etc.) +# 3. Automatic updates when schema changes +# +# The ontology_mapping module extracts synonyms from YAML comments +# and provides fuzzy matching for natural language queries. +# =================================================================== + +def _get_custodian_type_mapping() -> dict[str, str]: + """Get custodian type to heritage code mapping from schema. + + Dynamically loads from CustodianPrimaryTypeEnum in LinkML schema. + Falls back to minimal hardcoded mapping if schema unavailable. 
+
+    Returns:
+        Dict mapping custodian type (e.g., "MUSEUM") to heritage code (e.g., "M")
+    """
+    try:
+        # Try backend.rag path first (when backend is in Python path)
+        from backend.rag.ontology_mapping import get_custodian_type_mapping
+        mapping = get_custodian_type_mapping()
+        if mapping:
+            return mapping
+    except ImportError:
+        try:
+            # Fallback: try direct import (when ontology_mapping is in sys.path)
+            from ontology_mapping import get_custodian_type_mapping  # type: ignore[import-not-found]
+            mapping = get_custodian_type_mapping()
+            if mapping:
+                return mapping
+        except ImportError:
+            logger.warning("ontology_mapping not available, using fallback mapping")
+        # NOTE(review): only ImportError is caught on this fallback path; a
+        # runtime error raised by get_custodian_type_mapping() inside this
+        # handler would propagate — the outer `except Exception` below does
+        # not cover exceptions raised while handling the first ImportError.
+    except Exception as e:
+        logger.warning(f"Failed to load custodian type mapping from schema: {e}")
+
+    # Fallback: minimal GLAMORCUBESFIXPHDNT mapping
+    # (one single-letter heritage code per custodian primary type; an empty
+    # mapping returned by the schema loader also lands here).
+    return {
+        "GALLERY": "G", "LIBRARY": "L", "ARCHIVE": "A", "MUSEUM": "M",
+        "OFFICIAL_INSTITUTION": "O", "RESEARCH_CENTER": "R", "CORPORATION": "C",
+        "UNKNOWN": "U", "BIO_CUSTODIAN": "B", "EDUCATION_PROVIDER": "E",
+        "COLLECTING_SOCIETY": "S", "FEATURE": "F", "INTANGIBLE_HERITAGE_GROUP": "I",
+        "MIXED": "X", "PERSONAL_COLLECTION": "P", "HOLY_SITE": "H",
+        "DIGITAL_PLATFORM": "D", "NGO": "N", "TASTE_SMELL_HERITAGE": "T",
+    }
+
+
+def _get_role_category_keywords() -> dict[str, list[str]]:
+    """Get role category keywords from schema.
+
+    Dynamically loads from RoleCategoryEnum in LinkML schema.
+    Falls back to hardcoded keywords if schema unavailable.
+
+    Returns:
+        Dict mapping role category (e.g., "CURATORIAL") to keywords list
+    """
+    try:
+        # Try backend.rag path first (when backend is in Python path)
+        from backend.rag.ontology_mapping import get_role_keywords
+        keywords = get_role_keywords()
+        if keywords:
+            return keywords
+    except ImportError:
+        try:
+            # Fallback: try direct import (when ontology_mapping is in sys.path)
+            from ontology_mapping import get_role_keywords  # type: ignore[import-not-found]
+            keywords = get_role_keywords()
+            if keywords:
+                return keywords
+        except ImportError:
+            logger.warning("ontology_mapping not available, using fallback role keywords")
+    except Exception as e:
+        logger.warning(f"Failed to load role keywords from schema: {e}")
+
+    # Fallback: essential role category keywords (hardcoded).
+    # Keywords mix English and Dutch terms; they are matched
+    # case-insensitively against profile headlines by
+    # filter_by_role_category_keywords().
+    return {
+        "CURATORIAL": [
+            "curator", "curatorial", "collectie", "collection", "tentoonstellingen",
+            "exhibitions", "acquisitions", "registrar", "museum professional"
+        ],
+        "CONSERVATION": [
+            "conservator", "conservation", "restaurator", "restoration", "preservatie",
+            "preservation", "materiaal", "material", "preventive"
+        ],
+        "ARCHIVAL": [
+            "archivist", "archivaris", "archief", "archive", "records", "documentalist",
+            "erfgoed", "heritage records", "acquisitie", "beschrijving"
+        ],
+        "LIBRARY": [
+            "bibliothecaris", "librarian", "bibliotheek", "library", "catalogus",
+            "cataloging", "metadata", "special collections", "reference"
+        ],
+        "DIGITAL": [
+            "digital", "digitaal", "developer", "data", "software", "IT", "tech",
+            "engineer", "digitalisering", "digitization", "web", "database"
+        ],
+        "EDUCATION": [
+            "educatie", "education", "learning", "museum educator", "outreach",
+            "public programs", "docent", "teacher", "rondleiding", "guide"
+        ],
+        "GOVERNANCE": [
+            "bestuur", "board", "governance", "trustee", "raad", "council",
+            "advisory", "commissie", "committee"
+        ],
+        "LEADERSHIP": [
+            "director", "directeur", "manager", "head of", "hoofd", "chief",
+            "CEO", "president", "leider", "leadership"
+        ],
+        "RESEARCH": [
+            "onderzoek", "research", "researcher", "wetenschapper", "scientist",
+            "academic", "scholar", "fellow", "postdoc", "PhD"
+        ],
+        "TECHNICAL": [
+            "technical", "technisch", "facilities", "installation", "AV",
+            "audiovisual", "lighting", "security", "beveiliging"
+        ],
+        "SUPPORT": [
+            "support", "admin", "administratie", "office", "HR", "finance",
+            "marketing", "communications", "front desk", "visitor services"
+        ],
+        "CREATIVE": [
+            "design", "ontwerp", "creative", "graphic", "exhibition design",
+            "multimedia", "artist", "kunstenaar", "visual"
+        ],
+        "EXTERNAL": [
+            "volunteer", "vrijwilliger", "intern", "stagiair", "consultant",
+            "advisor", "external", "contractor", "freelance"
+        ],
+    }
+
+
+# Lazy-loaded module-level caches (populated on first access).
+# NOTE: not thread-safe, but worst case is redundant recomputation of an
+# idempotent mapping — both accessors below tolerate a benign race.
+_CUSTODIAN_TYPE_MAPPING: dict[str, str] | None = None
+_ROLE_CATEGORY_KEYWORDS: dict[str, list[str]] | None = None
+
+
+def get_custodian_type_to_heritage_code() -> dict[str, str]:
+    """Get cached custodian type to heritage code mapping."""
+    global _CUSTODIAN_TYPE_MAPPING
+    if _CUSTODIAN_TYPE_MAPPING is None:
+        _CUSTODIAN_TYPE_MAPPING = _get_custodian_type_mapping()
+    return _CUSTODIAN_TYPE_MAPPING
+
+
+def get_role_category_keywords() -> dict[str, list[str]]:
+    """Get cached role category keywords."""
+    global _ROLE_CATEGORY_KEYWORDS
+    if _ROLE_CATEGORY_KEYWORDS is None:
+        _ROLE_CATEGORY_KEYWORDS = _get_role_category_keywords()
+    return _ROLE_CATEGORY_KEYWORDS
+
+
+def build_schema_aware_person_filter(
+    heritage_type_code: str | None = None,
+    heritage_relevant_only: bool = False,
+    custodian_slug: str | None = None,
+    only_wcms: bool = False,
+) -> dict[str, Any] | None:
+    """Build Qdrant filter conditions for schema-aware person search.
+
+    Args:
+        heritage_type_code: Single-letter heritage type code (M, A, L, etc.)
+        heritage_relevant_only: Only return heritage-relevant staff
+        custodian_slug: Filter by specific custodian
+        only_wcms: Only return WCMS-registered profiles (heritage sector users)
+
+    Returns:
+        Dict of filter conditions for Qdrant, or None if no filters
+    """
+    filters: dict[str, Any] = {}
+
+    # Unknown/unspecified type codes are ignored rather than filtered on
+    if heritage_type_code and heritage_type_code not in ("U", "UNKNOWN", "UNSPECIFIED"):
+        filters["heritage_type"] = heritage_type_code
+
+    if heritage_relevant_only:
+        filters["heritage_relevant"] = True
+
+    if custodian_slug:
+        filters["custodian_slug"] = custodian_slug
+
+    if only_wcms:
+        filters["has_wcms"] = True
+
+    return filters if filters else None
+
+
+def filter_by_role_category_keywords(
+    results: list["RetrievedPerson"],
+    role_category: str | None,
+) -> list["RetrievedPerson"]:
+    """Post-filter search results by role category using headline keywords.
+
+    Since role_category is not indexed in Qdrant, we use headline keyword matching
+    to filter results after vector search.
+
+    Args:
+        results: List of RetrievedPerson from vector search
+        role_category: Target role category (CURATORIAL, ARCHIVAL, etc.)
+
+    Returns:
+        Filtered list of RetrievedPerson matching the role category.
+        If filtering would remove every result, the ORIGINAL list is
+        returned unchanged (better too many results than none).
+    """
+    if not role_category or role_category in ("UNKNOWN", "UNSPECIFIED"):
+        return results
+
+    keywords = get_role_category_keywords().get(role_category, [])
+    if not keywords:
+        return results
+
+    filtered = []
+    for person in results:
+        headline = (person.headline or "").lower()
+        # Check if any keyword matches the headline (case-insensitive substring)
+        if any(kw.lower() in headline for kw in keywords):
+            filtered.append(person)
+
+    # If filtering removed all results, return original (don't be too strict)
+    if not filtered:
+        logger.info(f"Role category filter '{role_category}' removed all results, returning unfiltered")
+        return results
+
+    logger.info(f"Role category filter '{role_category}' reduced results from {len(results)} to {len(filtered)}")
+    return filtered
+
+
+def get_heritage_type_code(custodian_type: str | None) -> str | None:
+    """Convert CustodianPrimaryTypeEnum value to single-letter heritage code.
+
+    Args:
+        custodian_type: Custodian type from DSPy router (e.g., "MUSEUM", "ARCHIVE")
+
+    Returns:
+        Single-letter heritage code (e.g., "M", "A") or None if not mappable
+    """
+    if not custodian_type or custodian_type in ("UNKNOWN", "UNSPECIFIED"):
+        return None
+    return get_custodian_type_to_heritage_code().get(custodian_type)
+
+
+class SPARQLClient:
+    """Client for querying Oxigraph SPARQL endpoint."""
+
+    def __init__(
+        self,
+        endpoint: str = DEFAULT_SPARQL_ENDPOINT,
+        timeout: float = DEFAULT_SPARQL_TIMEOUT,
+        max_connections: int = 20  # Allow concurrent connections for parallel queries
+    ):
+        self.endpoint = endpoint
+        self.timeout = timeout
+        self.max_connections = max_connections
+        # HTTP client is created lazily on first use (see `client` property)
+        self._client: httpx.Client | None = None
+
+    @property
+    def client(self) -> httpx.Client:
+        """Lazy-initialize HTTP client with connection pooling."""
+        if self._client is None:
+            # Configure connection pool for parallel SPARQL queries
+            limits = httpx.Limits(
+                max_keepalive_connections=self.max_connections,
+
max_connections=self.max_connections,
+                keepalive_expiry=30.0  # Keep connections alive for reuse
+            )
+            self._client = httpx.Client(
+                timeout=self.timeout,
+                limits=limits,
+                http2=False  # HTTP/1.1 is often faster for small queries
+            )
+        return self._client
+
+    def query(self, sparql: str, log_timing: bool = False) -> list[dict[str, Any]]:
+        """Execute SPARQL query and return results.
+
+        The shared SPARQL_PREFIXES block is prepended to every query, and
+        results are requested as application/sparql-results+json.
+
+        Args:
+            sparql: SPARQL query string
+            log_timing: Whether to log query execution time
+
+        Returns:
+            List of result bindings as dictionaries (binding values flattened
+            to plain strings). Returns an EMPTY list on any HTTP or parsing
+            error — callers cannot distinguish "no results" from "failed".
+        """
+        full_query = SPARQL_PREFIXES + sparql
+        start_time = time.time() if log_timing else 0
+
+        try:
+            response = self.client.post(
+                self.endpoint,
+                data={"query": full_query},
+                headers={"Accept": "application/sparql-results+json"}
+            )
+            response.raise_for_status()
+
+            data = response.json()
+            bindings = data.get("results", {}).get("bindings", [])
+
+            # Convert bindings to simple dicts: keep only the "value" field,
+            # dropping the datatype/language metadata of each RDF term.
+            results = []
+            for binding in bindings:
+                row = {}
+                for key, value in binding.items():
+                    row[key] = value.get("value", "")
+                results.append(row)
+
+            if log_timing:
+                duration_ms = (time.time() - start_time) * 1000
+                logger.debug(f"SPARQL query completed: {len(results)} results in {duration_ms:.0f}ms")
+
+            return results
+
+        except httpx.HTTPError as e:
+            # Network/protocol failures are logged and swallowed by design
+            logger.error(f"SPARQL query failed: {e}")
+            return []
+        except Exception as e:
+            logger.error(f"Unexpected error in SPARQL query: {e}")
+            return []
+
+    def close(self) -> None:
+        """Close the HTTP client."""
+        if self._client:
+            self._client.close()
+            self._client = None
+
+
+class HybridRetriever:
+    """Hybrid retriever combining vector search with knowledge graph expansion.
+
+    The retrieval process:
+    1. Vector search finds semantically similar institutions
+    2. For each result, SPARQL expands to find related institutions:
+       - Institutions in the same city
+       - Institutions of the same type
+       - Institutions with related collections
+    3.
Results are re-ranked based on combined vector + graph scores + + Embedding Models: + - If OpenAI API key is available AND collection uses 1536-dim vectors: use OpenAI + - Otherwise: use sentence-transformers (all-MiniLM-L6-v2, 384-dim) + + Multi-Embedding Support: + Set use_multi_embedding=True to enable support for multiple embedding models + via Qdrant's named vectors feature. This allows: + - A/B testing different embedding models + - Seamless migration between models + - Specifying which model to use per query + + Args: + qdrant_host: Qdrant server hostname + qdrant_port: Qdrant REST API port + sparql_endpoint: Oxigraph SPARQL endpoint URL + vector_weight: Weight for vector similarity scores (0-1) + graph_weight: Weight for graph expansion scores (0-1) + collection_name: Qdrant collection name + embedding_model: Embedding model name (auto-detected if not specified) + k_vector: Number of initial vector search results + k_expand: Number of graph expansion results per seed + k_final: Final number of results to return + use_multi_embedding: Enable multi-embedding mode with named vectors + preferred_embedding_model: Preferred model for multi-embedding mode + """ + + # Class-level type annotations for instance attributes + qdrant_host: str + qdrant_port: int + sparql_endpoint: str + vector_weight: float + graph_weight: float + collection_name: str + k_vector: int + k_expand: int + k_final: int + openai_api_key: str | None + use_production_qdrant: bool + use_multi_embedding: bool + preferred_embedding_model: str | None + sparql_client: "SPARQLClient" + embedding_model: str + + # Private attributes with lazy initialization + _qdrant_client: "QdrantClient | None" + _openai_client: "OpenAI | None" + _st_model: "SentenceTransformer | None" + _use_sentence_transformers: bool + _collection_vector_size: int | None + _multi_retriever: "MultiEmbeddingRetriever | None" + _selected_multi_model: "EmbeddingModel | None" + + def __init__( + self, + qdrant_host: str = "localhost", 
+ qdrant_port: int = 6333, + sparql_endpoint: str = DEFAULT_SPARQL_ENDPOINT, + vector_weight: float = 0.7, + graph_weight: float = 0.3, + collection_name: str = "heritage_custodians", + embedding_model: str | None = None, # Auto-detect if None + k_vector: int = 10, + k_expand: int = 5, + k_final: int = 10, + openai_api_key: str | None = None, + use_production_qdrant: bool = False, + use_multi_embedding: bool = False, + preferred_embedding_model: str | None = None, + ): + self.qdrant_host = qdrant_host + self.qdrant_port = qdrant_port + self.sparql_endpoint = sparql_endpoint + self.vector_weight = vector_weight + self.graph_weight = graph_weight + self.collection_name = collection_name + self.k_vector = k_vector + self.k_expand = k_expand + self.k_final = k_final + self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY") + self.use_production_qdrant = use_production_qdrant + self.use_multi_embedding = use_multi_embedding + self.preferred_embedding_model = preferred_embedding_model + + # Initialize SPARQL client + self.sparql_client = SPARQLClient(endpoint=sparql_endpoint) + + # Lazy-load Qdrant, OpenAI, and sentence-transformers clients + self._qdrant_client = None + self._openai_client = None + self._st_model = None + self._use_sentence_transformers = False + self._collection_vector_size: int | None = None + + # Multi-embedding retriever (lazy-loaded) + self._multi_retriever = None + + # Currently selected multi-embedding model (for multi-embedding mode) + self._selected_multi_model = None + + # Determine embedding model to use + self.embedding_model = embedding_model or self._auto_detect_embedding_model() + + logger.info( + f"Initialized HybridRetriever: " + f"Qdrant={qdrant_host}:{qdrant_port}, " + f"SPARQL={sparql_endpoint}, " + f"embedding_model={self.embedding_model}, " + f"multi_embedding={use_multi_embedding}, " + f"weights=vector:{vector_weight}/graph:{graph_weight}" + ) + + @property + def qdrant_client(self) -> "QdrantClient": + """Lazy-load 
Qdrant client.""" + if self._qdrant_client is None: + from qdrant_client import QdrantClient + + if self.use_production_qdrant: + # Connect via HTTPS to production + self._qdrant_client = QdrantClient( + host="bronhouder.nl", + port=443, + https=True, + prefix="qdrant", + prefer_grpc=False, + timeout=30 + ) + else: + self._qdrant_client = QdrantClient( + host=self.qdrant_host, + port=self.qdrant_port + ) + return self._qdrant_client + + @property + def openai_client(self) -> "OpenAI": + """Lazy-load OpenAI client.""" + if self._openai_client is None: + if not self.openai_api_key: + raise RuntimeError( + "OpenAI API key not available. Set OPENAI_API_KEY or use sentence-transformers." + ) + import openai + self._openai_client = openai.OpenAI(api_key=self.openai_api_key) + return self._openai_client + + def _get_collection_vector_size(self) -> int | None: + """Get the vector size of the Qdrant collection.""" + try: + info = self.qdrant_client.get_collection(self.collection_name) + if hasattr(info.config.params, 'vectors'): + vectors_config = info.config.params.vectors + if isinstance(vectors_config, dict): + # Named vectors + first_config = next(iter(vectors_config.values()), None) + return first_config.size if first_config else None + elif vectors_config is not None: + # Single vector config + return vectors_config.size + return None + except Exception as e: + logger.warning(f"Could not get collection vector size: {e}") + return None + + def _auto_detect_embedding_model(self) -> str: + """Auto-detect which embedding model to use based on collection and available APIs. + + Detection priority: + 1. Check main collection (heritage_custodians) vector size + 2. If main collection doesn't exist, check heritage_persons collection + 3. If OpenAI key available and collection uses 1536-dim, use OpenAI + 4. 
Otherwise use sentence-transformers (384-dim, all-MiniLM-L6-v2) + """ + # Check main collection vector size first + vector_size = self._get_collection_vector_size() + self._collection_vector_size = vector_size + + # If main collection doesn't exist, try heritage_persons collection + if vector_size is None: + logger.info(f"Collection '{self.collection_name}' not found, checking heritage_persons") + person_vector_size = self._get_person_collection_vector_size() + if person_vector_size: + vector_size = person_vector_size + logger.info(f"Using heritage_persons collection vector size: {vector_size}") + + if vector_size == 384: + # Collection uses sentence-transformers dimensions + self._use_sentence_transformers = True + logger.info("Auto-detected 384-dim vectors, using sentence-transformers") + return "all-MiniLM-L6-v2" + elif vector_size == 1536 and self.openai_api_key: + # Collection uses OpenAI dimensions and we have API key + self._use_sentence_transformers = False + logger.info("Auto-detected 1536-dim vectors with OpenAI key, using OpenAI") + return "text-embedding-3-small" + elif self.openai_api_key: + # Default to OpenAI if we have key + self._use_sentence_transformers = False + return "text-embedding-3-small" + else: + # Fallback to sentence-transformers + self._use_sentence_transformers = True + logger.info("No OpenAI key, falling back to sentence-transformers") + return "all-MiniLM-L6-v2" + + def _load_sentence_transformer(self) -> "SentenceTransformer": + """Lazy-load sentence-transformers model.""" + if self._st_model is None: + try: + from sentence_transformers import SentenceTransformer + self._st_model = SentenceTransformer(self.embedding_model) + logger.info(f"Loaded sentence-transformers model: {self.embedding_model}") + except ImportError: + raise RuntimeError("sentence-transformers not installed. 
Run: pip install sentence-transformers") + return self._st_model + + @property + def multi_retriever(self) -> "MultiEmbeddingRetriever | None": + """Lazy-load MultiEmbeddingRetriever when multi-embedding mode is enabled. + + Returns: + MultiEmbeddingRetriever instance or None if not in multi-embedding mode + """ + if not self.use_multi_embedding: + return None + + if self._multi_retriever is None: + from glam_extractor.api.multi_embedding_retriever import ( + MultiEmbeddingRetriever, + MultiEmbeddingConfig, + EmbeddingModel, + ) + + # Create config matching current settings + config = MultiEmbeddingConfig( + qdrant_host=self.qdrant_host, + qdrant_port=self.qdrant_port, + qdrant_https=self.use_production_qdrant, + qdrant_prefix="qdrant" if self.use_production_qdrant else None, + openai_api_key=self.openai_api_key, + institutions_collection=self.collection_name, + ) + + self._multi_retriever = MultiEmbeddingRetriever(config) + + # Auto-select model if not specified + if self.preferred_embedding_model: + try: + self._selected_multi_model = EmbeddingModel(self.preferred_embedding_model) + except ValueError: + logger.warning(f"Unknown embedding model: {self.preferred_embedding_model}") + assert self._multi_retriever is not None # Set above + self._selected_multi_model = self._multi_retriever.select_model(self.collection_name) + else: + assert self._multi_retriever is not None # Set above + self._selected_multi_model = self._multi_retriever.select_model(self.collection_name) + + logger.info(f"MultiEmbeddingRetriever initialized, selected model: {self._selected_multi_model}") + + return self._multi_retriever + + def _get_embedding(self, text: str, using: str | None = None) -> list[float]: + """Get embedding vector for text using the appropriate model. 
+ + Args: + text: Text to embed + using: Optional embedding model name (for multi-embedding mode) + + Returns: + Embedding vector as list of floats + """ + # If multi-embedding mode, delegate to MultiEmbeddingRetriever + if self.use_multi_embedding and self.multi_retriever: + from glam_extractor.api.multi_embedding_retriever import EmbeddingModel + + # Determine which model to use + if using: + try: + model = EmbeddingModel(using) + except ValueError: + logger.warning(f"Unknown model '{using}', using default") + model = self._selected_multi_model + else: + model = self._selected_multi_model + + if model: + return self.multi_retriever.get_embedding(text, model) + else: + # Fallback to legacy mode + logger.warning("No multi-embedding model available, falling back to legacy") + + # Legacy single-model embedding + if self._use_sentence_transformers: + model = self._load_sentence_transformer() + embedding = model.encode(text) + return embedding.tolist() + else: + response = self.openai_client.embeddings.create( + input=text, + model=self.embedding_model + ) + return response.data[0].embedding + + def _vector_search( + self, + query: str, + k: int, + using: str | None = None, + region_codes: list[str] | None = None, + cities: list[str] | None = None, + institution_types: list[str] | None = None, + use_polygon_filter: bool = True, + ) -> list[RetrievedInstitution]: + """Perform vector similarity search in Qdrant. 
+ + Args: + query: Search query text + k: Number of results to retrieve + using: Optional embedding model name (for multi-embedding mode) + region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"]) + cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"]) + institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"]) + use_polygon_filter: If True, apply polygon-based geographic filtering + using actual province boundaries (default: True) + + Returns: + List of RetrievedInstitution with vector scores + """ + query_vector = self._get_embedding(query, using=using) + + # When polygon filtering is enabled and regions are specified, + # over-fetch to ensure we have enough results after polygon filtering + effective_limit = k + if use_polygon_filter and region_codes: + effective_limit = k * 3 # Over-fetch 3x for polygon filtering + logger.debug(f"Over-fetching {effective_limit} results for polygon filtering") + + # Build query parameters + search_params = { + "collection_name": self.collection_name, + "query": query_vector, + "limit": effective_limit, + "with_payload": True, + } + + # Build geographic/type filter if any criteria provided + # NOTE: Always apply region metadata filter to Qdrant first to get relevant results. + # The polygon filter (if enabled) is an additional precision filter applied afterward. + # Previously we disabled metadata region filter when polygon filter was enabled, + # but this caused vector search to return results from wrong regions. 
+ if region_codes or cities or institution_types: + from glam_extractor.ontology.qdrant_filters import QdrantFilterBuilder + + # Convert institution types from full names (LIBRARY, MUSEUM) to single-letter codes (L, M) + # because Qdrant stores institution_type as single-letter codes per GLAMORCUBESFIXPHDNT + type_codes = None + if institution_types: + type_mapping = get_custodian_type_to_heritage_code() + type_codes = [type_mapping.get(t, t) for t in institution_types] + # Filter out any that didn't map (keep original if 1 char already) + type_codes = [c for c in type_codes if c and len(c) == 1] + logger.debug(f"Converted institution types: {institution_types} -> {type_codes}") + + builder = QdrantFilterBuilder() + filter_dict = builder.combined_filter( + primary_types=type_codes, # Use single-letter codes + region_codes=region_codes, # Always apply region filter to get relevant results + cities=cities, + combine_mode="must", + ) + if filter_dict: + query_filter = QdrantFilterBuilder.to_qdrant_models(filter_dict) + search_params["query_filter"] = query_filter + logger.info( + f"Applied Qdrant filter: types={type_codes}, " + f"regions={region_codes}, cities={cities}" + ) + + # Add named vector 'using' ONLY if collection actually has named vectors + # Single-vector collections will error with "Not existing vector name" otherwise + if self.use_multi_embedding and self.multi_retriever: + uses_named = self.multi_retriever.uses_named_vectors(self.collection_name) + if uses_named: + if using: + search_params["using"] = using + elif self._selected_multi_model: + search_params["using"] = self._selected_multi_model.value + # else: single-vector collection, don't add 'using' parameter + + results = self.qdrant_client.query_points(**search_params) + + institutions = [] + for point in results.points: + payload = point.payload or {} + + inst = RetrievedInstitution( + ghcid=payload.get("ghcid", ""), + name=payload.get("name", ""), + uri=payload.get("uri", 
f"https://nde.nl/ontology/hc/custodian/{payload.get('ghcid', '')}"), + vector_score=point.score, + institution_type=payload.get("institution_type"), + country=payload.get("country"), + city=payload.get("city"), + description=payload.get("text", "")[:200] if payload.get("text") else None, + latitude=payload.get("latitude"), + longitude=payload.get("longitude"), + ) + institutions.append(inst) + + # Apply polygon-based geographic filtering if enabled and regions specified + if use_polygon_filter and region_codes and institutions: + institutions = self._apply_polygon_filter(institutions, region_codes, k) + + return institutions + + def _apply_polygon_filter( + self, + institutions: list[RetrievedInstitution], + region_codes: list[str], + k: int, + ) -> list[RetrievedInstitution]: + """Filter institutions by polygon containment in specified regions. + + Uses actual province boundary polygons to ensure results are + geographically within the requested regions, not just metadata matching. + + Args: + institutions: List of retrieved institutions with lat/lon + region_codes: List of ISO 3166-2 region codes (e.g., ["NH", "ZH"]) + k: Maximum number of results to return + + Returns: + Filtered list of institutions within the specified regions + """ + polygon_filter = get_polygon_filter() + + # Handle case where polygon filter module is not available or not loaded + if polygon_filter is None: + logger.warning("Polygon filter not available, skipping geographic filtering") + return institutions[:k] + + if not polygon_filter.is_loaded: + logger.warning("Polygon filter not loaded, skipping geographic filtering") + return institutions[:k] + + filtered = [] + for inst in institutions: + if inst.latitude is None or inst.longitude is None: + # No coordinates, check if metadata region matches + if inst.country == "NL": + # For Dutch institutions without coords, fallback to metadata + # Extract region from GHCID (format: NL-{REGION}-...) 
+ if inst.ghcid and len(inst.ghcid) > 3: + ghcid_region = inst.ghcid.split("-")[1] if "-" in inst.ghcid else None + if ghcid_region and ghcid_region.upper() in [r.upper() for r in region_codes]: + filtered.append(inst) + continue + + # Check if point is within any of the requested regions + for region_code in region_codes: + if polygon_filter.point_in_province(inst.latitude, inst.longitude, region_code): + filtered.append(inst) + break # Don't add same institution multiple times + + logger.info( + f"Polygon filter: {len(filtered)}/{len(institutions)} institutions " + f"in regions {region_codes}" + ) + + # Return up to k results + return filtered[:k] + + def _build_batched_expansion_query( + self, + seed_institutions: list[RetrievedInstitution], + exclude_ghcids: set[str], + limit_per_expansion: int = 5 + ) -> tuple[str, dict[str, dict]]: + """Build a single SPARQL query with UNION clauses for all expansions. + + DEDUPLICATES by city code and type+country to avoid redundant query patterns. + For example, if 5 seeds are all from Amsterdam with type MUSEUM, we only + create ONE city expansion (for AMS) and ONE type expansion (for NL + M), + not 10 redundant UNIONs. 
+ + Args: + seed_institutions: Seed institutions to expand from + exclude_ghcids: GHCIDs to exclude from results + limit_per_expansion: Max results per expansion type + + Returns: + Tuple of (SPARQL query string, expansion_metadata dict) + expansion_metadata maps expansion_key -> {seed, type, city/type_code} + """ + unions = [] + expansion_metadata = {} + + # Track unique patterns to avoid duplicate queries + seen_city_codes: set[str] = set() + seen_type_patterns: set[str] = set() # "country-type_code" pattern + + seeds_to_expand = seed_institutions[:5] + city_idx = 0 + type_idx = 0 + + for seed in seeds_to_expand: + # City expansion - deduplicate by city code + if seed.city: + city_code = seed.city[:3].upper() + if city_code not in seen_city_codes: + seen_city_codes.add(city_code) + expansion_key = f"city_{city_idx}" + city_idx += 1 + unions.append(f""" + {{ + SELECT ?s ?name ?ghcid ?type ("{expansion_key}" AS ?expansion_key) WHERE {{ + ?s a hcc:Custodian ; + skos:prefLabel ?name ; + hc:ghcid ?ghcid . + FILTER(CONTAINS(?ghcid, "-{city_code}-")) + OPTIONAL {{ ?s hc:institutionType ?type }} + }} + LIMIT {limit_per_expansion + len(exclude_ghcids)} + }} + """) + expansion_metadata[expansion_key] = { + "seed": seed, + "type": "city", + "city": seed.city, + "city_code": city_code + } + + # Type expansion - deduplicate by country + type_code pattern + if seed.institution_type and seed.country: + type_code = get_custodian_type_to_heritage_code().get(seed.institution_type, "") + if type_code: + pattern_key = f"{seed.country}-{type_code}" + if pattern_key not in seen_type_patterns: + seen_type_patterns.add(pattern_key) + expansion_key = f"type_{type_idx}" + type_idx += 1 + unions.append(f""" + {{ + SELECT ?s ?name ?ghcid ?city ("{expansion_key}" AS ?expansion_key) WHERE {{ + ?s a hcc:Custodian ; + skos:prefLabel ?name ; + hc:ghcid ?ghcid . 
+ FILTER(STRSTARTS(?ghcid, "{seed.country}-")) + FILTER(CONTAINS(?ghcid, "-{type_code}-")) + OPTIONAL {{ ?s schema:location ?city }} + }} + LIMIT {limit_per_expansion + len(exclude_ghcids)} + }} + """) + expansion_metadata[expansion_key] = { + "seed": seed, + "type": "type", + "institution_type": seed.institution_type, + "type_code": type_code, + "country": seed.country + } + + if not unions: + return "", {} + + # Log deduplication stats + logger.info(f"Batched SPARQL: {len(unions)} UNIONs (deduplicated from max {len(seeds_to_expand) * 2}). " + f"Unique cities: {seen_city_codes}, Unique types: {seen_type_patterns}") + + # Combine all unions into a single query + query = f""" + SELECT ?s ?name ?ghcid ?type ?city ?expansion_key WHERE {{ + {" UNION ".join(unions)} + }} + """ + + return query, expansion_metadata + + def _graph_expand_batched( + self, + seed_institutions: list[RetrievedInstitution] + ) -> list[RetrievedInstitution]: + """Expand seed results using a SINGLE batched SPARQL query. + + This is a significant optimization over the parallel ThreadPoolExecutor + approach. Instead of 10 HTTP requests (even in parallel), we execute + ONE SPARQL query with UNION clauses. 
+ + Performance comparison: + - Sequential: 10 queries × ~100ms = 4+ seconds + - Parallel (ThreadPool): ~500ms-1s (limited by GIL/connection pool) + - Batched (this method): ONE query ~150-300ms + + Args: + seed_institutions: Initial vector search results + + Returns: + Additional institutions found via graph expansion + """ + start_time = time.time() + exclude_ghcids = {inst.ghcid for inst in seed_institutions} + expanded = [] + seen_ghcids = set(exclude_ghcids) + + # Build batched query + query, expansion_metadata = self._build_batched_expansion_query( + seed_institutions, exclude_ghcids, limit_per_expansion=self.k_expand + ) + + if not query: + logger.debug("No graph expansion tasks to execute") + return expanded + + # Execute single batched query + query_start = time.time() + results = self.sparql_client.query(query) + query_duration = (time.time() - query_start) * 1000 + + logger.debug(f"Batched SPARQL query: {len(results)} raw results in {query_duration:.0f}ms") + + # Group results by expansion_key + results_by_expansion: dict[str, list[dict]] = {} + for row in results: + exp_key = row.get("expansion_key", "") + if exp_key: + if exp_key not in results_by_expansion: + results_by_expansion[exp_key] = [] + results_by_expansion[exp_key].append(row) + + # Process results, filtering and creating RetrievedInstitution objects + for exp_key, rows in results_by_expansion.items(): + if exp_key not in expansion_metadata: + continue + + meta = expansion_metadata[exp_key] + seed = meta["seed"] + exp_type = meta["type"] + + count = 0 + for row in rows: + ghcid = row.get("ghcid", "") + if not ghcid or ghcid in seen_ghcids: + continue + + if count >= self.k_expand: + break + + seen_ghcids.add(ghcid) + count += 1 + + if exp_type == "city": + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.8, # High score for same city + institution_type=row.get("type"), + expansion_reason="same_city", + 
related_institutions=[seed.ghcid] + )) + elif exp_type == "type": + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.5, # Medium score for same type + institution_type=seed.institution_type, + city=row.get("city"), + expansion_reason="same_type", + related_institutions=[seed.ghcid] + )) + + logger.debug(f"Expansion {exp_key}: {count} results for {seed.ghcid}") + + total_time = (time.time() - start_time) * 1000 + logger.info(f"Graph expansion (batched): 1 query, {len(results)} raw results, " + f"{len(expanded)} expanded in {total_time:.0f}ms") + + return expanded + + def _expand_by_city(self, city: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]: + """Find other institutions in the same city via SPARQL. + + Note: This method is kept for backwards compatibility and direct calls. + For batch operations, use _graph_expand_batched() instead. + + Args: + city: City name to search for + exclude_ghcids: GHCIDs to exclude from results + limit: Maximum number of results + + Returns: + List of institution data dicts + """ + if not city: + return [] + + query = f""" + SELECT ?s ?name ?ghcid ?type WHERE {{ + ?s a hcc:Custodian ; + skos:prefLabel ?name ; + hc:ghcid ?ghcid . + + # Match city in GHCID (format: CC-RR-CCC-T-ABBR) + FILTER(CONTAINS(?ghcid, "-{city[:3].upper()}-")) + + OPTIONAL {{ ?s hc:institutionType ?type }} + }} + LIMIT {limit + len(exclude_ghcids)} + """ + + results = self.sparql_client.query(query) + + # Filter out excluded GHCIDs + filtered = [] + for row in results: + ghcid = row.get("ghcid", "") + if ghcid not in exclude_ghcids: + filtered.append(row) + if len(filtered) >= limit: + break + + return filtered + + def _expand_by_type(self, institution_type: str, country: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]: + """Find other institutions of the same type in the same country. + + Args: + institution_type: Institution type (MUSEUM, LIBRARY, etc.) 
+ country: Country code (ISO 3166-1 alpha-2) + exclude_ghcids: GHCIDs to exclude + limit: Maximum number of results + + Returns: + List of institution data dicts + """ + if not institution_type: + return [] + + # Map institution type to GHCID type code using dynamic schema mapping + type_code = get_custodian_type_to_heritage_code().get(institution_type, "") + + if not type_code or not country: + return [] + + query = f""" + SELECT ?s ?name ?ghcid ?city WHERE {{ + ?s a hcc:Custodian ; + skos:prefLabel ?name ; + hc:ghcid ?ghcid . + + # Match country and type in GHCID + FILTER(STRSTARTS(?ghcid, "{country}-")) + FILTER(CONTAINS(?ghcid, "-{type_code}-")) + + OPTIONAL {{ ?s schema:location ?city }} + }} + LIMIT {limit + len(exclude_ghcids)} + """ + + results = self.sparql_client.query(query) + + filtered = [] + for row in results: + ghcid = row.get("ghcid", "") + if ghcid not in exclude_ghcids: + filtered.append(row) + if len(filtered) >= limit: + break + + return filtered + + def _expand_by_wikidata_country(self, wikidata_country: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]: + """Find institutions in the same country using Wikidata P17 property. + + Args: + wikidata_country: Wikidata entity ID for country (e.g., Q55 for Netherlands) + exclude_ghcids: GHCIDs to exclude + limit: Maximum number of results + + Returns: + List of institution data dicts + """ + if not wikidata_country: + return [] + + query = f""" + SELECT ?s ?name ?ghcid ?type WHERE {{ + ?s a hcc:Custodian ; + skos:prefLabel ?name ; + hc:ghcid ?ghcid ; + wdt:P17 wd:{wikidata_country} . 
+ + OPTIONAL {{ ?s hc:institutionType ?type }} + }} + LIMIT {limit + len(exclude_ghcids)} + """ + + results = self.sparql_client.query(query) + + filtered = [] + for row in results: + ghcid = row.get("ghcid", "") + if ghcid not in exclude_ghcids: + filtered.append(row) + if len(filtered) >= limit: + break + + return filtered + + def _graph_expand( + self, + seed_institutions: list[RetrievedInstitution], + use_batched: bool = True + ) -> list[RetrievedInstitution]: + """Expand seed results using knowledge graph relationships. + + By default uses batched SPARQL (single query with UNION) for best performance. + Falls back to parallel ThreadPoolExecutor if batched fails. + + Performance comparison: + - Sequential: 10 queries × ~100ms = 4+ seconds + - Parallel (ThreadPool): ~500ms-3s (limited by GIL/connection pool) + - Batched (UNION query): ONE query ~150-300ms ← DEFAULT + + Args: + seed_institutions: Initial vector search results + use_batched: If True (default), use batched SPARQL query. + If False, use parallel ThreadPoolExecutor. + + Returns: + Additional institutions found via graph expansion + """ + if use_batched: + try: + return self._graph_expand_batched(seed_institutions) + except Exception as e: + logger.warning(f"Batched graph expansion failed, falling back to parallel: {e}") + # Fall through to parallel implementation + + return self._graph_expand_parallel(seed_institutions) + + def _graph_expand_parallel( + self, + seed_institutions: list[RetrievedInstitution] + ) -> list[RetrievedInstitution]: + """Expand seed results using parallel SPARQL queries (fallback method). + + Uses ThreadPoolExecutor to parallelize SPARQL queries. This is slower than + the batched approach but serves as a fallback. 
+ + Args: + seed_institutions: Initial vector search results + + Returns: + Additional institutions found via graph expansion + """ + start_time = time.time() + exclude_ghcids = {inst.ghcid for inst in seed_institutions} + expanded = [] + seen_ghcids = set(exclude_ghcids) + + # Prepare all expansion tasks + # Each task is a tuple: (task_type, seed, query_params) + tasks = [] + seeds_to_expand = seed_institutions[:5] # Expand top 5 seeds + + for seed in seeds_to_expand: + # City expansion task + if seed.city: + tasks.append(("city", seed, {"city": seed.city})) + + # Type expansion task + if seed.institution_type and seed.country: + tasks.append(("type", seed, { + "institution_type": seed.institution_type, + "country": seed.country + })) + + if not tasks: + logger.debug("No graph expansion tasks to execute") + return expanded + + # Execute SPARQL queries in parallel + # Use min(10, len(tasks)) workers to avoid over-parallelization + max_workers = min(10, len(tasks)) + + def execute_expansion(task): + """Execute a single expansion task and return results with metadata.""" + task_type, seed, params = task + task_start = time.time() + + try: + if task_type == "city": + results = self._expand_by_city( + params["city"], exclude_ghcids, limit=self.k_expand + ) + return { + "task_type": task_type, + "seed": seed, + "results": results, + "duration_ms": (time.time() - task_start) * 1000 + } + elif task_type == "type": + results = self._expand_by_type( + params["institution_type"], + params["country"], + exclude_ghcids, + limit=self.k_expand + ) + return { + "task_type": task_type, + "seed": seed, + "results": results, + "duration_ms": (time.time() - task_start) * 1000 + } + except Exception as e: + logger.warning(f"Graph expansion task failed: {task_type} for {seed.ghcid}: {e}") + return { + "task_type": task_type, + "seed": seed, + "results": [], + "duration_ms": (time.time() - task_start) * 1000, + "error": str(e) + } + + # Run all tasks in parallel + with 
ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(execute_expansion, task): task for task in tasks} + + for future in as_completed(futures): + result = future.result() + if result is None: + continue + + task_type = result["task_type"] + seed = result["seed"] + rows = result["results"] + duration = result.get("duration_ms", 0) + + logger.debug(f"Graph expansion {task_type} for {seed.ghcid}: " + f"{len(rows)} results in {duration:.0f}ms") + + # Process results based on task type + if task_type == "city": + for row in rows: + ghcid = row.get("ghcid", "") + if ghcid and ghcid not in seen_ghcids: + seen_ghcids.add(ghcid) + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.8, # High score for same city + institution_type=row.get("type"), + expansion_reason="same_city", + related_institutions=[seed.ghcid] + )) + elif task_type == "type": + for row in rows: + ghcid = row.get("ghcid", "") + if ghcid and ghcid not in seen_ghcids: + seen_ghcids.add(ghcid) + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.5, # Medium score for same type + institution_type=seed.institution_type, + city=row.get("city"), + expansion_reason="same_type", + related_institutions=[seed.ghcid] + )) + + total_time = (time.time() - start_time) * 1000 + logger.info(f"Graph expansion completed: {len(tasks)} queries, " + f"{len(expanded)} results in {total_time:.0f}ms (parallel)") + + return expanded + + def _combine_and_rank( + self, + vector_results: list[RetrievedInstitution], + graph_results: list[RetrievedInstitution], + k: int + ) -> list[RetrievedInstitution]: + """Combine vector and graph results with weighted scoring and graph inheritance. + + This method implements a hybrid scoring approach: + 1. Direct merge: If a graph result matches a vector result (same GHCID), + the graph_score is directly applied + 2. 
Graph inheritance: Vector results inherit a portion of graph scores from + related institutions found via graph expansion (same city/type) + + Args: + vector_results: Results from vector search + graph_results: Results from graph expansion + k: Number of final results + + Returns: + Combined and ranked results + """ + # Debug logging for investigation + logger.debug(f"Combining {len(vector_results)} vector + {len(graph_results)} graph results") + + # Create lookup by GHCID for merging + results_by_ghcid: dict[str, RetrievedInstitution] = {} + + # Track which vector GHCIDs we have for inheritance + vector_ghcids = set() + + # Add vector results + for inst in vector_results: + if inst.ghcid: + results_by_ghcid[inst.ghcid] = inst + vector_ghcids.add(inst.ghcid) + logger.debug(f" Vector: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) " + f"v={inst.vector_score:.3f} g={inst.graph_score:.3f}") + + # Track direct merges and inheritance candidates + direct_merges = 0 + inheritance_boosts = [] + + # Merge graph results and build inheritance map + # inheritance_map: vector_ghcid -> list of (related_ghcid, graph_score, reason) + inheritance_map: dict[str, list[tuple[str, float, str]]] = {g: [] for g in vector_ghcids} + + for inst in graph_results: + logger.debug(f" Graph: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) " + f"g={inst.graph_score:.3f} reason={inst.expansion_reason} " + f"related_to={inst.related_institutions}") + + if inst.ghcid in results_by_ghcid: + # Direct merge: graph result matches existing vector result + existing = results_by_ghcid[inst.ghcid] + old_graph_score = existing.graph_score + existing.graph_score = max(existing.graph_score, inst.graph_score) + existing.related_institutions.extend(inst.related_institutions) + if inst.expansion_reason: + existing.expansion_reason = inst.expansion_reason + direct_merges += 1 + logger.debug(f" -> Direct merge! 
{inst.ghcid} graph_score: {old_graph_score:.3f} -> {existing.graph_score:.3f}") + else: + # New institution from graph expansion + results_by_ghcid[inst.ghcid] = inst + + # Build inheritance: this graph result was expanded FROM a vector result + # The related_institutions field contains the seed GHCID(s) it was expanded from + for seed_ghcid in inst.related_institutions: + if seed_ghcid in inheritance_map: + inheritance_map[seed_ghcid].append( + (inst.ghcid, inst.graph_score, inst.expansion_reason or "related") + ) + + logger.debug(f"Direct merges: {direct_merges}") + + # Apply graph score inheritance to vector results + # Vector results inherit a portion of graph scores from their related institutions + INHERITANCE_FACTOR = 0.5 # Inherit 50% of related institutions' graph scores + + for vector_ghcid, related_list in inheritance_map.items(): + if related_list and vector_ghcid in results_by_ghcid: + inst = results_by_ghcid[vector_ghcid] + + # Calculate inherited score: average of related graph scores * inheritance factor + related_scores = [score for _, score, _ in related_list] + inherited_score = (sum(related_scores) / len(related_scores)) * INHERITANCE_FACTOR + + old_graph_score = inst.graph_score + # Inherit: take max of current graph_score and inherited score + inst.graph_score = max(inst.graph_score, inherited_score) + + if inst.graph_score > old_graph_score: + # Track related institutions for context + related_ghcids = [ghcid for ghcid, _, _ in related_list] + inst.related_institutions.extend(related_ghcids[:3]) # Add up to 3 related + + inheritance_boosts.append({ + "ghcid": vector_ghcid, + "name": inst.name, + "old_graph": old_graph_score, + "new_graph": inst.graph_score, + "inherited_from": len(related_list), + "reasons": list(set(r for _, _, r in related_list)) + }) + logger.debug(f" Inheritance: {vector_ghcid} graph_score: {old_graph_score:.3f} -> " + f"{inst.graph_score:.3f} (from {len(related_list)} related institutions)") + + if inheritance_boosts: + 
logger.info(f"Graph inheritance applied to {len(inheritance_boosts)} vector results: " + f"{[b['ghcid'] for b in inheritance_boosts[:3]]}...") + + # Calculate combined scores + for inst in results_by_ghcid.values(): + inst.combined_score = ( + self.vector_weight * inst.vector_score + + self.graph_weight * inst.graph_score + ) + + # Sort by combined score + ranked = sorted( + results_by_ghcid.values(), + key=lambda x: x.combined_score, + reverse=True + ) + + # Log top results for debugging + logger.debug(f"Top {min(5, len(ranked))} combined results:") + for i, inst in enumerate(ranked[:5]): + logger.debug(f" {i+1}. {inst.ghcid} ({inst.name[:25] if inst.name else '?'}...) " + f"combined={inst.combined_score:.3f} (v={inst.vector_score:.3f}, g={inst.graph_score:.3f})") + + return ranked[:k] + + def _get_person_collection_vector_size(self) -> int | None: + """Get the vector size of the person collection.""" + try: + info = self.qdrant_client.get_collection("heritage_persons") + if hasattr(info.config.params, 'vectors'): + vectors_config = info.config.params.vectors + if isinstance(vectors_config, dict): + first_config = next(iter(vectors_config.values()), None) + return first_config.size if first_config else None + elif vectors_config is not None: + return vectors_config.size # type: ignore[union-attr] + return None + except Exception as e: + logger.warning(f"Could not get person collection vector size: {e}") + return None + + def _person_vector_search( + self, + query: str, + k: int, + using: str | None = None, + filter_conditions: dict[str, Any] | None = None, + ) -> list[RetrievedPerson]: + """Perform vector similarity search in Qdrant heritage_persons collection. 
+ + Args: + query: Search query text + k: Number of results to retrieve + using: Optional embedding model name (for multi-embedding mode) + filter_conditions: Optional dict of field->value filters for Qdrant + + Returns: + List of RetrievedPerson with vector scores + """ + from qdrant_client.http import models + + # Check person collection vector size and use appropriate model + person_vector_size = self._get_person_collection_vector_size() + person_model = using + + if person_vector_size == 384 and not using: + # Person collection uses MiniLM (384-dim), override model selection + person_model = "minilm_384" + logger.info(f"Person collection uses 384-dim vectors, using MiniLM model") + elif person_vector_size == 1536 and not using: + person_model = "openai_1536" + elif person_vector_size == 768 and not using: + person_model = "bge_768" + + query_vector = self._get_embedding(query, using=person_model) + + try: + # Build query parameters + search_params: dict[str, Any] = { + "collection_name": "heritage_persons", + "query": query_vector, + "limit": k, + "with_payload": True, + } + + # Add named vector 'using' ONLY if collection actually has named vectors + # Single-vector collections will error with "Not existing vector name" otherwise + if self.use_multi_embedding and self.multi_retriever: + uses_named = self.multi_retriever.uses_named_vectors("heritage_persons") + if uses_named: + if using: + search_params["using"] = using + elif self._selected_multi_model: + search_params["using"] = self._selected_multi_model.value + # else: single-vector collection, don't add 'using' parameter + + # Add schema-aware filters if provided + if filter_conditions: + filter_list = [] + for key, value in filter_conditions.items(): + # Handle advanced match filters (e.g. 
{"email": {"match": {"text": "nos"}}}) + if isinstance(value, dict) and "match" in value: + filter_list.append( + models.FieldCondition( + key=key, + match=models.MatchText(**value["match"]) + ) + ) + else: + # Standard exact match value + filter_list.append( + models.FieldCondition( + key=key, + match=models.MatchValue(value=value), + ) + ) + + search_params["query_filter"] = models.Filter(must=filter_list) + logger.info(f"[Qdrant] Applied person filters: {filter_conditions}") + + logger.info(f"[Qdrant] Searching '{search_params['collection_name']}' with params: query_filter={filter_conditions}, limit={k}") + + results = self.qdrant_client.query_points(**search_params) + except Exception as e: + logger.warning(f"Person collection search failed: {e}") + return [] + + persons = [] + for point in results.points: + payload = point.payload or {} + + # Extract richness score from payload (indexed by index_persons_qdrant.py) + richness_score = payload.get("richness_score", 0.0) + + person = RetrievedPerson( + person_id=payload.get("staff_id", "") or hashlib.md5( + f"{payload.get('custodian_slug', '')}:{payload.get('name', '')}".encode() + ).hexdigest()[:16], + name=payload.get("name", ""), + vector_score=point.score, + richness_score=richness_score, + headline=payload.get("headline"), + custodian_name=payload.get("custodian_name"), + custodian_slug=payload.get("custodian_slug"), + location=payload.get("location"), + heritage_relevant=payload.get("heritage_relevant", False), + heritage_type=payload.get("heritage_type"), + source_type=payload.get("source_type"), + linkedin_url=payload.get("linkedin_url"), + has_wcms=payload.get("has_wcms", False), + # WCMS-specific fields + wcms_user_id=payload.get("wcms_user_id"), + wcms_abs_id=payload.get("wcms_abs_id"), + wcms_crm_id=payload.get("wcms_crm_id"), + wcms_username=payload.get("wcms_username"), + wcms_username_url=payload.get("wcms_username_url"), + wcms_status=payload.get("wcms_status"), + 
wcms_roles=payload.get("wcms_roles"), + wcms_registered_since=payload.get("wcms_registered_since"), + wcms_last_access=payload.get("wcms_last_access"), + # Contact details + email=payload.get("email"), + email_domain=payload.get("email_domain"), + ) + + # Apply richness score boosting + # Formula: combined_score = vector_score * (0.7 + 0.3 * richness_score) + # - Profiles with richness_score=0 get 70% of vector score + # - Profiles with richness_score=1 get 100% of vector score + # This ensures rich profiles rank higher than sparse ones at similar similarity + richness_boost = 0.7 + 0.3 * richness_score + person.combined_score = person.vector_score * richness_boost + + # Apply name-matching boost for queries that look like person names + # This ensures that searching for "Kitty Bogte" returns Kitty Bogte first, + # even if vector similarity ranks other Dutch names higher + if looks_like_person_name(query) and person.name: + name_boost = calculate_name_match_boost(query, person.name) + if name_boost > 1.0: + logger.debug(f"Name match boost {name_boost}x for '{person.name}' (query: '{query}')") + person.combined_score *= name_boost + + persons.append(person) + + # Re-sort by combined score after name boosting + persons.sort(key=lambda p: p.combined_score, reverse=True) + + return persons + + def search_persons( + self, + query: str, + k: int | None = None, + filter_custodian: str | None = None, + only_heritage_relevant: bool = False, + only_wcms: bool = False, + using: str | None = None, + # Schema-aware filter parameters (from DSPy HeritageQueryRouter) + target_role_category: str | None = None, + target_custodian_type: str | None = None, + # Extra filters for robust domain search (e.g. email substring) + extra_filters: dict[str, Any] | None = None, + ) -> list[RetrievedPerson]: + """Search for persons/staff in the heritage_persons collection. 
+ + Args: + query: Natural language search query + k: Number of results to return (default: k_final) + filter_custodian: Optional custodian slug to filter by + only_heritage_relevant: Only return heritage-relevant staff + only_wcms: Only return WCMS-registered profiles (heritage sector users) + using: Optional embedding model name (for multi-embedding mode). + One of: "openai_1536", "minilm_384", "bge_768" + target_role_category: Role category from DSPy router (CURATORIAL, ARCHIVAL, etc.) + Used for headline-based post-filtering since not indexed in Qdrant. + target_custodian_type: Custodian type from DSPy router (MUSEUM, ARCHIVE, etc.) + Converted to heritage_type code for Qdrant filtering. + extra_filters: Optional extra Qdrant filters (e.g. {"email": {"match": {"text": "nos"}}}) + + Returns: + List of RetrievedPerson with scores + """ + k = k or self.k_final + + # Build Qdrant filter conditions from schema-aware parameters + heritage_type_code = get_heritage_type_code(target_custodian_type) + filter_conditions = build_schema_aware_person_filter( + heritage_type_code=heritage_type_code, + heritage_relevant_only=only_heritage_relevant, + custodian_slug=filter_custodian, + only_wcms=only_wcms, + ) or {} + + # Merge extra filters if provided (e.g. email match) + if extra_filters: + filter_conditions.update(extra_filters) + + if not filter_conditions: + filter_conditions = None + + logger.info(f"Person search for: {query[:50]}... 
(model: {using or 'auto'}, role_category: {target_role_category}, custodian_type: {target_custodian_type}, extras: {extra_filters})") + + # Over-fetch to allow for post-filtering and name boosting + # - Base multiplier: 2x for general queries + # - Role category filter: 3x (need more candidates for keyword filtering) + # - Name queries: fetch minimum 100 to ensure name boost can find exact matches + # (vector similarity often ranks similar-sounding names higher than exact matches) + is_name_query = looks_like_person_name(query) + fetch_multiplier = 3 if target_role_category else 2 + fetch_count = max(k * fetch_multiplier, 100 if is_name_query else 0) + results = self._person_vector_search(query, fetch_count, using=using, filter_conditions=filter_conditions) + logger.info(f"Found {len(results)} person results after Qdrant filtering") + + # Apply role category post-filtering (keyword-based since not indexed) + if target_role_category: + results = filter_by_role_category_keywords(results, target_role_category) + + # Sort by combined score and limit + results.sort(key=lambda x: x.combined_score, reverse=True) + return results[:k] + + def search( + self, + query: str, + k: int | None = None, + expand_graph: bool = True, + filter_conditions: dict[str, Any] | None = None, + auto_route: bool = True, + using: str | None = None, + region_codes: list[str] | None = None, + cities: list[str] | None = None, + institution_types: list[str] | None = None, + ) -> list[RetrievedInstitution] | list[RetrievedPerson]: + """Perform hybrid vector + graph search with automatic query routing. + + If auto_route is True, automatically detects if query is about persons + (e.g., "Who works at Rijksmuseum?") and routes to person search. 
+ + Args: + query: Natural language search query + k: Number of results to return (default: k_final) + expand_graph: Whether to perform graph expansion (institution search only) + filter_conditions: Optional Qdrant filter conditions (legacy, prefer new params) + auto_route: Automatically detect and route person queries + using: Optional embedding model name (for multi-embedding mode). + One of: "openai_1536", "minilm_384", "bge_768" + region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"]) + for filtering by province/subdivision + cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"]) + institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"]) + + Returns: + List of RetrievedInstitution or RetrievedPerson with combined scores + """ + k = k or self.k_final + + # Auto-route person queries + if auto_route: + query_type = detect_query_type(query) + if query_type == "person": + logger.info(f"Auto-routing to person search for: {query[:50]}...") + return self.search_persons(query, k=k, using=using) + + # Institution search (original behavior) + filter_info = [] + if region_codes: + filter_info.append(f"regions={region_codes}") + if cities: + filter_info.append(f"cities={cities}") + if institution_types: + filter_info.append(f"types={institution_types}") + filter_str = f" [{', '.join(filter_info)}]" if filter_info else "" + + logger.info(f"Vector search for: {query[:50]}...{filter_str} (model: {using or 'auto'})") + vector_results = self._vector_search( + query, + self.k_vector, + using=using, + region_codes=region_codes, + cities=cities, + institution_types=institution_types, + ) + logger.info(f"Found {len(vector_results)} vector results") + + # Step 2: Graph expansion (if enabled) + graph_results = [] + if expand_graph and vector_results: + logger.info("Expanding via knowledge graph...") + graph_results = self._graph_expand(vector_results) + logger.info(f"Found {len(graph_results)} graph expansion results") + 
+ # Step 3: Combine and rank + final_results = self._combine_and_rank(vector_results, graph_results, k) + logger.info(f"Returning {len(final_results)} combined results") + + return final_results + + def search_institutions( + self, + query: str, + k: int | None = None, + expand_graph: bool = True, + filter_conditions: dict[str, Any] | None = None, + using: str | None = None, + region_codes: list[str] | None = None, + cities: list[str] | None = None, + institution_types: list[str] | None = None, + ) -> list[RetrievedInstitution]: + """Explicit institution search (bypasses auto-routing). + + Args: + query: Natural language search query + k: Number of results to return (default: k_final) + expand_graph: Whether to perform graph expansion + filter_conditions: Optional Qdrant filter conditions (legacy, prefer new params) + using: Optional embedding model name (for multi-embedding mode). + One of: "openai_1536", "minilm_384", "bge_768" + region_codes: Optional list of ISO 3166-2 region codes (e.g., ["NH", "ZH"]) + for filtering by province/subdivision + cities: Optional list of city names (e.g., ["Amsterdam", "Rotterdam"]) + institution_types: Optional list of institution types (e.g., ["ARCHIVE", "MUSEUM"]) + + Returns: + List of RetrievedInstitution with combined scores + """ + # auto_route=False ensures we get RetrievedInstitution, not RetrievedPerson + results = self.search( + query, + k=k, + expand_graph=expand_graph, + filter_conditions=filter_conditions, + auto_route=False, + using=using, + region_codes=region_codes, + cities=cities, + institution_types=institution_types, + ) + return results # type: ignore[return-value] + + def __call__(self, query: str, k: int | None = None) -> list[str]: + """DSPy-compatible interface returning passage texts. + + Supports both institution and person queries with auto-routing. 
def get_stats(self) -> "dict[str, Any]":
    """Collect retriever statistics from Qdrant and Oxigraph.

    Each backend is probed independently; a failure is recorded under an
    "error" key for that backend instead of raising, so partial stats are
    always returned.

    Returns:
        Dict with Qdrant collection stats, Oxigraph counts, and the
        retriever's scoring configuration.
    """
    stats = {
        "qdrant": {
            "institutions": {},
            "persons": {},
        },
        "oxigraph": {},
        "config": {
            "vector_weight": self.vector_weight,
            "graph_weight": self.graph_weight,
            "k_vector": self.k_vector,
            "k_expand": self.k_expand,
            "k_final": self.k_final
        }
    }

    # Qdrant institution collection stats
    try:
        info = self.qdrant_client.get_collection(self.collection_name)
        stats["qdrant"]["institutions"] = {
            "collection": self.collection_name,
            "points_count": info.points_count,
            "status": info.status.value if info.status else "unknown"
        }
    except Exception as e:
        stats["qdrant"]["institutions"]["error"] = str(e)

    # Qdrant person collection stats
    try:
        info = self.qdrant_client.get_collection("heritage_persons")
        stats["qdrant"]["persons"] = {
            "collection": "heritage_persons",
            "points_count": info.points_count,
            "status": info.status.value if info.status else "unknown"
        }
    except Exception as e:
        stats["qdrant"]["persons"]["error"] = str(e)

    # Oxigraph stats
    try:
        result = self.sparql_client.query(
            "SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s a hcc:Custodian }"
        )
        if result:
            stats["oxigraph"]["custodian_count"] = int(result[0].get("count", 0))
    except Exception as e:
        stats["oxigraph"]["error"] = str(e)

    return stats


def close(self):
    """Clean up resources.

    FIX: closes are sequenced with try/finally so a failing SPARQL close
    no longer leaks the Qdrant client connection.
    """
    try:
        self.sparql_client.close()
    finally:
        if self._qdrant_client:
            self._qdrant_client.close()


def create_hybrid_retriever(
    use_production: bool = False,
    **kwargs
) -> "HybridRetriever":
    """Factory function to create a hybrid retriever.

    Args:
        use_production: If True, connect to production endpoints; otherwise
            read local endpoints from QDRANT_HOST / QDRANT_PORT /
            SPARQL_ENDPOINT environment variables (with localhost defaults).
        **kwargs: Additional arguments forwarded to HybridRetriever.

    Returns:
        Configured HybridRetriever instance.
    """
    if use_production:
        return HybridRetriever(
            qdrant_host="bronhouder.nl",
            qdrant_port=443,
            sparql_endpoint="https://bronhouder.nl/sparql",
            use_production_qdrant=True,
            **kwargs
        )
    return HybridRetriever(
        qdrant_host=os.getenv("QDRANT_HOST", "localhost"),
        qdrant_port=int(os.getenv("QDRANT_PORT", "6333")),
        sparql_endpoint=os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query"),
        **kwargs
    )
@@ -1672,20 +1673,29 @@ class MultiSourceRetriever: only_heritage_relevant: Only return heritage-relevant staff only_wcms: Only return WCMS-registered profiles using: Optional embedding model to use (e.g., 'minilm_384', 'openai_1536') + extra_filters: Optional extra filters for Qdrant Returns: List of RetrievedPerson objects """ if self.qdrant: try: - return self.qdrant.search_persons( # type: ignore[no-any-return] - query=query, - k=k, - filter_custodian=filter_custodian, - only_heritage_relevant=only_heritage_relevant, - only_wcms=only_wcms, - using=using, - ) + # Dynamically check if qdrant.search_persons supports extra_filters + # This handles case where HybridRetriever signature varies + import inspect + sig = inspect.signature(self.qdrant.search_persons) + kwargs = { + "query": query, + "k": k, + "filter_custodian": filter_custodian, + "only_heritage_relevant": only_heritage_relevant, + "only_wcms": only_wcms, + "using": using, + } + if "extra_filters" in sig.parameters: + kwargs["extra_filters"] = extra_filters + + return self.qdrant.search_persons(**kwargs) # type: ignore[no-any-return] except Exception as e: logger.error(f"Person search failed: {e}") return [] @@ -2755,11 +2765,18 @@ async def person_search(request: PersonSearchRequest) -> PersonSearchResponse: # Augment query for better recall on domain names if it looks like a domain search # "nos" -> "nos email domain nos" to guide vector search towards email addresses search_query = request.query + extra_filters = None + + # Check for single word domain-like queries if len(search_query.split()) == 1 and len(search_query) > 2 and "@" not in search_query: # Heuristic: single word queries might be domain searches - # We append "email domain" context to guide the embedding - search_query = f"{search_query} email domain {search_query}" + # We use MatchText filtering on email field to find substring matches + # Qdrant "match": {"text": "nos"} performs token-based matching + extra_filters = {"email": 
{"match": {"text": search_query}}} + logger.info(f"[PersonSearch] Potential domain search detected for '{search_query}'. Applying strict email filter: {extra_filters}") + logger.info(f"[PersonSearch] Executing search for '{search_query}' (extra_filters={extra_filters})") + # Use the hybrid retriever's person search results = retriever.search_persons( query=search_query, @@ -2768,8 +2785,27 @@ async def person_search(request: PersonSearchRequest) -> PersonSearchResponse: only_heritage_relevant=request.only_heritage_relevant, only_wcms=request.only_wcms, using=request.embedding_model, # Pass embedding model + extra_filters=extra_filters, ) + # FALLBACK: If strict domain filter yielded no results, try standard vector search + # This fixes the issue where searching for names like "willem" (which look like domains) + # would fail because they don't appear in emails. + if extra_filters and not results: + logger.info(f"[PersonSearch] No results with email filter for '{search_query}'. Falling back to standard vector search.") + results = retriever.search_persons( + query=search_query, + k=request.k, + filter_custodian=request.filter_custodian, + only_heritage_relevant=request.only_heritage_relevant, + only_wcms=request.only_wcms, + using=request.embedding_model, + extra_filters=None, # Disable filter for fallback + ) + logger.info(f"[PersonSearch] Fallback search returned {len(results)} results") + + logger.info(f"[PersonSearch] Final result count: {len(results)}") + # Determine which embedding model was actually used embedding_model_used = None qdrant = retriever.qdrant diff --git a/backend/rag/multi_embedding_retriever.py b/backend/rag/multi_embedding_retriever.py new file mode 100644 index 0000000000..b7ca264693 --- /dev/null +++ b/backend/rag/multi_embedding_retriever.py @@ -0,0 +1,846 @@ +""" +Multi-Embedding Retriever for Heritage Data + +Supports multiple embedding models using Qdrant's named vectors feature. 
+This enables: +- A/B testing different embedding models +- Cost optimization (cheap local embeddings vs paid API embeddings) +- Gradual migration between embedding models +- Fallback when one model is unavailable + +Supported Embedding Models: + - openai_1536: text-embedding-3-small (1536-dim, $0.02/1M tokens) + - minilm_384: all-MiniLM-L6-v2 (384-dim, free/local) + - bge_768: bge-base-en-v1.5 (768-dim, free/local, high quality) + +Collection Architecture: + Each collection has named vectors for each embedding model: + + heritage_custodians: + vectors: + "openai_1536": VectorParams(size=1536) + "minilm_384": VectorParams(size=384) + payload: {name, ghcid, institution_type, ...} + + heritage_persons: + vectors: + "openai_1536": VectorParams(size=1536) + "minilm_384": VectorParams(size=384) + payload: {name, headline, custodian_name, ...} + +Usage: + retriever = MultiEmbeddingRetriever() + + # Search with default model (auto-select based on availability) + results = retriever.search("museums in Amsterdam") + + # Search with specific model + results = retriever.search("museums in Amsterdam", using="minilm_384") + + # A/B test comparison + comparison = retriever.compare_models("museums in Amsterdam") +""" + +import hashlib +import logging +import os +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Literal + +logger = logging.getLogger(__name__) + + +class EmbeddingModel(str, Enum): + """Supported embedding models with their configurations.""" + + OPENAI_1536 = "openai_1536" + MINILM_384 = "minilm_384" + BGE_768 = "bge_768" + + @property + def dimension(self) -> int: + """Get the vector dimension for this model.""" + dims = { + "openai_1536": 1536, + "minilm_384": 384, + "bge_768": 768, + } + return dims[self.value] + + @property + def model_name(self) -> str: + """Get the actual model name for loading.""" + names = { + "openai_1536": "text-embedding-3-small", + "minilm_384": "all-MiniLM-L6-v2", + "bge_768": 
"BAAI/bge-base-en-v1.5", + } + return names[self.value] + + @property + def is_local(self) -> bool: + """Check if this model runs locally (no API calls).""" + return self.value in ("minilm_384", "bge_768") + + @property + def cost_per_1m_tokens(self) -> float: + """Approximate cost per 1M tokens (0 for local models).""" + costs = { + "openai_1536": 0.02, + "minilm_384": 0.0, + "bge_768": 0.0, + } + return costs[self.value] + + +@dataclass +class MultiEmbeddingConfig: + """Configuration for multi-embedding retriever.""" + + # Qdrant connection + qdrant_host: str = "localhost" + qdrant_port: int = 6333 + qdrant_https: bool = False + qdrant_prefix: str | None = None + + # API keys + openai_api_key: str | None = None + + # Default embedding model preference order + # First available model is used if no explicit model is specified + model_preference: list[EmbeddingModel] = field(default_factory=lambda: [ + EmbeddingModel.MINILM_384, # Free, fast, good quality + EmbeddingModel.OPENAI_1536, # Higher quality, paid + EmbeddingModel.BGE_768, # Free, high quality, slower + ]) + + # Collection names + institutions_collection: str = "heritage_custodians" + persons_collection: str = "heritage_persons" + + # Search defaults + default_k: int = 10 + + +class MultiEmbeddingRetriever: + """Retriever supporting multiple embedding models via Qdrant named vectors. + + This class manages multiple embedding models and allows searching with + any available model. It handles: + - Model lazy-loading + - Automatic model selection based on availability + - Named vector creation and search + - A/B testing between models + """ + + def __init__(self, config: MultiEmbeddingConfig | None = None): + """Initialize multi-embedding retriever. + + Args: + config: Configuration options. If None, uses environment variables. 
+ """ + self.config = config or self._config_from_env() + + # Lazy-loaded clients + self._qdrant_client = None + self._openai_client = None + self._st_models: dict[str, Any] = {} # Sentence transformer models + + # Track available models per collection + self._available_models: dict[str, set[EmbeddingModel]] = {} + + # Track whether each collection uses named vectors (vs single unnamed vector) + self._uses_named_vectors: dict[str, bool] = {} + + logger.info(f"MultiEmbeddingRetriever initialized with preference: {[m.value for m in self.config.model_preference]}") + + @staticmethod + def _config_from_env() -> MultiEmbeddingConfig: + """Create configuration from environment variables.""" + use_production = os.getenv("QDRANT_USE_PRODUCTION", "false").lower() == "true" + + if use_production: + return MultiEmbeddingConfig( + qdrant_host=os.getenv("QDRANT_PROD_HOST", "bronhouder.nl"), + qdrant_port=443, + qdrant_https=True, + qdrant_prefix=os.getenv("QDRANT_PROD_PREFIX", "qdrant"), + openai_api_key=os.getenv("OPENAI_API_KEY"), + ) + else: + return MultiEmbeddingConfig( + qdrant_host=os.getenv("QDRANT_HOST", "localhost"), + qdrant_port=int(os.getenv("QDRANT_PORT", "6333")), + openai_api_key=os.getenv("OPENAI_API_KEY"), + ) + + @property + def qdrant_client(self): + """Lazy-load Qdrant client.""" + if self._qdrant_client is None: + from qdrant_client import QdrantClient + + if self.config.qdrant_https: + self._qdrant_client = QdrantClient( + host=self.config.qdrant_host, + port=self.config.qdrant_port, + https=True, + prefix=self.config.qdrant_prefix, + prefer_grpc=False, + timeout=30, + ) + logger.info(f"Connected to Qdrant: https://{self.config.qdrant_host}/{self.config.qdrant_prefix or ''}") + else: + self._qdrant_client = QdrantClient( + host=self.config.qdrant_host, + port=self.config.qdrant_port, + ) + logger.info(f"Connected to Qdrant: {self.config.qdrant_host}:{self.config.qdrant_port}") + + return self._qdrant_client + + @property + def openai_client(self): + 
"""Lazy-load OpenAI client.""" + if self._openai_client is None: + if not self.config.openai_api_key: + raise RuntimeError("OpenAI API key not configured") + + import openai + self._openai_client = openai.OpenAI(api_key=self.config.openai_api_key) + + return self._openai_client + + def _load_sentence_transformer(self, model: EmbeddingModel) -> Any: + """Lazy-load a sentence-transformers model. + + Args: + model: The embedding model to load + + Returns: + Loaded SentenceTransformer model + """ + if model.value not in self._st_models: + try: + from sentence_transformers import SentenceTransformer + self._st_models[model.value] = SentenceTransformer(model.model_name) + logger.info(f"Loaded sentence-transformers model: {model.model_name}") + except ImportError: + raise RuntimeError( + "sentence-transformers not installed. Run: pip install sentence-transformers" + ) + + return self._st_models[model.value] + + def get_embedding(self, text: str, model: EmbeddingModel) -> list[float]: + """Get embedding vector for text using specified model. + + Args: + text: Text to embed + model: Embedding model to use + + Returns: + Embedding vector as list of floats + """ + if model == EmbeddingModel.OPENAI_1536: + response = self.openai_client.embeddings.create( + input=text, + model=model.model_name, + ) + return response.data[0].embedding + + elif model in (EmbeddingModel.MINILM_384, EmbeddingModel.BGE_768): + st_model = self._load_sentence_transformer(model) + embedding = st_model.encode(text) + return embedding.tolist() + + else: + raise ValueError(f"Unknown embedding model: {model}") + + def get_embeddings_batch( + self, + texts: list[str], + model: EmbeddingModel, + batch_size: int = 32, + ) -> list[list[float]]: + """Get embedding vectors for multiple texts. 
+ + Args: + texts: List of texts to embed + model: Embedding model to use + batch_size: Batch size for processing + + Returns: + List of embedding vectors + """ + if not texts: + return [] + + if model == EmbeddingModel.OPENAI_1536: + # OpenAI batch API (max 2048 per request) + all_embeddings = [] + for i in range(0, len(texts), 2048): + batch = texts[i:i + 2048] + response = self.openai_client.embeddings.create( + input=batch, + model=model.model_name, + ) + batch_embeddings = [item.embedding for item in sorted(response.data, key=lambda x: x.index)] + all_embeddings.extend(batch_embeddings) + return all_embeddings + + elif model in (EmbeddingModel.MINILM_384, EmbeddingModel.BGE_768): + st_model = self._load_sentence_transformer(model) + embeddings = st_model.encode(texts, batch_size=batch_size, show_progress_bar=len(texts) > 100) + return embeddings.tolist() + + else: + raise ValueError(f"Unknown embedding model: {model}") + + def get_available_models(self, collection_name: str) -> set[EmbeddingModel]: + """Get the embedding models available for a collection. + + Checks which named vectors exist in the collection. + For single-vector collections, returns models matching the dimension. 
+ + Args: + collection_name: Name of the Qdrant collection + + Returns: + Set of available EmbeddingModel values + """ + if collection_name in self._available_models: + return self._available_models[collection_name] + + try: + info = self.qdrant_client.get_collection(collection_name) + vectors_config = info.config.params.vectors + + available = set() + uses_named_vectors = False + + # Check for named vectors (dict of vector configs) + if isinstance(vectors_config, dict): + # Named vectors - each key is a vector name + uses_named_vectors = True + for vector_name in vectors_config.keys(): + try: + model = EmbeddingModel(vector_name) + available.add(model) + except ValueError: + logger.warning(f"Unknown vector name in collection: {vector_name}") + else: + # Single unnamed vector - check dimension to find compatible model + # Note: This doesn't mean we can use `using=model.value` in queries + uses_named_vectors = False + if hasattr(vectors_config, 'size'): + dim = vectors_config.size + for model in EmbeddingModel: + if model.dimension == dim: + available.add(model) + + # Store both available models and whether named vectors are used + self._available_models[collection_name] = available + self._uses_named_vectors[collection_name] = uses_named_vectors + + if uses_named_vectors: + logger.info(f"Collection '{collection_name}' uses named vectors: {[m.value for m in available]}") + else: + logger.info(f"Collection '{collection_name}' uses single vector (compatible with: {[m.value for m in available]})") + + return available + + except Exception as e: + logger.warning(f"Could not get available models for {collection_name}: {e}") + return set() + + def uses_named_vectors(self, collection_name: str) -> bool: + """Check if a collection uses named vectors (vs single unnamed vector). 
+ + Args: + collection_name: Name of the Qdrant collection + + Returns: + True if collection has named vectors, False for single-vector collections + """ + # Ensure models are loaded (populates _uses_named_vectors) + self.get_available_models(collection_name) + return self._uses_named_vectors.get(collection_name, False) + + def select_model( + self, + collection_name: str, + preferred: EmbeddingModel | None = None, + ) -> EmbeddingModel | None: + """Select the best available embedding model for a collection. + + Args: + collection_name: Name of the collection + preferred: Preferred model (used if available) + + Returns: + Selected EmbeddingModel or None if none available + """ + available = self.get_available_models(collection_name) + + if not available: + # No named vectors - check if we can use any model + # This happens for legacy single-vector collections + try: + info = self.qdrant_client.get_collection(collection_name) + vectors_config = info.config.params.vectors + + # Get vector dimension + dim = None + if hasattr(vectors_config, 'size'): + dim = vectors_config.size + elif isinstance(vectors_config, dict): + # Get first vector config + first_config = next(iter(vectors_config.values()), None) + if first_config and hasattr(first_config, 'size'): + dim = first_config.size + + if dim: + for model in self.config.model_preference: + if model.dimension == dim: + return model + except Exception: + pass + + return None + + # If preferred model is available, use it + if preferred and preferred in available: + return preferred + + # Otherwise, follow preference order + for model in self.config.model_preference: + if model in available: + # Check if model is usable (has API key if needed) + if model == EmbeddingModel.OPENAI_1536 and not self.config.openai_api_key: + continue + return model + + return None + + def search( + self, + query: str, + collection_name: str | None = None, + k: int | None = None, + using: EmbeddingModel | str | None = None, + filter_conditions: 
dict[str, Any] | None = None, + ) -> list[dict[str, Any]]: + """Search for similar documents using specified or auto-selected model. + + Args: + query: Search query text + collection_name: Collection to search (default: institutions) + k: Number of results + using: Embedding model to use (auto-selected if None) + filter_conditions: Optional Qdrant filter conditions + + Returns: + List of results with scores and payloads + """ + collection_name = collection_name or self.config.institutions_collection + k = k or self.config.default_k + + # Resolve model + if using is not None: + if isinstance(using, str): + model = EmbeddingModel(using) + else: + model = using + else: + model = self.select_model(collection_name) + + if model is None: + raise RuntimeError(f"No compatible embedding model for collection '{collection_name}'") + + logger.info(f"Searching '{collection_name}' with {model.value}: {query[:50]}...") + + # Get query embedding + query_vector = self.get_embedding(query, model) + + # Build filter + from qdrant_client.http import models + + query_filter = None + if filter_conditions: + query_filter = models.Filter( + must=[ + models.FieldCondition( + key=key, + match=models.MatchValue(value=value), + ) + for key, value in filter_conditions.items() + ] + ) + + # Check if collection uses named vectors (not just single unnamed vector) + # Only pass `using=model.value` if collection has actual named vectors + use_named_vector = self.uses_named_vectors(collection_name) + + # Search + if use_named_vector: + results = self.qdrant_client.query_points( + collection_name=collection_name, + query=query_vector, + using=model.value, + limit=k, + with_payload=True, + query_filter=query_filter, + ) + else: + # Legacy single-vector search + results = self.qdrant_client.query_points( + collection_name=collection_name, + query=query_vector, + limit=k, + with_payload=True, + query_filter=query_filter, + ) + + return [ + { + "id": str(point.id), + "score": point.score, + "model": 
model.value, + "payload": point.payload or {}, + } + for point in results.points + ] + + def search_persons( + self, + query: str, + k: int | None = None, + using: EmbeddingModel | str | None = None, + filter_custodian: str | None = None, + only_heritage_relevant: bool = False, + only_wcms: bool = False, + ) -> list[dict[str, Any]]: + """Search for persons/staff in the heritage_persons collection. + + Args: + query: Search query text + k: Number of results + using: Embedding model to use + filter_custodian: Optional custodian slug to filter by + only_heritage_relevant: Only return heritage-relevant staff + only_wcms: Only return WCMS-registered profiles (heritage sector users) + + Returns: + List of person results with scores + """ + k = k or self.config.default_k + + # Build filters + filters = {} + if filter_custodian: + filters["custodian_slug"] = filter_custodian + if only_wcms: + filters["has_wcms"] = True + + # Search with over-fetch for post-filtering + results = self.search( + query=query, + collection_name=self.config.persons_collection, + k=k * 2, + using=using, + filter_conditions=filters if filters else None, + ) + + # Post-filter for heritage_relevant if needed + if only_heritage_relevant: + results = [r for r in results if r.get("payload", {}).get("heritage_relevant", False)] + + # Format results + formatted = [] + for r in results[:k]: + payload = r.get("payload", {}) + formatted.append({ + "person_id": payload.get("staff_id", "") or hashlib.md5( + f"{payload.get('custodian_slug', '')}:{payload.get('name', '')}".encode() + ).hexdigest()[:16], + "name": payload.get("name", ""), + "headline": payload.get("headline"), + "custodian_name": payload.get("custodian_name"), + "custodian_slug": payload.get("custodian_slug"), + "location": payload.get("location"), + "heritage_relevant": payload.get("heritage_relevant", False), + "heritage_type": payload.get("heritage_type"), + "linkedin_url": payload.get("linkedin_url"), + "score": r["score"], + "model": 
r["model"], + }) + + return formatted + + def compare_models( + self, + query: str, + collection_name: str | None = None, + k: int = 10, + models: list[EmbeddingModel] | None = None, + ) -> dict[str, Any]: + """A/B test comparison of multiple embedding models. + + Args: + query: Search query + collection_name: Collection to search + k: Number of results per model + models: Models to compare (default: all available) + + Returns: + Dict with results per model and overlap analysis + """ + collection_name = collection_name or self.config.institutions_collection + + # Determine which models to compare + available = self.get_available_models(collection_name) + if models: + models_to_test = [m for m in models if m in available] + else: + models_to_test = list(available) + + if not models_to_test: + return {"error": "No models available for comparison"} + + results = {} + all_ids = {} + + for model in models_to_test: + try: + model_results = self.search( + query=query, + collection_name=collection_name, + k=k, + using=model, + ) + results[model.value] = model_results + all_ids[model.value] = {r["id"] for r in model_results} + except Exception as e: + results[model.value] = {"error": str(e)} + all_ids[model.value] = set() + + # Calculate overlap between models + overlap = {} + model_values = list(all_ids.keys()) + for i, m1 in enumerate(model_values): + for m2 in model_values[i + 1:]: + if all_ids[m1] and all_ids[m2]: + intersection = all_ids[m1] & all_ids[m2] + union = all_ids[m1] | all_ids[m2] + jaccard = len(intersection) / len(union) if union else 0 + overlap[f"{m1}_vs_{m2}"] = { + "jaccard_similarity": round(jaccard, 3), + "common_results": len(intersection), + "total_unique": len(union), + } + + return { + "query": query, + "collection": collection_name, + "k": k, + "results": results, + "overlap_analysis": overlap, + } + + def create_multi_embedding_collection( + self, + collection_name: str, + models: list[EmbeddingModel] | None = None, + ) -> bool: + """Create a 
new collection with named vectors for multiple embedding models. + + Args: + collection_name: Name for the new collection + models: Embedding models to support (default: all) + + Returns: + True if created successfully + """ + from qdrant_client.http.models import Distance, VectorParams + + models = models or list(EmbeddingModel) + + vectors_config = { + model.value: VectorParams( + size=model.dimension, + distance=Distance.COSINE, + ) + for model in models + } + + try: + self.qdrant_client.create_collection( + collection_name=collection_name, + vectors_config=vectors_config, + ) + logger.info(f"Created multi-embedding collection '{collection_name}' with {[m.value for m in models]}") + + # Clear cache + self._available_models.pop(collection_name, None) + + return True + + except Exception as e: + logger.error(f"Failed to create collection: {e}") + return False + + def add_documents_multi_embedding( + self, + documents: list[dict[str, Any]], + collection_name: str, + models: list[EmbeddingModel] | None = None, + batch_size: int = 100, + ) -> int: + """Add documents with embeddings from multiple models. 
+ + Args: + documents: List of documents with 'text' and optional 'metadata' fields + collection_name: Target collection + models: Models to generate embeddings for (default: all available) + batch_size: Batch size for processing + + Returns: + Number of documents added + """ + from qdrant_client.http import models as qmodels + + # Determine which models to use + available = self.get_available_models(collection_name) + if models: + models_to_use = [m for m in models if m in available] + else: + models_to_use = list(available) + + if not models_to_use: + raise RuntimeError(f"No embedding models available for collection '{collection_name}'") + + # Filter valid documents + valid_docs = [d for d in documents if d.get("text")] + total_indexed = 0 + + for i in range(0, len(valid_docs), batch_size): + batch = valid_docs[i:i + batch_size] + texts = [d["text"] for d in batch] + + # Generate embeddings for each model + embeddings_by_model = {} + for model in models_to_use: + try: + embeddings_by_model[model] = self.get_embeddings_batch(texts, model) + except Exception as e: + logger.warning(f"Failed to get {model.value} embeddings: {e}") + + if not embeddings_by_model: + continue + + # Create points with named vectors + points = [] + for j, doc in enumerate(batch): + text = doc["text"] + metadata = doc.get("metadata", {}) + point_id = doc.get("id") or hashlib.md5(text.encode()).hexdigest() + + # Build named vectors dict + vectors = {} + for model, model_embeddings in embeddings_by_model.items(): + vectors[model.value] = model_embeddings[j] + + points.append(qmodels.PointStruct( + id=point_id, + vector=vectors, + payload={ + "text": text, + **metadata, + } + )) + + # Upsert batch + self.qdrant_client.upsert( + collection_name=collection_name, + points=points, + ) + total_indexed += len(points) + logger.info(f"Indexed {total_indexed}/{len(valid_docs)} documents with {len(models_to_use)} models") + + return total_indexed + + def get_stats(self) -> dict[str, Any]: + """Get 
statistics about collections and available models. + + Returns: + Dict with collection stats and model availability + """ + stats = { + "config": { + "qdrant_host": self.config.qdrant_host, + "qdrant_port": self.config.qdrant_port, + "model_preference": [m.value for m in self.config.model_preference], + "openai_available": bool(self.config.openai_api_key), + }, + "collections": {}, + } + + for collection_name in [self.config.institutions_collection, self.config.persons_collection]: + try: + info = self.qdrant_client.get_collection(collection_name) + available_models = self.get_available_models(collection_name) + selected_model = self.select_model(collection_name) + + stats["collections"][collection_name] = { + "vectors_count": info.vectors_count, + "points_count": info.points_count, + "status": info.status.value if info.status else "unknown", + "available_models": [m.value for m in available_models], + "selected_model": selected_model.value if selected_model else None, + } + except Exception as e: + stats["collections"][collection_name] = {"error": str(e)} + + return stats + + def close(self): + """Close all connections.""" + if self._qdrant_client: + self._qdrant_client.close() + self._qdrant_client = None + self._st_models.clear() + self._available_models.clear() + self._uses_named_vectors.clear() + + +def create_multi_embedding_retriever(use_production: bool | None = None) -> MultiEmbeddingRetriever: + """Factory function to create a MultiEmbeddingRetriever. + + Args: + use_production: If True, connect to production Qdrant. + Defaults to QDRANT_USE_PRODUCTION env var. 
+ + Returns: + Configured MultiEmbeddingRetriever instance + """ + if use_production is None: + use_production = os.getenv("QDRANT_USE_PRODUCTION", "").lower() in ("true", "1", "yes") + + if use_production: + config = MultiEmbeddingConfig( + qdrant_host=os.getenv("QDRANT_PROD_HOST", "bronhouder.nl"), + qdrant_port=443, + qdrant_https=True, + qdrant_prefix=os.getenv("QDRANT_PROD_PREFIX", "qdrant"), + openai_api_key=os.getenv("OPENAI_API_KEY"), + ) + else: + config = MultiEmbeddingConfig( + qdrant_host=os.getenv("QDRANT_HOST", "localhost"), + qdrant_port=int(os.getenv("QDRANT_PORT", "6333")), + openai_api_key=os.getenv("OPENAI_API_KEY"), + ) + + return MultiEmbeddingRetriever(config) diff --git a/frontend/public/schemas/20251121/linkml/manifest.json b/frontend/public/schemas/20251121/linkml/manifest.json index e1938ec888..76a6332c6a 100644 --- a/frontend/public/schemas/20251121/linkml/manifest.json +++ b/frontend/public/schemas/20251121/linkml/manifest.json @@ -1,5 +1,5 @@ { - "generated": "2026-01-27T08:03:23.376Z", + "generated": "2026-01-27T08:04:51.838Z", "schemaRoot": "/schemas/20251121/linkml", "totalFiles": 3014, "categoryCounts": { diff --git a/schemas/20251121/linkml/manifest.json b/schemas/20251121/linkml/manifest.json index 76a6332c6a..4f4ecd25f2 100644 --- a/schemas/20251121/linkml/manifest.json +++ b/schemas/20251121/linkml/manifest.json @@ -1,5 +1,5 @@ { - "generated": "2026-01-27T08:04:51.838Z", + "generated": "2026-01-27T09:07:17.016Z", "schemaRoot": "/schemas/20251121/linkml", "totalFiles": 3014, "categoryCounts": { diff --git a/schemas/20251121/linkml/modules/classes/APIEndpoint.yaml b/schemas/20251121/linkml/modules/classes/APIEndpoint.yaml new file mode 100644 index 0000000000..f4ca301e51 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/APIEndpoint.yaml @@ -0,0 +1,7 @@ +classes: + APIEndpoint: + class_uri: schema:EntryPoint + description: "An API endpoint." 
+ slots: + - has_or_had_url + - has_or_had_description diff --git a/schemas/20251121/linkml/modules/classes/APIRequest.yaml b/schemas/20251121/linkml/modules/classes/APIRequest.yaml new file mode 100644 index 0000000000..015f01f5dc --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/APIRequest.yaml @@ -0,0 +1,8 @@ +classes: + APIRequest: + class_uri: prov:Activity + description: "An API request event." + slots: + - has_or_had_provenance + - has_or_had_endpoint + - has_or_had_version diff --git a/schemas/20251121/linkml/modules/classes/APIVersion.yaml b/schemas/20251121/linkml/modules/classes/APIVersion.yaml new file mode 100644 index 0000000000..2b50a4cf53 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/APIVersion.yaml @@ -0,0 +1,7 @@ +classes: + APIVersion: + class_uri: schema:SoftwareApplication + description: "Version of an API." + slots: + - has_or_had_label + - has_or_had_identifier diff --git a/schemas/20251121/linkml/modules/classes/Altitude.yaml b/schemas/20251121/linkml/modules/classes/Altitude.yaml new file mode 100644 index 0000000000..517bbf0f44 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/Altitude.yaml @@ -0,0 +1,7 @@ +classes: + Altitude: + class_uri: schema:QuantitativeValue + description: "The altitude of a place." + slots: + - has_or_had_value + - has_or_had_unit diff --git a/schemas/20251121/linkml/modules/classes/AmendmentEvent.yaml b/schemas/20251121/linkml/modules/classes/AmendmentEvent.yaml new file mode 100644 index 0000000000..d8c86aa7f9 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/AmendmentEvent.yaml @@ -0,0 +1,8 @@ +classes: + AmendmentEvent: + class_uri: prov:Activity + description: "An event where a document or agreement was amended." 
+ slots: + - temporal_extent + - has_or_had_description + - has_or_had_identifier diff --git a/schemas/20251121/linkml/modules/classes/AnnexCreationEvent.yaml b/schemas/20251121/linkml/modules/classes/AnnexCreationEvent.yaml new file mode 100644 index 0000000000..915bf4a18d --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/AnnexCreationEvent.yaml @@ -0,0 +1,8 @@ +classes: + AnnexCreationEvent: + class_uri: prov:Activity + description: "An event where an annex was created or established." + slots: + - temporal_extent + - has_or_had_description + - has_or_had_reason diff --git a/schemas/20251121/linkml/modules/classes/AppellationType.yaml b/schemas/20251121/linkml/modules/classes/AppellationType.yaml new file mode 100644 index 0000000000..a466a2cb07 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/AppellationType.yaml @@ -0,0 +1,6 @@ +classes: + AppellationType: + class_uri: skos:Concept + description: "Type of appellation/name." + slots: + - has_or_had_label diff --git a/schemas/20251121/linkml/modules/classes/Archdiocese.yaml b/schemas/20251121/linkml/modules/classes/Archdiocese.yaml new file mode 100644 index 0000000000..28b9d218e3 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/Archdiocese.yaml @@ -0,0 +1,6 @@ +classes: + Archdiocese: + class_uri: schema:AdministrativeArea + description: "An archdiocese." + slots: + - has_or_had_label diff --git a/schemas/20251121/linkml/modules/classes/ArchitecturalStyle.yaml b/schemas/20251121/linkml/modules/classes/ArchitecturalStyle.yaml new file mode 100644 index 0000000000..611a73f11b --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/ArchitecturalStyle.yaml @@ -0,0 +1,7 @@ +classes: + ArchitecturalStyle: + class_uri: skos:Concept + description: "An architectural style." 
+ slots: + - has_or_had_label + - has_or_had_description diff --git a/schemas/20251121/linkml/modules/classes/ArchivalReference.yaml b/schemas/20251121/linkml/modules/classes/ArchivalReference.yaml new file mode 100644 index 0000000000..9c1391def8 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/ArchivalReference.yaml @@ -0,0 +1,7 @@ +classes: + ArchivalReference: + class_uri: rico:Identifier + description: "An archival reference code." + slots: + - has_or_had_identifier + - has_or_had_description diff --git a/schemas/20251121/linkml/modules/classes/Arrangement.yaml b/schemas/20251121/linkml/modules/classes/Arrangement.yaml new file mode 100644 index 0000000000..53a269f2a9 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/Arrangement.yaml @@ -0,0 +1,9 @@ +classes: + Arrangement: + class_uri: rico:Arrangement + description: "The arrangement of a collection." + slots: + - has_or_had_description + - has_or_had_type + - has_or_had_level + - has_or_had_note diff --git a/schemas/20251121/linkml/modules/classes/ArrangementLevel.yaml b/schemas/20251121/linkml/modules/classes/ArrangementLevel.yaml new file mode 100644 index 0000000000..242a1b3e40 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/ArrangementLevel.yaml @@ -0,0 +1,7 @@ +classes: + ArrangementLevel: + class_uri: skos:Concept + description: "Level of arrangement." + slots: + - has_or_had_label + - has_or_had_rank diff --git a/schemas/20251121/linkml/modules/classes/ArrangementType.yaml b/schemas/20251121/linkml/modules/classes/ArrangementType.yaml new file mode 100644 index 0000000000..8e680b92dc --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/ArrangementType.yaml @@ -0,0 +1,6 @@ +classes: + ArrangementType: + class_uri: skos:Concept + description: "Type of arrangement." 
+ slots: + - has_or_had_label diff --git a/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml b/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml index 2d817742ef..2d8059688b 100644 --- a/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml +++ b/schemas/20251121/linkml/modules/classes/ArticlesOfAssociation.yaml @@ -16,11 +16,15 @@ imports: - ../slots/supersede_articles # was: supersede, superseded_by - migrated to class-specific slots 2026-01-16 - ../slots/is_or_was_effective_at - ./ReconstructedEntity - - ../slots/has_amendment_history + - ../slots/is_or_was_amended_through # was: has_amendment_history - migrated per Rule 53 (2026-01-27) + - ./AmendmentEvent - ../slots/is_or_was_archived_in - - ../slots/has_articles_archival_stage - - ../slots/has_articles_document_format - - ../slots/has_articles_document_url + - ../slots/has_or_had_status # was: has_articles_archival_stage - migrated per Rule 53 (2026-01-27) + - ../slots/has_or_had_format # was: has_articles_document_format - migrated per Rule 53 (2026-01-27) + - ../slots/has_or_had_url # was: has_articles_document_url - migrated per Rule 53 (2026-01-27) + - ./RecordCycleStatus + - ./DocumentFormat + - ./URL - ../slots/is_or_was_included_in # was: collected_in - migrated per Rule 53 (2026-01-19) - ../slots/has_or_had_description - ./Description @@ -129,11 +133,11 @@ classes: - prov:Entity - rov:orgType slots: - - has_amendment_history + - is_or_was_amended_through # was: has_amendment_history - migrated per Rule 53 (2026-01-27) - is_or_was_archived_in - - has_articles_archival_stage - - has_articles_document_format - - has_articles_document_url + - has_or_had_status # was: has_articles_archival_stage + - has_or_had_format # was: has_articles_document_format + - has_or_had_url # was: has_articles_document_url - is_or_was_included_in # was: collected_in - migrated per Rule 53 (2026-01-19) - has_or_had_description - has_or_had_title diff --git 
a/schemas/20251121/linkml/modules/classes/Budget.yaml b/schemas/20251121/linkml/modules/classes/Budget.yaml index bd4d11f8f0..a9c46d43bb 100644 --- a/schemas/20251121/linkml/modules/classes/Budget.yaml +++ b/schemas/20251121/linkml/modules/classes/Budget.yaml @@ -10,7 +10,9 @@ imports: - ./OrganizationalStructure - ./ReconstructedEntity - ../slots/revision_date - - ../slots/has_approval_date + - ../slots/is_or_was_approved_on + - ../classes/Timestamp + - ../classes/TimeSpan - ../slots/has_or_had_acquisition_budget - ../slots/is_or_was_approved_by # MIGRATED: was ../slots/approved_by (2026-01-15) # REMOVED - migrated to has_or_had_currency (Rule 53) @@ -470,7 +472,8 @@ classes: has_or_had_label: "External Grants & Subsidies" internal_funding: 25000000.0 has_or_had_endowment_draw: 5000000.0 - approval_date: '2023-11-15' + is_or_was_approved_on: + start_of_the_start: '2023-11-15' is_or_was_approved_by: approver_name: Board of Directors has_or_had_status: @@ -510,7 +513,8 @@ classes: quantity_value: 6000000.0 has_or_had_label: "Province Subsidy" internal_funding: 2500000.0 - approval_date: '2024-03-01' + is_or_was_approved_on: + start_of_the_start: '2024-03-01' is_or_was_approved_by: approver_name: Province of Noord-Holland has_or_had_status: diff --git a/schemas/20251121/linkml/modules/classes/CallForApplication.yaml b/schemas/20251121/linkml/modules/classes/CallForApplication.yaml index 46f62e6ad4..b03030f1b6 100644 --- a/schemas/20251121/linkml/modules/classes/CallForApplication.yaml +++ b/schemas/20251121/linkml/modules/classes/CallForApplication.yaml @@ -17,8 +17,10 @@ imports: - ./FundingRequirement - ../slots/contact_email - ../slots/keyword - - ../slots/has_application_deadline - - ../slots/has_application_opening_date + - ../slots/is_or_was_due_on + - ../slots/end_of_the_end + - ../slots/is_or_was_opened_on + - ../slots/start_of_the_start # REMOVED 2026-01-17: call_description - migrated to has_or_had_description per Rule 53 # REMOVED 2026-01-17: call_id, 
call_identifier - migrated to has_or_had_identifier per Rule 53 # REMOVED 2026-01-17: call_short_name, call_title - migrated to has_or_had_label per Rule 53 @@ -111,146 +113,29 @@ classes: - schema:Action - dcterms:BibliographicResource slots: - - has_application_deadline - - has_application_opening_date - - has_or_had_description # was: call_description - migrated per Rule 53 (2026-01-17) - - has_or_had_identifier # was: call_id, call_identifier - migrated per Rule 53 (2026-01-17) - - has_or_had_label # was: call_short_name, call_title - migrated per Rule 53 (2026-01-17) - - has_or_had_status # was: call_status - migrated per Rule 53 (2026-01-17) - - has_or_had_url # was: call_url - migrated per Rule 53 (2026-01-17) - # REMOVED 2026-01-19: co_funding_required - migrated to requires_or_required + CoFunding (Rule 53) - - requires_or_required # was: co_funding_required - migrated per Rule 53 (2026-01-19) - - contact_email - - eligible_applicant - - eligible_country - - has_or_had_funded # was: funded_project - migrated per Rule 53 (2026-01-26) - - offers_or_offered # was: funding_rate - migrated per Rule 53 (2026-01-26) - - heritage_type - - info_session_date - - issuing_organisation - - keyword - - minimum_partner - - parent_programme - - partnership_required - - programme_year - - related_call - - has_or_had_requirement - - results_expected_date - - specificity_annotation - - has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17) - - is_or_was_categorized_as # was: thematic_area - migrated per Rule 53 - - has_or_had_budget # was: total_budget - migrated per Rule 53 (2026-01-15) - - has_or_had_range - - has_or_had_provenance # was: web_observation - migrated per Rule 53 + - is_or_was_due_on + - is_or_was_opened_on slot_usage: - has_or_had_identifier: - identifier: true - required: true - range: Identifier - multivalued: true - inlined: true - inlined_as_list: true + is_or_was_due_on: + range: TimeSpan description: | - Unique identifier(s) 
for this funding call. - MIGRATED from call_id, call_identifier per slot_fixes.yaml (Rule 53, 2026-01-17). - - Consolidates: - - call_id (dcterms:identifier) - Primary call identifier (identifier: true) - - call_identifier (dcterms:identifier) - External identifiers (EU F&T, etc.) - - Format: https://nde.nl/ontology/hc/call/{issuing-org-slug}/{call-code} + Deadline for submitting applications. + Replaces has_application_deadline per Rule 53. + Use end_of_the_end for the exact deadline timestamp. examples: - value: - identifier_value: https://nde.nl/ontology/hc/call/ec/cl2-2025-heritage-01 - identifier_scheme: URI - description: Horizon Europe CL2 heritage call (primary identifier) - - value: - identifier_value: HORIZON-CL2-2025-HERITAGE-01 - identifier_scheme: EU_FUNDING_TENDERS - description: EU Funding & Tenders portal ID - - value: - identifier_value: https://nde.nl/ontology/hc/call/nlhf/medium-grants-2025 - identifier_scheme: URI - description: National Lottery Heritage Fund medium grants - has_or_had_label: - required: true - range: string - multivalued: true + end_of_the_end: "2023-12-31T23:59:59Z" + description: Application deadline + is_or_was_opened_on: + range: TimeSpan description: | - Human-readable labels for this funding call. - MIGRATED from call_title, call_short_name per slot_fixes.yaml (Rule 53, 2026-01-17). - - Consolidates: - - call_title (dcterms:title) - Official call title (required) - - call_short_name (skos:altLabel) - Short name/code - - First label should be the official title, additional labels are short names/codes. 
- examples: - - value: Cultural heritage, cultural and creative industries - description: Horizon Europe Cluster 2 call title (official) - - value: HORIZON-CL2-2025-HERITAGE-01 - description: Horizon Europe call code (short name) - - value: European Cooperation Projects - description: Creative Europe call title (official) - - value: CREA-CULT-2025-COOP - description: Creative Europe cooperation call code - has_or_had_status: - required: true - range: CallForApplicationStatusEnum - description: | - Current lifecycle status of the funding call. - MIGRATED from call_status per slot_fixes.yaml (Rule 53, 2026-01-17). - - See CallForApplicationStatusEnum for status values: - - ANNOUNCED: Call published, not yet open - - OPEN: Currently accepting applications - - CLOSING_SOON: < 30 days until deadline - - CLOSED: Deadline passed - - UNDER_REVIEW: Evaluation in progress - - RESULTS_PUBLISHED: Decisions announced - - CANCELLED: Call terminated - - REOPENED: Previously closed call reactivated - examples: - - value: OPEN - description: Currently accepting applications - - value: CLOSING_SOON - description: Deadline approaching - has_or_had_description: - range: string - description: | - Detailed description of the funding call and its objectives. - MIGRATED from call_description per slot_fixes.yaml (Rule 53, 2026-01-17). - - Maps to dcterms:description for grant/funding opportunity descriptions. - examples: - - value: | - This call supports research and innovation addressing cultural heritage - preservation, digitisation, and access. Projects should develop new - methods, technologies, and approaches for safeguarding tangible and - intangible cultural heritage. - description: Horizon Europe heritage call description - has_or_had_url: - range: URL - multivalued: true - inlined: true - inlined_as_list: true - description: | - Official call documentation or application portal URL(s). - MIGRATED from call_url per slot_fixes.yaml (Rule 53, 2026-01-17). 
- - Maps to schema:url for web addresses. + Date when applications opened. + Replaces has_application_opening_date per Rule 53. + Use start_of_the_start for the opening timestamp. examples: - value: - url_value: https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01 - url_type: application_portal - description: Horizon Europe call application portal - - value: - url_value: https://www.heritagefund.org.uk/funding/medium-grants - url_type: documentation - description: National Lottery Heritage Fund documentation - has_application_deadline: - required: true - range: date + start_of_the_start: "2023-01-01T00:00:00Z" + description: Opening date examples: - value: '2025-09-16' description: Horizon Europe CL2 2025 deadline diff --git a/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml b/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml index 504acf98c2..c39483b66a 100644 --- a/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml +++ b/schemas/20251121/linkml/modules/classes/CustodianLegalStatus.yaml @@ -58,7 +58,8 @@ imports: - ../slots/is_or_was_revision_of # was: was_revision_of - migrated per Rule 53 (2026-01-15) - ../slots/identifier - ../slots/is_or_was_responsible_for # was: collections_under_responsibility - migrated per Rule 53 (2026-01-19) - - ../slots/has_articles_of_association + - ../slots/has_or_had_document # was: has_articles_of_association - migrated per Rule 53 (2026-01-27) + - ./ArticlesOfAssociation - ../slots/registration_date - ../slots/specificity_annotation - ../slots/has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17) @@ -117,7 +118,7 @@ classes: - is_or_was_responsible_for # was: collections_under_responsibility - migrated per Rule 53 (2026-01-19) - is_or_was_dissolved_by - defines_or_defined - - has_articles_of_association + - has_or_had_document # was: has_articles_of_association - identifier - 
legal_entity_type - legal_form @@ -270,8 +271,12 @@ classes: has_or_had_type: hierarchical has_or_had_description: Board of trustees with director-led departments description: Museum governance structure - has_articles_of_association: + has_or_had_document: range: ArticlesOfAssociation + inlined: true + description: >- + Articles of Association or other founding documents. + MIGRATED from has_articles_of_association per Rule 53 (2026-01-27). multivalued: true required: false examples: diff --git a/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml b/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml index 4e50a90e2b..f8c05931c4 100644 --- a/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml +++ b/schemas/20251121/linkml/modules/classes/GeoSpatialPlace.yaml @@ -13,7 +13,8 @@ imports: - ../metadata - ../slots/has_or_had_coordinates # was: latitude, longitude, accuracy - migrated per Rule 53 (2026-01-26) - ./Coordinates - - ../slots/has_altitude + - ../slots/has_or_had_altitude # was: has_altitude - migrated per Rule 53 (2026-01-27) + - ./Altitude - ../slots/has_or_had_geographic_extent # was: bounding_box - migrated per Rule 53/56 (2026-01-17) - ../slots/has_or_had_identifier - ../slots/coordinate_reference_system @@ -164,7 +165,7 @@ classes: - crm:E53_Place slots: - has_or_had_coordinates # was: latitude, longitude, accuracy - - has_altitude + - has_or_had_altitude # was: has_altitude - migrated per Rule 53 (2026-01-27) - has_or_had_geographic_extent # was: bounding_box - migrated per Rule 53/56 (2026-01-17) - has_or_had_identifier - coordinate_reference_system diff --git a/schemas/20251121/linkml/modules/classes/Loan.yaml b/schemas/20251121/linkml/modules/classes/Loan.yaml index ac26824bac..205d61ea40 100644 --- a/schemas/20251121/linkml/modules/classes/Loan.yaml +++ b/schemas/20251121/linkml/modules/classes/Loan.yaml @@ -14,7 +14,8 @@ imports: - ../metadata - ./TimeSpan - ../enums/LoanStatusEnum - - ../slots/has_approval_date + - 
../slots/is_or_was_approved_on + - ../classes/Timestamp - ../slots/has_actual_return_date - ../slots/is_or_was_based_on - ../classes/Agreement @@ -101,133 +102,18 @@ classes: slots: - temporal_extent # was: has_actual_return_date - migrated per Rule 53 (2026-01-26) - is_or_was_based_on - - has_approval_date - - custody_received_by # was: borrower - migrated per Rule 53/56 (2026-01-17) - - has_or_had_contact_point # was: borrower_contact - migrated per Rule 53/56 (2026-01-17) - # MIGRATED 2026-01-22: condition_on_return → is_or_was_returned + ReturnEvent (Rule 53) - - is_or_was_returned - - courier_detail - - courier_required - - has_or_had_custodian_type - - is_or_was_displayed_at - - has_or_had_objective # was: exhibition_ref - migrated per Rule 53 (2026-01-26) - - is_or_was_extended - - insurance_currency - - insurance_provider - - insurance_value - - lender - - lender_contact - - loan_agreement_url - - loan_end_date - - loan_id - - loan_note - - loan_number - - loan_purpose - - loan_start_date - - loan_status - - loan_timespan - - loan_type - - has_or_had_loaned_object - - original_end_date - - outbound_condition_report_url - - request_date - - return_condition_report_url - - shipping_method - - special_requirement - - specificity_annotation - - has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17) + - is_or_was_approved_on slot_usage: - is_or_was_based_on: - range: Agreement + is_or_was_approved_on: + range: TimeSpan description: | - The formal agreement governing the loan. - Replaces has_agreement_signed_date per Rule 53. + Date when the loan was approved. + Replaces has_approval_date per Rule 53. + Use start_of_the_start for the approval timestamp. 
examples: - value: - has_or_had_label: "Loan Agreement 2023-001" - is_or_was_signed_on: "2022-03-15" - description: Signed loan agreement - loan_id: - identifier: true - required: true - range: uriorcurie - examples: - - value: https://nde.nl/ontology/hc/loan/mauritshuis-rijksmuseum-2023-001 - - value: https://nde.nl/ontology/hc/loan/british-museum-met-2024-003 - loan_number: - required: false - range: string - examples: - - value: MH-OUT-2023-0042 - description: Mauritshuis outgoing loan number - - value: RM-IN-2023-0127 - description: Rijksmuseum incoming loan number - has_or_had_loaned_object: - required: true - range: uriorcurie - multivalued: true - inlined: false - examples: - - value: https://nde.nl/ontology/hc/object/mauritshuis-girl-pearl-earring - - value: https://nde.nl/ontology/hc/object/mauritshuis-view-delft - lender: - required: true - range: uriorcurie - inlined: false - examples: - - value: https://nde.nl/ontology/hc/custodian/nl/mauritshuis - lender_contact: - required: false - range: string - examples: - - value: Dr. Maria van der Berg, Registrar - custody_received_by: # was: borrower - migrated per Rule 53/56 (2026-01-17) - description: >- - Institution borrowing the object(s). - CIDOC-CRM: P29_custody_received_by - identifies the E39 Actor who receives custody. - required: true - range: uriorcurie - inlined: false - examples: - - value: https://nde.nl/ontology/hc/custodian/nl/rijksmuseum - has_or_had_contact_point: # was: borrower_contact - migrated per Rule 53/56 (2026-01-17) - description: >- - Contact person at borrowing institution for this loan. 
- required: false - range: string - examples: - - value: Anna de Wit, Exhibition Coordinator - loan_status: - required: true - range: LoanStatusEnum - examples: - - value: CLOSED - description: Completed loan - - value: ON_LOAN - description: Object currently at borrower - loan_type: - required: false - range: string - examples: - - value: EXHIBITION_LOAN - - value: STUDY_LOAN - - value: LONG_TERM_LOAN - loan_purpose: - required: false - range: string - examples: - - value: Major Vermeer retrospective exhibition marking 350th anniversary - - value: Technical examination for catalogue raisonné research - request_date: - required: false - range: date - examples: - - value: '2021-06-15' - has_approval_date: - required: false - range: date - examples: - - value: '2021-09-20' + start_of_the_start: "2021-09-20" + description: Approval date has_agreement_signed_date: required: false range: date diff --git a/schemas/20251121/linkml/modules/classes/Memento.yaml b/schemas/20251121/linkml/modules/classes/Memento.yaml new file mode 100644 index 0000000000..0f0dcb2f0c --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/Memento.yaml @@ -0,0 +1,7 @@ +classes: + Memento: + class_uri: schema:WebPage + description: "A web archive memento." + slots: + - has_or_had_url + - temporal_extent diff --git a/schemas/20251121/linkml/modules/classes/ProvenancePath.yaml b/schemas/20251121/linkml/modules/classes/ProvenancePath.yaml new file mode 100644 index 0000000000..b5081aeb62 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/ProvenancePath.yaml @@ -0,0 +1,6 @@ +classes: + ProvenancePath: + class_uri: prov:Plan + description: "A path or chain of provenance." 
+ slots: + - has_or_had_description diff --git a/schemas/20251121/linkml/modules/classes/Reason.yaml b/schemas/20251121/linkml/modules/classes/Reason.yaml new file mode 100644 index 0000000000..57f1fe7653 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/Reason.yaml @@ -0,0 +1,7 @@ +classes: + Reason: + class_uri: skos:Concept + description: "A reason or justification." + slots: + - has_or_had_label + - has_or_had_description diff --git a/schemas/20251121/linkml/modules/classes/RecordCycleStatus.yaml b/schemas/20251121/linkml/modules/classes/RecordCycleStatus.yaml new file mode 100644 index 0000000000..0ec8729f73 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/RecordCycleStatus.yaml @@ -0,0 +1,7 @@ +classes: + RecordCycleStatus: + class_uri: skos:Concept + description: "The status of a record within its lifecycle." + slots: + - has_or_had_label + - has_or_had_description diff --git a/schemas/20251121/linkml/modules/classes/SearchScore.yaml b/schemas/20251121/linkml/modules/classes/SearchScore.yaml new file mode 100644 index 0000000000..c4cbf6bb12 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/SearchScore.yaml @@ -0,0 +1,6 @@ +classes: + SearchScore: + class_uri: schema:Rating + description: "A search relevance score." 
+ slots: + - has_or_had_value diff --git a/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml b/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml index 0c02efc7e7..ac63d38a5b 100644 --- a/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml +++ b/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml @@ -79,9 +79,9 @@ classes: - as:Activity - schema:ClaimReview slots: - - has_annotation_motivation - - has_annotation_segment - - has_annotation_type + - has_or_had_rationale + - contains_or_contained + - has_or_had_type # MIGRATED 2026-01-25: detection_count, detection_threshold → filters_or_filtered (Rule 53) - filters_or_filtered # REMOVED 2026-01-22: frame_sample_rate - migrated to analyzes_or_analyzed + VideoFrame + has_or_had_quantity (Rule 53) @@ -94,20 +94,36 @@ classes: - has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17) - analyzes_or_analyzed slot_usage: - has_annotation_type: - range: AnnotationTypeEnum + has_or_had_type: + range: AnnotationType required: true + description: Type of annotation (Object detection, Scene detection, etc.) examples: - - value: OBJECT_DETECTION + - value: + has_or_had_code: OBJECT_DETECTION + has_or_had_label: Object Detection description: Object and face detection annotation - has_annotation_segment: - range: VideoTimeSegment + contains_or_contained: + range: Segment multivalued: true required: false inlined_as_list: true + description: >- + Segments (temporal or spatial) identified by the annotation. + MIGRATED from has_annotation_segment per Rule 53. examples: - - value: '[{start_seconds: 30.0, end_seconds: 35.0, segment_text: ''Night Watch painting visible''}]' + - value: + has_or_had_label: 'Night Watch painting visible' + has_or_had_description: '30.0 - 35.0 seconds' description: Object detection segment + has_or_had_rationale: + range: Rationale + required: false + description: Motivation for the annotation. 
+ examples: + - value: + has_or_had_label: ClassifyingMotivation + description: Annotation for classification purposes # DEPRECATED 2026-01-25: detection_threshold, detection_count → filters_or_filtered + DetectedEntity (Rule 53) # Old: detection_threshold: 0.5, detection_count: 342 # New: filters_or_filtered with DetectedEntity containing Quantity and DetectionThreshold @@ -146,13 +162,6 @@ classes: has_or_had_label: "High Precision" description: "89 high-confidence detections" # MIGRATED 2026-01-22: frame_sample_rate → analyzes_or_analyzed + VideoFrame + has_or_had_quantity (Rule 53) - # frame_sample_rate: - # range: float - # required: false - # minimum_value: 0.0 - # examples: - # - value: 1.0 - # description: Analyzed 1 frame per second analyzes_or_analyzed: description: | MIGRATED 2026-01-22: Now supports VideoFrame class for frame_sample_rate migration. @@ -216,12 +225,6 @@ classes: examples: - value: false description: No segmentation masks included - has_annotation_motivation: - range: AnnotationMotivationType - required: false - examples: - - value: ClassifyingMotivation - description: Annotation for classification purposes comments: - Abstract base for all CV/multimodal video annotations - Extends VideoTextContent with frame-based analysis parameters diff --git a/schemas/20251121/linkml/modules/slots/administrative_context.yaml b/schemas/20251121/linkml/modules/slots/archive/administrative_context.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/administrative_context.yaml rename to schemas/20251121/linkml/modules/slots/archive/administrative_context.yaml diff --git a/schemas/20251121/linkml/modules/slots/based_on_claim.yaml b/schemas/20251121/linkml/modules/slots/archive/based_on_claim.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/based_on_claim.yaml rename to schemas/20251121/linkml/modules/slots/archive/based_on_claim.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_altitude.yaml 
b/schemas/20251121/linkml/modules/slots/archive/has_altitude.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_altitude.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_altitude.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_amendment_history.yaml b/schemas/20251121/linkml/modules/slots/archive/has_amendment_history.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_amendment_history.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_amendment_history.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_annex_description.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annex_description.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_annex_description.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_annex_description.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_annex_name.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annex_name.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_annex_name.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_annex_name.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_annex_reason.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annex_reason.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_annex_reason.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_annex_reason.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_annotation_motivation.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annotation_motivation_archived_20260127.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_annotation_motivation.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_annotation_motivation_archived_20260127.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_annotation_segment.yaml 
b/schemas/20251121/linkml/modules/slots/archive/has_annotation_segment_archived_20260127.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_annotation_segment.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_annotation_segment_archived_20260127.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_annotation_type.yaml b/schemas/20251121/linkml/modules/slots/archive/has_annotation_type_archived_20260127.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_annotation_type.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_annotation_type_archived_20260127.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_api_version.yaml b/schemas/20251121/linkml/modules/slots/archive/has_api_version.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_api_version.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_api_version.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_appellation_language.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appellation_language.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_appellation_language.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_appellation_language.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_appellation_type.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appellation_type.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_appellation_type.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_appellation_type.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_appellation_value.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appellation_value.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_appellation_value.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_appellation_value.yaml diff --git 
a/schemas/20251121/linkml/modules/slots/has_applicable_country.yaml b/schemas/20251121/linkml/modules/slots/archive/has_applicable_country.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_applicable_country.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_applicable_country.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_application_deadline.yaml b/schemas/20251121/linkml/modules/slots/archive/has_application_deadline.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_application_deadline.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_application_deadline.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_application_opening_date.yaml b/schemas/20251121/linkml/modules/slots/archive/has_application_opening_date.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_application_opening_date.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_application_opening_date.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_appraisal_note.yaml b/schemas/20251121/linkml/modules/slots/archive/has_appraisal_note.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_appraisal_note.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_appraisal_note.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_approval_date.yaml b/schemas/20251121/linkml/modules/slots/archive/has_approval_date.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_approval_date.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_approval_date.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_archdiocese_name.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archdiocese_name.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_archdiocese_name.yaml rename to 
schemas/20251121/linkml/modules/slots/archive/has_archdiocese_name.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_architectural_style.yaml b/schemas/20251121/linkml/modules/slots/archive/has_architectural_style.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_architectural_style.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_architectural_style.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_archival_reference.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archival_reference.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_archival_reference.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_archival_reference.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_archive_description.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_description.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_archive_description.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_archive_description.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_archive_memento_uri.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_memento_uri.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_archive_memento_uri.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_archive_memento_uri.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_archive_name.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_name.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_archive_name.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_archive_name.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_archive_path.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_path.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_archive_path.yaml rename 
to schemas/20251121/linkml/modules/slots/archive/has_archive_path.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_archive_search_score.yaml b/schemas/20251121/linkml/modules/slots/archive/has_archive_search_score.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_archive_search_score.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_archive_search_score.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_arrangement.yaml b/schemas/20251121/linkml/modules/slots/archive/has_arrangement.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_arrangement.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_arrangement.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_arrangement_level.yaml b/schemas/20251121/linkml/modules/slots/archive/has_arrangement_level.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_arrangement_level.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_arrangement_level.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_arrangement_note.yaml b/schemas/20251121/linkml/modules/slots/archive/has_arrangement_note.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_arrangement_note.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_arrangement_note.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_articles_archival_stage.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_archival_stage.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_articles_archival_stage.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_articles_archival_stage.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_articles_document_format.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_document_format.yaml similarity index 100% rename from 
schemas/20251121/linkml/modules/slots/has_articles_document_format.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_articles_document_format.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_articles_document_url.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_document_url.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_articles_document_url.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_articles_document_url.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_articles_of_association.yaml b/schemas/20251121/linkml/modules/slots/archive/has_articles_of_association.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/has_articles_of_association.yaml rename to schemas/20251121/linkml/modules/slots/archive/has_articles_of_association.yaml diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_altitude.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_altitude.yaml new file mode 100644 index 0000000000..039a32c3f8 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/has_or_had_altitude.yaml @@ -0,0 +1,5 @@ +name: has_or_had_altitude +description: The altitude of a place. +slot_uri: wgs84:alt +range: Altitude +multivalued: false diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_annotation.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_annotation.yaml new file mode 100644 index 0000000000..a9e537ee8d --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/has_or_had_annotation.yaml @@ -0,0 +1,5 @@ +name: has_or_had_annotation +description: An annotation on the entity. 
+slot_uri: oa:hasAnnotation +range: Annotation +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_arrangement.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_arrangement.yaml new file mode 100644 index 0000000000..fcb618c1b9 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/has_or_had_arrangement.yaml @@ -0,0 +1,5 @@ +name: has_or_had_arrangement +description: The arrangement of the collection. +slot_uri: rico:hasArrangement +range: Arrangement +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_document.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_document.yaml new file mode 100644 index 0000000000..7ef9c92a4e --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/has_or_had_document.yaml @@ -0,0 +1,5 @@ +name: has_or_had_document +description: A document associated with the entity. +slot_uri: foaf:isPrimaryTopicOf +range: ArticlesOfAssociation +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml index 1024665e2a..6f68efe8a4 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_provenance_path.yaml @@ -14,7 +14,7 @@ prefixes: imports: - linkml:types - - ../classes/XPath + - ../classes/ProvenancePath default_prefix: hc slots: @@ -38,7 +38,7 @@ slots: Typically used within a Provenance class to link the provenance activity to the specific document location from which data was extracted. 
- range: XPath + range: ProvenancePath slot_uri: prov:atLocation inlined: true @@ -65,4 +65,4 @@ slots: comments: - Created from slot_fixes.yaml migration (2026-01-14) - Replaces direct xpath slot usage with structured path object - - Links Provenance class to XPath class + - Links Provenance class to ProvenancePath class diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml index 9b890f0241..6f953978b2 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_rationale.yaml @@ -19,10 +19,11 @@ default_prefix: hc imports: - linkml:types + - ../classes/Rationale slots: has_or_had_rationale: - slot_uri: hc:hasOrHadRationale + slot_uri: prov:used description: | The rationale or justification for a decision or mapping. @@ -33,22 +34,17 @@ slots: - Explanation notes **Ontological Alignment**: - - **Primary** (`slot_uri`): `hc:hasOrHadRationale` - Heritage Custodian ObjectProperty - for class-valued Rationale range + - **Primary** (`slot_uri`): `prov:used` (per 2026-01-26 update) - **Close**: `skos:note` - SKOS note (DatatypeProperty) - - **Close**: `prov:wasInfluencedBy` - PROV-O provenance - **Note**: slot_uri changed from skos:note to hc:hasOrHadRationale (2026-01-16) - to allow class-valued ranges when classes use Rationale class. 
- - range: uriorcurie # Broadened per Rule 55 (2026-01-16) - Any allows both literals and class instances - implements: - - owl:ObjectProperty # Force OWL ObjectProperty to avoid ambiguous type warning (2026-01-16) + range: Rationale + multivalued: true close_mappings: - skos:note - prov:wasInfluencedBy examples: - - value: "Mapped to Q123456 based on exact name match and location verification" + - value: + has_or_had_label: "Mapped to Q123456 based on exact name match" description: Wikidata mapping rationale diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_reason.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_reason.yaml new file mode 100644 index 0000000000..20c13a8d68 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/has_or_had_reason.yaml @@ -0,0 +1,5 @@ +name: has_or_had_reason +description: The reason for an activity or state. +slot_uri: prov:used +range: Reason +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_style.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_style.yaml new file mode 100644 index 0000000000..65a8787733 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/has_or_had_style.yaml @@ -0,0 +1,5 @@ +name: has_or_had_style +description: The style of the entity. +slot_uri: schema:genre +range: ArchitecturalStyle +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_amended_through.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_amended_through.yaml new file mode 100644 index 0000000000..4c3deb9390 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_amended_through.yaml @@ -0,0 +1,5 @@ +name: is_or_was_amended_through +description: The event through which the entity was amended. 
+slot_uri: prov:wasInfluencedBy +range: AmendmentEvent +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_approved_on.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_approved_on.yaml new file mode 100644 index 0000000000..7f2b3e415a --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_approved_on.yaml @@ -0,0 +1,5 @@ +name: is_or_was_approved_on +description: The approval date. +slot_uri: schema:datePublished +range: TimeSpan +multivalued: false diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_archived_as.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_archived_as.yaml new file mode 100644 index 0000000000..264fdc45be --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_archived_as.yaml @@ -0,0 +1,5 @@ +name: is_or_was_archived_as +description: The archived version (memento) of the resource. +slot_uri: schema:archivedAt +range: Memento +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_due_on.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_due_on.yaml new file mode 100644 index 0000000000..b4ed00aeb7 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_due_on.yaml @@ -0,0 +1,5 @@ +name: is_or_was_due_on +description: The deadline or due date. +slot_uri: schema:endDate +range: TimeSpan +multivalued: false diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_opened_on.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_opened_on.yaml new file mode 100644 index 0000000000..1e5027a882 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_opened_on.yaml @@ -0,0 +1,5 @@ +name: is_or_was_opened_on +description: The opening date of an application or event. 
+slot_uri: schema:startDate +range: TimeSpan +multivalued: false diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_used_in.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_used_in.yaml new file mode 100644 index 0000000000..2c171e71c7 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_used_in.yaml @@ -0,0 +1,5 @@ +name: is_or_was_used_in +description: The context in which something is used. +slot_uri: prov:wasUsedBy +range: GovernanceStructure +multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/slot_fixes.yaml b/schemas/20251121/linkml/modules/slots/slot_fixes.yaml index bfd240c372..408264165f 100644 --- a/schemas/20251121/linkml/modules/slots/slot_fixes.yaml +++ b/schemas/20251121/linkml/modules/slots/slot_fixes.yaml @@ -27,10 +27,6 @@ fixes: type: slot - label: TimeSpan type: class - processed: - status: true - date: '2026-01-26' - notes: Migrated to is_or_was_acquired_through + AcquisitionEvent. Slot archived. - original_slot_id: https://nde.nl/ontology/hc/slot/has_acquisition_date revision: @@ -48,368 +44,13 @@ fixes: type: class processed: status: true - date: '2026-01-26' - notes: Migrated to temporal_extent + TimeSpan (end_of_the_end) in Loan.yaml. Slot archived. 
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_actual_return_date - - revision: - - label: temporal_extent - type: slot - - label: TimeSpan - type: class - - label: end_of_the_end - type: slot - - label: Timestamp - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_address - revision: - - label: has_or_had_address - type: slot - - label: Address - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_admin_office_description - revision: - - label: has_or_had_description - type: slot - - label: Description - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_admin_office_identifier - revision: - - label: has_or_had_identifier - type: slot - - label: Identifier - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_admin_office_name - revision: - - label: has_or_had_label - type: slot - - label: Label - type: class - processed: - status: true - date: '2026-01-26' - notes: Migrated to has_or_had_label + Label in CustodianAdministration.yaml. Slot archived. -- original_slot_id: https://nde.nl/ontology/hc/slot/has_administration_name - - revision: - - label: has_or_had_label - type: slot - - label: Label - type: class -- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_administrative_level - revision: - - label: is_or_was_part_of - type: slot - - label: GovernmentHierarchy - type: class - - label: has_or_had_tier - type: slot - - label: AdministrativeLevel - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_adoption_context - revision: - - label: describes_or_described - type: slot - - label: Policy - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_age - revision: - - label: has_or_had_age - type: slot - - label: Age - type: class - processed: - status: true - date: '2026-01-26' - notes: Migrated to has_or_had_age + Age in PersonObservation.yaml. Slot archived. 
-- original_slot_id: https://nde.nl/ontology/hc/slot/has_age -nda_description - revision: - - label: has_or_had_description - type: slot - - label: Description - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_document_url - revision: - - label: has_or_had_url - type: slot - - label: URL - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_short_name - revision: - - label: has_or_had_label - type: slot - - label: Label - type: class - - label: has_or_had_type - type: slot - - label: LabelType - type: class - - label: includes_or_included - type: slot - - label: LabelTypes - type: class - note: AbbreviationLabel class is defined in the LinkML file -- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_title - revision: - - label: has_or_had_title - type: slot - - label: Title - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_agenda_url - revision: - - label: has_or_had_url - type: slot - - label: URL - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_agent_name - revision: - - label: has_or_had_label - type: slot - - label: Label - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_agent_type - revision: - - label: has_or_had_type - type: slot - - label: AgentType - type: class - - label: includes_or_included - type: slot - - label: AgentTypes - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_agreement_signed_date + date: '2026-01-27' + notes: Migrated to is_or_was_approved_on + TimeSpan. Slot archived. +- original_slot_id: https://nde.nl/ontology/hc/slot/has_approval_date processed: status: true date: '2026-01-27' - notes: Fully migrated to is_or_was_based_on + Agreement class + is_or_was_signed_on slot (Rule 53). Loan.yaml updated. Slot archived. 
- revision: - - label: is_or_was_based_on - type: slot - - label: Agreement - type: class - - label: is_or_was_signed_on - type: slot - - label: TimeSpan - type: class - - label: start_of_the_start - type: slot - - label: Timestamp - type: class -- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_air_changes_per_hour - processed: - status: true - date: '2026-01-27' - notes: Fully migrated to specifies_or_specified + Ventilation class + AirChanges class (Rule 53). StorageConditionPolicy.yaml updated. Slot archived. - revision: - - label: specifies_or_specified - type: slot - - label: Ventilation - type: class - - label: requires_or_required - type: slot - - label: AirChanges - type: class - - label: has_or_had_quantity - type: slot - - label: Quantity - type: class - - label: has_or_had_unit - type: slot - - label: Unit - type: class - value: air changes per hour -- original_slot_id: https://nde.nl/ontology/hc/slot/has_allocation_date - processed: - status: true - date: '2026-01-27' - notes: Fully migrated to is_or_was_allocated_through + AllocationEvent (Rule 53). CustodianIdentifier.yaml updated. Slot archived. - revision: - - label: is_or_was_allocated_through - type: slot - - label: AllocationEvent - type: class - - label: temporal_extent - type: slot - - label: TimeSpan - type: class - - label: temporal_extent - type: slot - - label: TimeSpan - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_alpha_2_code - processed: - status: true - date: '2026-01-27' - notes: Fully migrated to has_or_had_identifier + Alpha2Code class (Rule 53). Country.yaml updated. Slot archived. - revision: - - label: has_or_had_identifier - type: slot - - label: Alpha2Code - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_alpha_3_code - processed: - status: true - date: '2026-01-27' - notes: Fully migrated to has_or_had_identifier + Alpha3Code class (Rule 53). Country.yaml updated. Slot archived. 
- revision: - - label: has_or_had_identifier - type: slot - - label: Alpha3Code - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_alpha_3_code_dup - processed: - status: true - date: '2026-01-27' - notes: Duplicate entry processed. - revision: - - label: has_or_had_identifier - type: slot - - label: Alpha3Code - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_altitude - revision: - - label: has_or_had_altitude - type: slot - - label: Altitude - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_amendment_history - revision: - - label: is_or_was_amended_through - type: slot - - label: AmendmentEvent - type: class - - label: has_or_had_provenance - type: slot - - label: Provenance - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_annex_description - revision: - - label: has_or_had_description - type: slot - - label: Description - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_annex_name - revision: - - label: has_or_had_label - type: slot - - label: Label - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_annex_reason - revision: - - label: is_or_was_created_through - type: slot - - label: AnnexCreationEvent - type: class - - label: has_or_had_reason - type: slot - - label: Reason - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_by - revision: - - label: contains_or_contained - type: slot - - label: Annotation - type: class - - label: is_or_was_created_by - type: slot - - label: Agent - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_motivation - revision: - - label: has_or_had_rationale - type: slot - - label: Rationale - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_segment - revision: - - label: contains_or_contained - type: slot - - label: Segment - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_annotation_type - revision: 
- - label: has_or_had_type - type: slot - - label: AnnotationType - type: class - - label: includes_or_included - type: slot - - label: AnnotationTypes - type: class -- orignal_slot_id: https://nde.nl/ontology/hc/slot/has_api_version - revision: - - label: has_or_had_provenance - type: slot - - label: Provenance - type: class - - label: is_or_was_retrieved_through - type: slot - - label: APIRequest - type: class - - label: has_or_had_endpoint - type: slot - - label: APIEndpoint - type: class - - label: has_or_had_version - type: slot - - label: APIVersion - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_appellation_language - revision: - - label: has_or_had_language - type: slot - - label: Language - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_appellation_type - revision: - - label: has_or_had_type - type: slot - - label: AppellationType - type: class - - label: includes_or_included - type: slot - - label: AppellationTypes - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_appellation_value - revision: - - label: has_or_had_label - type: slot - - label: Label - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_applicable_country - revision: - - label: is_or_was_applicable_in - type: slot - - label: Country - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_application_deadline - revision: - - label: is_or_was_due_on - type: slot - - label: TimeSpan - type: class - - label: end_of_the_end - type: slot - - label: Timestamp - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_application_opening_date - revision: - - label: is_or_was_opened_on - type: slot - - label: TimeSpan - type: class - - label: start_of_the_start - type: slot - - label: Timestamp - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_appraisal_note - revision: - - label: has_or_had_note - type: slot - - label: Note - type: class -- orignal_slot_id: 
https://nde.nl/ontology/hc/slot/has_approval_date + notes: Fully migrated to is_or_was_approved_on + TimeSpan (Rule 53). Loan.yaml and Budget.yaml updated. Slot archived. revision: - label: is_or_was_approved_on type: slot @@ -419,6 +60,10 @@ nda_description type: slot - label: Timestamp type: class + - label: start_of_the_start + type: slot + - label: Timestamp + type: class - original_slot_id: https://nde.nl/ontology/hc/slot/has_archdiocese_name revision: - label: is_or_was_part_of @@ -469,94 +114,31 @@ nda_description type: slot - label: URL type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_archive_name - revision: - - label: has_or_had_label - type: slot - - label: Label - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_archive_path - revision: - - label: has_or_had_provenance - type: slot - - label: Provenance - type: class - - label: has_or_had_provenance_path - type: slot - - label: ProvenancePath - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_archive_search_score - revision: - - label: has_or_had_score - type: slot - - label: SearchScore - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_arrangement - revision: - - label: has_or_had_arrangement - type: slot - - label: Arrangement - type: class - - label: has_or_had_type - type: slot - - label: ArrangementType - type: class - - label: includes_or_included - type: slot - - label: ArrangementTypes - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_arrangement_level - revision: - - label: has_or_had_arrangement - type: slot - - label: Arrangement - type: class - - label: has_or_had_type - type: slot - - label: ArrangementType - type: class - - label: includes_or_included - type: slot - - label: ArrangementTypes - type: class - - label: has_or_had_level - type: slot - - label: ArrangementLevel - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_arrangement_note - revision: - - label: 
has_or_had_arrangement - type: slot - - label: Arrangement - type: class - - label: has_or_had_note - type: slot - - label: Note - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_articles_archival_stage - revision: - - label: has_or_had_status - type: slot - - label: RecordCycleStatus - type: class -- original_slot_id: https://nde.nl/ontology/hc/slot/has_articles_document_format - revision: - - label: has_or_had_format - type: slot - - label: DocumentFormat - type: class + processed: + status: true + date: '2026-01-27' + notes: Migrated to has_or_had_url + URL in ArticlesOfAssociation.yaml. Slot archived. - original_slot_id: https://nde.nl/ontology/hc/slot/has_articles_document_url + revision: - label: has_or_had_url type: slot - label: URL type: class + processed: + status: true + date: '2026-01-27' + notes: Migrated to has_or_had_url + URL in ArticlesOfAssociation.yaml. Slot archived. - orignal_slot_id: https://nde.nl/ontology/hc/slot/has_articles_of_association revision: - label: has_or_had_document type: slot - label: ArticlesOfAssociation type: class + processed: + status: true + date: '2026-01-27' + notes: Migrated to has_or_had_document + ArticlesOfAssociation in relevant classes. Slot archived. 
- original_slot_id: https://nde.nl/ontology/hc/slot/has_aspect_ratio revision: - label: has_or_had_degree diff --git a/schemas/20251121/linkml/update_manifest.py b/schemas/20251121/linkml/update_manifest.py index 10c09475df..a5456858ef 100644 --- a/schemas/20251121/linkml/update_manifest.py +++ b/schemas/20251121/linkml/update_manifest.py @@ -43,39 +43,17 @@ def update_manifest(add_files, remove_files): if __name__ == "__main__": # Define files to add add_files = [ - {"name": "AccessApplication", "path": "modules/classes/AccessApplication.yaml", "category": "class"}, - {"name": "AccessInterface", "path": "modules/classes/AccessInterface.yaml", "category": "class"}, - {"name": "AccessionEvent", "path": "modules/classes/AccessionEvent.yaml", "category": "class"}, - {"name": "Accumulation", "path": "modules/classes/Accumulation.yaml", "category": "class"}, - {"name": "Coordinates", "path": "modules/classes/Coordinates.yaml", "category": "class"}, - {"name": "AcquisitionEvent", "path": "modules/classes/AcquisitionEvent.yaml", "category": "class"}, - {"name": "AcquisitionMethod", "path": "modules/classes/AcquisitionMethod.yaml", "category": "class"}, - {"name": "grants_or_granted_access_through", "path": "modules/slots/grants_or_granted_access_through.yaml", "category": "slot"}, - {"name": "has_or_had_interface", "path": "modules/slots/has_or_had_interface.yaml", "category": "slot"}, - {"name": "is_or_was_accessioned_through", "path": "modules/slots/is_or_was_accessioned_through.yaml", "category": "slot"}, - {"name": "has_or_had_accumulation", "path": "modules/slots/has_or_had_accumulation.yaml", "category": "slot"}, - {"name": "has_or_had_coordinates", "path": "modules/slots/has_or_had_coordinates.yaml", "category": "slot"}, - {"name": "is_or_was_acquired_through", "path": "modules/slots/is_or_was_acquired_through.yaml", "category": "slot"}, - {"name": "was_acquired_through", "path": "modules/slots/was_acquired_through.yaml", "category": "slot"}, - {"name": 
"has_or_had_method", "path": "modules/slots/has_or_had_method.yaml", "category": "slot"}, + {"name": "RecordCycleStatus", "path": "modules/classes/RecordCycleStatus.yaml", "category": "class"}, + {"name": "DocumentFormat", "path": "modules/classes/DocumentFormat.yaml", "category": "class"}, + {"name": "has_or_had_document", "path": "modules/slots/has_or_had_document.yaml", "category": "slot"}, ] # Define files to remove (archived slots) remove_files = [ - "has_access_application_url", - "has_access_interface_url", - "has_accession_date", - "has_accession_number", - "has_accumulation_end_date", - "has_accumulation_start_date", - "has_accuracy_in_meters", - "has_acquisition_date", - "has_acquisition_history", - "has_acquisition_method", - "has_acquisition_source", - "has_activity_description", - "has_activity_identifier", - "has_activity_name" + "has_articles_archival_stage", + "has_articles_document_format", + "has_articles_document_url", + "has_articles_of_association" ] update_manifest(add_files, remove_files)