# Retrieval Patterns for Heritage Custodian RAG
|
|
|
|
## Overview
|
|
|
|
This document describes hybrid retrieval patterns that combine vector similarity, knowledge graph traversal, and structured queries for the Heritage Custodian domain.
|
|
|
|
## Retrieval Strategy Matrix
|
|
|
|
| Query Type | Primary Strategy | Secondary Strategy | Example Query |
|
|
|------------|-----------------|-------------------|---------------|
|
|
| **Factual** | Vector + KG lookup | SPARQL (Wikidata) | "What is the ISIL code for Rijksmuseum?" |
|
|
| **Exploratory** | Vector (broad) | KG traversal | "What museums exist in Limburg?" |
|
|
| **Relationship** | KG traversal | Vector context | "Which institutions are members of NDE?" |
|
|
| **Comparative** | Multi-vector | KG aggregation | "Compare Dutch and Belgian archive systems" |
|
|
| **Temporal** | KG (events) | Vector context | "How has Noord-Hollands Archief changed?" |
|
|
| **Collection** | Vector (semantic) | KG links | "Find archives with WWII collections" |
|
|
|
|
## 1. Vector Retrieval Patterns
|
|
|
|
### Basic Semantic Search
|
|
|
|
```python
|
|
class SemanticRetriever:
    """Basic vector similarity retrieval over the custodian chunk store."""

    def __init__(self, vector_store):
        self.vector_store = vector_store
        self.embedder = HeritageEmbedder()

    def retrieve(
        self,
        query: str,
        k: int = 10,
        type_filter: Optional[str] = None,
        country_filter: Optional[str] = None,
    ) -> List[RetrievalResult]:
        """Return the top-k chunks most similar to *query*.

        Args:
            query: Free-text search query.
            k: Maximum number of results to return.
            type_filter: Optional custodian type code used both as a
                metadata filter and as a hint to the query embedder.
            country_filter: Optional ISO-style country code filter.

        Returns:
            Retrieval results carrying the raw vector distance in
            ``score`` (lower = closer); callers sort ascending on it.
        """
        # Embed the query; the embedder may condition on the type filter.
        query_embedding = self.embedder.embed_query(query, type_filter)

        # Build the metadata filter only from arguments actually supplied.
        filters = {}
        if type_filter:
            filters["custodian_type"] = type_filter
        if country_filter:
            filters["country_code"] = country_filter

        results = self.vector_store.query(
            query_embedding=query_embedding,
            n_results=k,
            where=filters or None,
        )

        return [
            RetrievalResult(
                text=r["document"],
                metadata=r["metadata"],
                score=r["distance"],
            )
            for r in results
        ]
|
|
```
|
|
|
|
### Filtered Retrieval by Type
|
|
|
|
```python
|
|
class TypeFilteredRetriever:
    """Retrieval with GLAMORCUBESFIXPHDNT type filtering."""

    # Custodian type codes grouped into broad categories.
    TYPE_GROUPS = {
        "CULTURAL": ["G", "M", "A", "L"],  # Core GLAM
        "HERITAGE_SOCIETIES": ["S", "I", "N"],  # Community organizations
        "INSTITUTIONAL": ["O", "R", "E"],  # Government/academic
        "SPECIALIZED": ["B", "H", "T", "F"],  # Domain-specific
        "DIGITAL": ["D"],  # Digital platforms
        "PRIVATE": ["C", "P"],  # Commercial/personal
    }

    def retrieve_by_group(
        self,
        query: str,
        type_group: str,
        k: int = 10,
    ) -> List[RetrievalResult]:
        """Retrieve the top-k results restricted to one TYPE_GROUPS category.

        Returns an empty list for an unknown *type_group* — previously an
        unknown group left ``types`` empty and ``k // len(types)`` raised
        ZeroDivisionError.
        """
        types = self.TYPE_GROUPS.get(type_group, [])
        if not types:
            return []

        # Emulate a multi-type OR filter by querying each type separately.
        per_type = k // len(types) + 1
        all_results = []
        for ctype in types:
            all_results.extend(
                self.semantic_retriever.retrieve(
                    query=query,
                    k=per_type,
                    type_filter=ctype,
                )
            )

        # Re-rank: score is a distance, so ascending = most relevant first.
        all_results.sort(key=lambda r: r.score)
        return all_results[:k]
|
|
```
|
|
|
|
### Multi-Aspect Retrieval
|
|
|
|
Retrieve across different ontology aspects (Name, Place, Collection, etc.):
|
|
|
|
```python
|
|
class MultiAspectRetriever:
    """Run one text query against several ontology-aspect collections."""

    # Aspect name -> backing vector collection.
    ASPECT_COLLECTIONS = {
        "custodian": "custodian_chunks",
        "collection": "collection_chunks",
        "place": "place_chunks",
        "platform": "platform_chunks",
        "project": "project_chunks",
    }

    def retrieve(
        self,
        query: str,
        aspects: List[str] = None,
        k_per_aspect: int = 5,
    ) -> Dict[str, List[RetrievalResult]]:
        """Return a mapping of aspect name to its result list.

        An empty or missing *aspects* argument means "search every known
        aspect"; unknown aspect names are silently skipped.
        """
        aspects = aspects or list(self.ASPECT_COLLECTIONS.keys())

        hits: Dict[str, list] = {}
        for name in aspects:
            target = self.ASPECT_COLLECTIONS.get(name)
            if not target:
                continue  # unknown aspect: skip, as before
            hits[name] = self.vector_store.query(
                collection=target,
                query_embedding=self.embedder.embed_query(query),
                n_results=k_per_aspect,
            )
        return hits
|
|
```
|
|
|
|
## 2. Knowledge Graph Retrieval Patterns
|
|
|
|
### Entity Lookup
|
|
|
|
```python
|
|
class KGEntityRetriever:
    """Retrieve custodian entities from the TypeDB knowledge graph."""

    def __init__(self, typedb_client):
        self.client = typedb_client

    def get_by_identifier(self, scheme: str, value: str) -> Optional[CustodianEntity]:
        """Lookup entity by identifier (ISIL, Wikidata, etc.)."""
        # WARNING: arguments are interpolated directly into the query
        # string; do not pass untrusted input (TypeQL injection risk).
        query = f"""
        match
        $c isa custodian;
        $i isa identifier, has scheme "{scheme}", has value "{value}";
        (custodian: $c, identifier: $i) isa has-identifier;
        get $c;
        """
        return self._first_custodian(query)

    def get_by_ghcid(self, ghcid: str) -> Optional[CustodianEntity]:
        """Lookup entity by GHCID."""
        query = f"""
        match
        $c isa custodian, has ghcid "{ghcid}";
        get $c;
        """
        return self._first_custodian(query)

    def _first_custodian(self, query: str) -> Optional[CustodianEntity]:
        # Shared tail of both lookups (previously duplicated): run the
        # query and parse the first hit, or None when nothing matched.
        results = self.client.query(query)
        if results:
            return self._parse_custodian(results[0])
        return None
|
|
```
|
|
|
|
### Relationship Traversal
|
|
|
|
```python
|
|
class KGRelationshipRetriever:
    """Traverse custodian relationships in the knowledge graph."""

    def get_members_of(self, body_name: str) -> List[CustodianEntity]:
        """Return every custodian that is a member of the named body."""
        return self._execute_and_parse(f"""
        match
        $body isa encompassing-body, has name "{body_name}";
        $c isa custodian;
        (member: $c, body: $body) isa member-of;
        get $c;
        """)

    def get_collections_of(self, custodian_ghcid: str) -> List[Collection]:
        """Return every collection managed by the given custodian."""
        return self._execute_and_parse(f"""
        match
        $c isa custodian, has ghcid "{custodian_ghcid}";
        $coll isa collection;
        (custodian: $c, collection: $coll) isa manages-collection;
        get $coll;
        """)

    def get_related_projects(self, custodian_ghcid: str) -> List[Project]:
        """Return the projects the custodian participated in."""
        return self._execute_and_parse(f"""
        match
        $c isa custodian, has ghcid "{custodian_ghcid}";
        $p isa project;
        (participant: $c, project: $p) isa participated-in-project;
        get $p;
        """)

    def get_change_events(self, custodian_ghcid: str) -> List[ChangeEvent]:
        """Return organizational change events, oldest first."""
        return self._execute_and_parse(f"""
        match
        $c isa custodian, has ghcid "{custodian_ghcid}";
        $e isa change-event;
        (affected: $c, event: $e) isa affected-by-event;
        get $e;
        order by $e.event-date asc;
        """)
|
|
```
|
|
|
|
### Graph-Based Exploration
|
|
|
|
```python
|
|
class KGExplorationRetriever:
    """Explore knowledge graph neighborhoods."""

    def get_neighborhood(
        self,
        entity_ghcid: str,
        depth: int = 2,
        relationship_types: List[str] = None,
    ) -> nx.Graph:
        """Get the neighborhood around an entity as an undirected graph.

        NOTE(review): ``depth`` is currently unused -- the match below only
        covers direct (1-hop) relations; confirm the intended semantics.
        The return annotation previously referenced ``NetworkX.Graph``,
        a name that does not exist (the module is imported as ``nx``).
        """
        rel_filter = ""
        if relationship_types:
            # Restrict to the given relation types (interpolated verbatim;
            # presumably bare TypeQL type labels — confirm quoting rules).
            rel_filter = f"$r type in [{', '.join(relationship_types)}];"

        query = f"""
        match
        $c isa custodian, has ghcid "{entity_ghcid}";
        $c2 isa custodian;
        $r ($c, $c2) isa relation;
        {rel_filter}
        get $c, $c2, $r;
        """

        # Build a graph keyed by GHCID, edges labeled with relation type.
        G = nx.Graph()
        for result in self.client.query(query):
            G.add_edge(
                result["c"]["ghcid"],
                result["c2"]["ghcid"],
                relationship=result["r"]["type"],
            )

        return G

    def find_path(
        self,
        source_ghcid: str,
        target_ghcid: str,
        max_depth: int = 4,
    ) -> Optional[List[str]]:
        """Find the shortest path between two entities, or None."""
        G = self.get_neighborhood(source_ghcid, depth=max_depth)

        try:
            return nx.shortest_path(G, source_ghcid, target_ghcid)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # NodeNotFound: either endpoint is absent from the extracted
            # neighborhood — previously this escaped as an exception.
            return None
|
|
```
|
|
|
|
## 3. SPARQL Federation Patterns
|
|
|
|
### Wikidata Enrichment
|
|
|
|
```python
|
|
class WikidataRetriever:
    """Retrieve from Wikidata via SPARQL."""

    def enrich_custodian(self, wikidata_id: str) -> dict:
        """Fetch supplementary properties for one Wikidata entity."""
        sparql = f"""
        SELECT ?instanceOf ?country ?coords ?website ?viaf ?isni WHERE {{
        wd:{wikidata_id} wdt:P31 ?instanceOf .
        OPTIONAL {{ wd:{wikidata_id} wdt:P17 ?country }}
        OPTIONAL {{ wd:{wikidata_id} wdt:P625 ?coords }}
        OPTIONAL {{ wd:{wikidata_id} wdt:P856 ?website }}
        OPTIONAL {{ wd:{wikidata_id} wdt:P214 ?viaf }}
        OPTIONAL {{ wd:{wikidata_id} wdt:P213 ?isni }}
        }}
        """
        return self._execute_sparql(sparql)

    def find_similar_institutions(
        self,
        instance_of: str,  # e.g., Q33506 (museum)
        country: str,  # e.g., Q55 (Netherlands)
        limit: int = 50,
    ) -> List[dict]:
        """List Wikidata institutions of a given class in a given country."""
        sparql = f"""
        SELECT ?item ?itemLabel ?coords WHERE {{
        ?item wdt:P31 wd:{instance_of} ;
        wdt:P17 wd:{country} .
        OPTIONAL {{ ?item wdt:P625 ?coords }}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl" }}
        }}
        LIMIT {limit}
        """
        return self._execute_sparql(sparql)
|
|
```
|
|
|
|
### Cross-Source Linking
|
|
|
|
```python
|
|
class CrossSourceLinker:
    """Link entities across sources using identifiers."""

    def link_by_isil(self, isil_code: str) -> dict:
        """Resolve one ISIL code against every attached source."""
        local = self.kg_retriever.get_by_identifier("ISIL", isil_code)
        remote = self._search_wikidata_by_isil(isil_code)
        chunks = self.vector_retriever.retrieve(
            query=f"ISIL {isil_code}",
            k=5,
        )
        return {
            "local_kg": local,
            "wikidata": remote,
            "vector_chunks": chunks,
        }

    def _search_wikidata_by_isil(self, isil_code: str) -> Optional[str]:
        """Find the Wikidata QID carrying this ISIL code (property P791)."""
        sparql = f"""
        SELECT ?item WHERE {{
        ?item wdt:P791 "{isil_code}" .
        }}
        """
        hits = wikidata_execute_sparql(sparql)
        if not hits:
            return None
        # Entity URI like .../entity/Q123 -> bare QID.
        return hits[0]["item"]["value"].split("/")[-1]
|
|
```
|
|
|
|
## 4. Hybrid Retrieval Patterns
|
|
|
|
### Query-Adaptive Retrieval
|
|
|
|
```python
|
|
class AdaptiveRetriever:
    """Select a retrieval strategy based on classified query intent."""

    def __init__(self):
        self.query_router = dspy.ChainOfThought(QueryRouter)
        self.semantic_retriever = SemanticRetriever()
        self.kg_retriever = KGEntityRetriever()
        self.sparql_retriever = WikidataRetriever()

    def retrieve(self, query: str, k: int = 10) -> List[RetrievalResult]:
        """Route *query* to the retrieval mix appropriate for its intent.

        Returns at most k deduplicated, ranked results.
        """
        # Classify query intent via the DSPy router.
        intent = self.query_router(query=query).intent

        results = []

        if intent.intent_type == "factual":
            # KG lookup first, vector results as augmentation.
            if intent.entity_mentions:
                for entity in intent.entity_mentions:
                    # NOTE(review): KGEntityRetriever above only defines
                    # get_by_* lookups; confirm search_by_name exists on
                    # the deployed retriever.
                    kg_result = self.kg_retriever.search_by_name(entity)
                    if kg_result:
                        results.append(kg_result)
            results.extend(self.semantic_retriever.retrieve(query, k=k))

        elif intent.intent_type == "exploratory":
            types = intent.custodian_types
            if types:
                # Previously `k // len(types)` raised ZeroDivisionError for
                # an empty type list and could be 0 when len(types) > k.
                per_type = max(1, k // len(types))
                for ctype in types:
                    results.extend(
                        self.semantic_retriever.retrieve(
                            query=query,
                            k=per_type,
                            type_filter=ctype,
                            country_filter=intent.geographic_scope,
                        )
                    )
            else:
                # No type hints: single unfiltered broad search.
                results.extend(
                    self.semantic_retriever.retrieve(
                        query=query,
                        k=k,
                        country_filter=intent.geographic_scope,
                    )
                )

        elif intent.intent_type == "relationship":
            # KG traversal primary, vector results for context.
            for entity in intent.entity_mentions:
                results.extend(self.kg_retriever.get_neighborhood(entity))
            results.extend(self.semantic_retriever.retrieve(query, k=k // 2))

        elif intent.intent_type == "temporal":
            # KG change events plus vector context.
            for entity in intent.entity_mentions:
                results.extend(self.kg_retriever.get_change_events(entity))
            results.extend(self.semantic_retriever.retrieve(query, k=k))

        return self._dedupe_and_rank(results)[:k]
|
|
```
|
|
|
|
### Reciprocal Rank Fusion
|
|
|
|
```python
|
|
class RRFRetriever:
    """Fuse several retrieval strategies with Reciprocal Rank Fusion."""

    def __init__(self, k: int = 60):
        # RRF smoothing constant (60 is the conventional default).
        self.k = k
        self.retrievers = {
            "semantic": SemanticRetriever(),
            "kg": KGEntityRetriever(),
            "sparse": SparseRetriever(),  # BM25 or similar
        }

    def retrieve(self, query: str, n: int = 10) -> List[RetrievalResult]:
        """Return the top-n results ranked by summed RRF score."""
        # doc_id -> {"result": first-seen hit, "scores": {retriever: rank}}
        fused = {}
        for name, retriever in self.retrievers.items():
            ranked = retriever.retrieve(query, k=n * 2)
            for rank, hit in enumerate(ranked):
                doc_id = hit.metadata.get("ghcid") or hash(hit.text)
                entry = fused.setdefault(doc_id, {"result": hit, "scores": {}})
                entry["scores"][name] = rank

        # Score each document: sum of 1/(k + rank) over the lists it
        # appeared in.
        for entry in fused.values():
            entry["rrf_score"] = sum(
                1.0 / (self.k + rank) for rank in entry["scores"].values()
            )

        best = sorted(fused.values(), key=lambda e: e["rrf_score"], reverse=True)
        return [entry["result"] for entry in best[:n]]
|
|
```
|
|
|
|
## 5. Context Aggregation
|
|
|
|
### Multi-Hop Context Building
|
|
|
|
```python
|
|
class ContextBuilder:
    """Build rich LLM context from multiple retrieval results."""

    def build_context(
        self,
        query: str,
        primary_results: List[RetrievalResult],
        max_tokens: int = 4000,
    ) -> str:
        """Concatenate results with source attribution up to a token budget.

        Args:
            query: The user query (currently unused; kept for interface
                stability).
            primary_results: Ranked retrieval results to include, in order.
            max_tokens: Approximate token budget for the final context.
        """
        context_parts = []
        token_count = 0.0

        for result in primary_results:
            text = result.text
            # Rough heuristic: ~1.3 tokens per whitespace-separated word.
            tokens = len(text.split()) * 1.3

            if token_count + tokens > max_tokens:
                break

            # Format with source attribution for downstream grounding.
            source = result.metadata.get("source_type", "unknown")
            tier = result.metadata.get("data_tier", "TIER_4")
            context_parts.append(f"[Source: {source}, Tier: {tier}]\n{text}\n")
            token_count += tokens

        # Append relationship context only when budget remains AND there is
        # something to say — previously an empty "[Relationships]" header
        # could be emitted.
        if token_count < max_tokens * 0.8:
            rel_context = self._get_relationship_context(primary_results)
            if rel_context:
                context_parts.append(f"\n[Relationships]\n{rel_context}")

        return "\n---\n".join(context_parts)

    def _get_relationship_context(self, results: List[RetrievalResult]) -> str:
        """Extract and format relationship information from the KG."""
        relationships = []

        for result in results:
            ghcid = result.metadata.get("ghcid")
            if not ghcid:
                continue
            # NOTE(review): the relationship retriever elsewhere exposes
            # get_members_of(body_name); confirm this kg_retriever really
            # provides get_members_of_body(ghcid).
            members = self.kg_retriever.get_members_of_body(ghcid)
            projects = self.kg_retriever.get_related_projects(ghcid)

            if members:
                relationships.append(
                    f"Members of {ghcid}: {', '.join(m.name for m in members)}"
                )
            if projects:
                relationships.append(f"Projects: {', '.join(p.name for p in projects)}")

        return "\n".join(relationships)
|
|
```
|
|
|
|
## Performance Optimization
|
|
|
|
### Caching Strategy
|
|
|
|
```python
|
|
class CachedRetriever:
    """Retriever with multi-level caching (results, entities, embeddings)."""

    def __init__(self):
        self.query_cache = LRUCache(maxsize=1000)  # Query → results
        self.entity_cache = TTLCache(maxsize=5000, ttl=3600)  # Entity lookups
        self.embedding_cache = DiskCache("embeddings/")  # Persistent embeddings

    def retrieve(self, query: str, **kwargs) -> List[RetrievalResult]:
        """Serve from the query cache when possible, else retrieve and cache."""
        # Key includes kwargs so different filter combinations don't collide.
        cache_key = f"{query}:{json.dumps(kwargs, sort_keys=True)}"
        try:
            return self.query_cache[cache_key]
        except KeyError:
            pass

        # Embedding cache (persistent across runs).
        try:
            embedding = self.embedding_cache[query]
        except KeyError:
            embedding = self.embedder.embed_query(query)
            self.embedding_cache[query] = embedding

        results = self._do_retrieve(embedding, **kwargs)
        self.query_cache[cache_key] = results
        return results
|
|
```
|
|
|
|
### Batch Processing
|
|
|
|
```python
|
|
class BatchRetriever:
    """Efficient batch retrieval for multiple queries."""

    def batch_retrieve(
        self,
        queries: List[str],
        k: int = 10,
    ) -> Dict[str, List[RetrievalResult]]:
        """Embed all queries at once, search in one batch, map query → hits."""
        # One embedding call for the whole batch.
        embeddings = self.embedder.batch_encode(queries)

        # One vector-store round trip; hits[i] belongs to queries[i].
        hits = self.vector_store.batch_query(
            query_embeddings=embeddings,
            n_results=k,
        )

        return {q: hits[i] for i, q in enumerate(queries)}
|
|
```
|