glam/backend/rag/event_retriever.py
kempersc 98c42bf272 Fix LinkML URI conflicts and generate RDF outputs
- Fix scope_note → finding_aid_scope_note in FindingAid.yaml
- Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
- Remove duplicate rico_record_set_type from class_metadata_slots.yaml
- Fix range types for equals_string compatibility (uriorcurie → string)
- Move class names from close_mappings to see_also in 10 RecordSetTypes files
- Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
- Sync schemas to frontend/public/schemas/

Files: 1,151 changed (includes prior CustodianType migration)
2026-01-07 12:32:59 +01:00

393 lines
13 KiB
Python

"""
Heritage Event Retrieval using Hypergraph Patterns
Retrieves organizational change events (mergers, foundings, etc.) using
multi-factor scoring: entity overlap + semantic similarity + temporal relevance.
Based on: docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md
"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Callable, Any
import logging
import numpy as np
logger = logging.getLogger(__name__)
@dataclass
class HeritageEvent:
    """Hyperedge representing a heritage organizational event.

    A single event can connect an arbitrary number of entities at once
    (participants, affected collections, resulting entities), which is why it
    is modelled as a hyperedge rather than a binary relation.
    """
    event_id: str  # Unique identifier (event IRI from SPARQL, or vector-store point id)
    event_type: str  # e.g. MERGER, FOUNDING, CLOSURE — see EventRetriever.retrieve()
    event_date: datetime  # When the organizational change occurred
    participants: dict[str, str]  # role -> GHCID
    description: str  # Free-text description; used for semantic similarity scoring
    affected_collections: list[str] = field(default_factory=list)  # collections touched by the event (presumably GHCIDs — confirm)
    resulting_entities: list[str] = field(default_factory=list)  # entities created by the event (presumably GHCIDs — confirm)
    confidence: float = 1.0  # retrieval confidence (Qdrant score when vector-sourced, else 1.0)
    embedding: Optional[list[float]] = None  # optional precomputed embedding of the description
class EventRetriever:
    """
    Retrieve heritage events using hypergraph patterns.

    Uses multi-factor scoring:
    - Entity overlap (entities mentioned in query match event participants)
    - Semantic similarity (query embedding vs event description)
    - Temporal relevance (how close event date is to query date)
    - Graph connectivity (how connected the event is in the knowledge graph)
    """

    def __init__(
        self,
        oxigraph_query_fn: Callable[[str], list[dict]],
        qdrant_search_fn: Callable[[str, int], list[dict]],
        embed_fn: Callable[[str], list[float]]
    ):
        """
        Args:
            oxigraph_query_fn: Function to execute SPARQL queries.
            qdrant_search_fn: Function to search the Qdrant events collection.
            embed_fn: Function to embed text.
        """
        self.sparql = oxigraph_query_fn
        self.vector_search = qdrant_search_fn
        self.embed = embed_fn

    @staticmethod
    def _escape_sparql_literal(value: str) -> str:
        """Escape backslashes and double quotes so a value can be safely
        interpolated into a double-quoted SPARQL string literal (prevents
        query injection and syntax errors from stray quotes)."""
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def retrieve(
        self,
        query: str,
        query_entities: Optional[list[str]] = None,
        query_time: Optional[datetime] = None,
        event_type: Optional[str] = None,
        limit: int = 10,
        weights: Optional[dict] = None
    ) -> list[tuple[HeritageEvent, float]]:
        """
        Retrieve events using multi-factor scoring.

        Args:
            query: Natural language query.
            query_entities: GHCIDs mentioned in the query.
            query_time: Temporal constraint.
            event_type: Filter by event type (MERGER, FOUNDING, CLOSURE, etc.).
            limit: Max results.
            weights: Scoring weights per factor (keys: entity, semantic,
                temporal, graph). Defaults favour semantic similarity.

        Returns:
            List of (event, score) tuples ordered by descending relevance.
        """
        if weights is None:
            weights = {
                "entity": 0.3,
                "semantic": 0.4,
                "temporal": 0.2,
                "graph": 0.1
            }

        # Phase 1: candidate generation from both indexes.
        candidates: dict[str, HeritageEvent] = {}

        # Entity-based candidates from SPARQL.
        if query_entities:
            candidates.update(self._get_entity_candidates(query_entities, event_type))

        # Semantic candidates from Qdrant; over-fetch so scoring can re-rank.
        candidates.update(self._get_semantic_candidates(query, limit * 2))

        if not candidates:
            logger.info(f"No event candidates found for query: {query}")
            return []

        # Embed the query ONCE; per-candidate scoring reuses this vector
        # instead of re-embedding the identical query for every event.
        try:
            query_embedding: Optional[list[float]] = self.embed(query)
        except Exception as e:
            logger.warning(f"Embedding failed: {e}")
            query_embedding = None  # scoring falls back to neutral semantic score

        # Phase 2: score all candidates.
        scored = [
            (event,
             self._score_event(event, query, query_entities, query_time,
                               weights, query_embedding))
            for event in candidates.values()
        ]

        # Sort and return top-k.
        scored.sort(key=lambda x: x[1], reverse=True)
        return scored[:limit]

    def retrieve_by_type(
        self,
        event_type: str,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        limit: int = 50
    ) -> list[HeritageEvent]:
        """
        Retrieve events of a specific type within a date range.
        Simpler retrieval for structured queries (no scoring).
        """
        date_filter = ""
        if start_date:
            date_filter += f'FILTER(?date >= "{start_date.isoformat()}"^^xsd:date) '
        if end_date:
            date_filter += f'FILTER(?date <= "{end_date.isoformat()}"^^xsd:date) '
        # Escape the interpolated value so a stray quote cannot break the query.
        safe_type = self._escape_sparql_literal(event_type)
        sparql = f"""
        PREFIX hc: <https://nde.nl/ontology/hc/>
        PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
        PREFIX schema: <http://schema.org/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        SELECT ?event ?eventType ?date ?description WHERE {{
            ?event a hc:OrganizationalChangeEvent ;
                hc:eventType ?eventType ;
                hc:eventDate ?date .
            OPTIONAL {{ ?event schema:description ?description }}
            FILTER(?eventType = "{safe_type}")
            {date_filter}
        }}
        ORDER BY ?date
        LIMIT {limit}
        """
        results = self.sparql(sparql)
        events = []
        for row in results:
            raw_date = row.get("date")
            try:
                # Tolerate malformed dates (consistent with _results_to_events)
                # rather than aborting the whole result set.
                event_date = datetime.fromisoformat(raw_date) if raw_date else datetime.now()
            except (ValueError, TypeError):
                event_date = datetime.now()
            events.append(HeritageEvent(
                event_id=row.get("event", ""),
                event_type=row.get("eventType", event_type),
                event_date=event_date,
                participants={},
                description=row.get("description", "")
            ))
        return events

    def _get_entity_candidates(
        self,
        ghcids: list[str],
        event_type: Optional[str] = None
    ) -> dict[str, HeritageEvent]:
        """Get events involving the specified entities via SPARQL.

        Fix: the GHCID list is now actually applied as a participant filter.
        Previously `ghcid_filter` was built but never interpolated into the
        query, so every event in the store was returned regardless of the
        requested entities.
        """
        ghcid_filter = ", ".join(
            f'"{self._escape_sparql_literal(g)}"' for g in ghcids
        )
        if event_type:
            event_type_filter = (
                f'FILTER(?eventType = "{self._escape_sparql_literal(event_type)}")'
            )
        else:
            event_type_filter = ""
        sparql = f"""
        PREFIX hc: <https://nde.nl/ontology/hc/>
        PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
        PREFIX schema: <http://schema.org/>
        SELECT DISTINCT ?event ?eventType ?date ?description ?participant ?role WHERE {{
            ?event a hc:OrganizationalChangeEvent ;
                hc:eventType ?eventType ;
                hc:eventDate ?date .
            OPTIONAL {{ ?event schema:description ?description }}
            # Get participants via CIDOC-CRM (P-properties) or hc: role properties.
            ?event ?role ?participant .
            FILTER(STRSTARTS(STR(?role), "http://www.cidoc-crm.org/cidoc-crm/P") ||
                   STRSTARTS(STR(?role), "https://nde.nl/ontology/hc/"))
            # Restrict to the requested entities. STR() compares the lexical
            # form, matching both literal and IRI participants.
            # NOTE(review): assumes participants are stored as bare GHCID
            # strings — confirm against the triple store's data model.
            FILTER(STR(?participant) IN ({ghcid_filter}))
            {event_type_filter}
        }}
        """
        results = self.sparql(sparql)
        return self._results_to_events(results)

    def _get_semantic_candidates(
        self,
        query: str,
        limit: int
    ) -> dict[str, HeritageEvent]:
        """Get events via semantic similarity (vector search).

        Best-effort: a failing vector backend yields an empty candidate set
        rather than aborting retrieval.
        """
        try:
            results = self.vector_search(query, limit)
        except Exception as e:
            logger.warning(f"Vector search failed: {e}")
            return {}
        events: dict[str, HeritageEvent] = {}
        for r in results:
            payload = r.get("payload", {}) if isinstance(r, dict) else {}
            event_id = r.get("id", str(id(r)))
            try:
                event_date = datetime.fromisoformat(
                    payload.get("event_date", datetime.now().isoformat())
                )
            except (ValueError, TypeError):
                event_date = datetime.now()
            event = HeritageEvent(
                event_id=event_id,
                event_type=payload.get("event_type", "UNKNOWN"),
                event_date=event_date,
                participants=payload.get("participants", {}),
                description=payload.get("description", ""),
                confidence=r.get("score", 0.5),
                # Reuse a stored vector when the payload carries one, so
                # scoring does not have to re-embed the description.
                embedding=payload.get("embedding")
            )
            events[event.event_id] = event
        return events

    def _score_event(
        self,
        event: HeritageEvent,
        query: str,
        query_entities: Optional[list[str]],
        query_time: Optional[datetime],
        weights: dict,
        query_embedding: Optional[list[float]] = None
    ) -> float:
        """Compute the weighted multi-factor relevance score for one event.

        Args:
            query_embedding: Optional precomputed embedding of `query`; when
                omitted the query is embedded on demand (keeps older callers
                working unchanged).
        """
        scores = {}

        # Entity overlap: fraction of query entities participating in the event.
        if query_entities:
            event_entities = set(event.participants.values())
            overlap = len(event_entities.intersection(set(query_entities)))
            scores["entity"] = overlap / max(len(query_entities), 1)
        else:
            scores["entity"] = 0.5  # Neutral

        # Semantic similarity: prefer a stored event embedding, else embed the
        # description; any embedding failure degrades to a neutral score.
        try:
            query_emb = (query_embedding if query_embedding is not None
                         else self.embed(query))
            if event.embedding:
                scores["semantic"] = self._cosine_similarity(query_emb, event.embedding)
            elif event.description:
                desc_emb = self.embed(event.description)
                scores["semantic"] = self._cosine_similarity(query_emb, desc_emb)
            else:
                scores["semantic"] = 0.5
        except Exception as e:
            logger.warning(f"Embedding failed: {e}")
            scores["semantic"] = 0.5

        # Temporal relevance: decays with the distance in years from the query time.
        if query_time and event.event_date:
            days_diff = abs((query_time - event.event_date).days)
            scores["temporal"] = 1.0 / (1.0 + days_diff / 365.0)
        else:
            scores["temporal"] = 0.5  # Neutral

        # Graph connectivity (placeholder - would use SPARQL for full implementation)
        scores["graph"] = 0.5

        # Weighted sum; factors missing from `scores` default to neutral 0.5.
        return sum(weights.get(k, 0) * scores.get(k, 0.5) for k in weights)

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Compute cosine similarity between two vectors (0.0 if either is zero)."""
        a_np = np.array(a)
        b_np = np.array(b)
        norm_product = np.linalg.norm(a_np) * np.linalg.norm(b_np)
        if norm_product == 0:
            return 0.0
        return float(np.dot(a_np, b_np) / norm_product)

    def _results_to_events(self, results: list[dict]) -> dict[str, HeritageEvent]:
        """Convert flattened SPARQL result rows to HeritageEvent objects.

        Rows are grouped by the ?event binding; one row per (event, role,
        participant) combination is expected.
        """
        by_event: dict[str, dict[str, Any]] = {}
        for row in results:
            event_id = row.get("event")
            if not event_id:
                continue
            if event_id not in by_event:
                by_event[event_id] = {
                    "event_type": row.get("eventType", "UNKNOWN"),
                    "date": row.get("date"),
                    "description": row.get("description", ""),
                    "participants": {}
                }
            role = row.get("role", "")
            if "/" in role:
                role = role.split("/")[-1]  # Extract the local role name from the URI
            participant = row.get("participant")
            if role and participant:
                by_event[event_id]["participants"][role] = participant

        # Convert the grouped rows to HeritageEvent objects.
        events: dict[str, HeritageEvent] = {}
        for event_id, data in by_event.items():
            try:
                event_date = datetime.fromisoformat(data["date"]) if data["date"] else datetime.now()
            except (ValueError, TypeError):
                # Malformed date: fall back to "now" rather than dropping the event.
                event_date = datetime.now()
            events[event_id] = HeritageEvent(
                event_id=event_id,
                event_type=data["event_type"],
                event_date=event_date,
                participants=data["participants"],
                description=data["description"]
            )
        return events
# Factory function for creating EventRetriever with default dependencies
def create_event_retriever(
    oxigraph_endpoint: str = "http://localhost:7878/query",
    qdrant_collection: str = "heritage_events"
) -> EventRetriever:
    """
    Create EventRetriever with standard GLAM dependencies.

    This is a convenience factory that wires up the retriever with
    default Oxigraph and Qdrant connections.

    Args:
        oxigraph_endpoint: SPARQL query endpoint URL.
        qdrant_collection: Name of the Qdrant events collection
            (currently unused — Qdrant search is still a placeholder).

    Returns:
        A configured EventRetriever.
    """
    # Import here to avoid circular dependencies
    import hashlib
    import requests

    def sparql_query(query: str) -> list[dict]:
        """Execute a SPARQL query against Oxigraph and flatten the bindings."""
        response = requests.post(
            oxigraph_endpoint,
            data=query,
            headers={
                "Content-Type": "application/sparql-query",
                # Standard result media type per the SPARQL 1.1 Protocol
                # (was the non-standard "application/json").
                "Accept": "application/sparql-results+json"
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
        # Convert SPARQL JSON bindings to simple {var: value} rows.
        results = []
        for binding in data.get("results", {}).get("bindings", []):
            results.append({key: val.get("value") for key, val in binding.items()})
        return results

    def qdrant_search(query: str, limit: int) -> list[dict]:
        """Search Qdrant events collection."""
        # Placeholder - would use actual Qdrant client
        logger.warning("Qdrant search not implemented - using empty results")
        return []

    def embed(text: str) -> list[float]:
        """Embed text using default embedding model."""
        # Placeholder - would use actual embedding model. The RNG is seeded
        # from a stable hash of the text so identical inputs always produce
        # identical vectors: the previous unseeded version returned a fresh
        # random vector on every call, which made cosine-similarity scoring
        # nondeterministic and meaningless.
        logger.warning("Embedding not implemented - using deterministic pseudo-random vector")
        seed = int.from_bytes(hashlib.sha256(text.encode("utf-8")).digest()[:8], "big")
        return list(np.random.default_rng(seed).standard_normal(384))

    return EventRetriever(
        oxigraph_query_fn=sparql_query,
        qdrant_search_fn=qdrant_search,
        embed_fn=embed
    )