# Changelog (from prior commit message):
# - Fix scope_note -> finding_aid_scope_note in FindingAid.yaml
# - Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
# - Remove duplicate rico_record_set_type from class_metadata_slots.yaml
# - Fix range types for equals_string compatibility (uriorcurie -> string)
# - Move class names from close_mappings to see_also in 10 RecordSetTypes files
# - Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
# - Sync schemas to frontend/public/schemas/
# Files: 1,151 changed (includes prior CustodianType migration)
"""
|
|
Heritage Event Retrieval using Hypergraph Patterns
|
|
|
|
Retrieves organizational change events (mergers, foundings, etc.) using
|
|
multi-factor scoring: entity overlap + semantic similarity + temporal relevance.
|
|
|
|
Based on: docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import Optional, Callable, Any
|
|
import logging
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class HeritageEvent:
|
|
"""Hyperedge representing a heritage organizational event."""
|
|
event_id: str
|
|
event_type: str
|
|
event_date: datetime
|
|
participants: dict[str, str] # role -> GHCID
|
|
description: str
|
|
affected_collections: list[str] = field(default_factory=list)
|
|
resulting_entities: list[str] = field(default_factory=list)
|
|
confidence: float = 1.0
|
|
embedding: Optional[list[float]] = None
|
|
|
|
|
|
class EventRetriever:
    """
    Retrieve heritage events using hypergraph patterns.

    Uses multi-factor scoring:
    - Entity overlap (entities mentioned in query match event participants)
    - Semantic similarity (query embedding vs event description)
    - Temporal relevance (how close event date is to query date)
    - Graph connectivity (how connected the event is in the knowledge graph)
    """

    def __init__(
        self,
        oxigraph_query_fn: Callable[[str], list[dict]],
        qdrant_search_fn: Callable[[str, int], list[dict]],
        embed_fn: Callable[[str], list[float]]
    ):
        """
        Args:
            oxigraph_query_fn: Function to execute SPARQL queries
            qdrant_search_fn: Function to search Qdrant events collection
            embed_fn: Function to embed text
        """
        self.sparql = oxigraph_query_fn
        self.vector_search = qdrant_search_fn
        self.embed = embed_fn

    def retrieve(
        self,
        query: str,
        query_entities: Optional[list[str]] = None,
        query_time: Optional[datetime] = None,
        event_type: Optional[str] = None,
        limit: int = 10,
        weights: Optional[dict] = None
    ) -> list[tuple[HeritageEvent, float]]:
        """
        Retrieve events using multi-factor scoring.

        Args:
            query: Natural language query
            query_entities: GHCIDs mentioned in query
            query_time: Temporal constraint
            event_type: Filter by event type (MERGER, FOUNDING, CLOSURE, etc.)
            limit: Max results
            weights: Scoring weights for each factor (entity/semantic/
                temporal/graph); defaults sum to 1.0

        Returns:
            List of (event, score) tuples ordered by relevance
        """
        if weights is None:
            weights = {
                "entity": 0.3,
                "semantic": 0.4,
                "temporal": 0.2,
                "graph": 0.1
            }

        # Phase 1: Candidate generation
        candidates: dict[str, HeritageEvent] = {}

        # Entity-based candidates from SPARQL
        if query_entities:
            candidates.update(self._get_entity_candidates(query_entities, event_type))

        # Semantic candidates from Qdrant; over-fetch so scoring can reorder.
        candidates.update(self._get_semantic_candidates(query, limit * 2))

        if not candidates:
            logger.info(f"No event candidates found for query: {query}")
            return []

        # Embed the query ONCE here; previously _score_event re-embedded the
        # same query for every candidate.
        try:
            query_emb = self.embed(query)
        except Exception as e:
            logger.warning(f"Embedding failed: {e}")
            query_emb = None

        # Phase 2: Score all candidates
        scored = [
            (event, self._score_event(
                event, query, query_entities, query_time, weights, query_emb
            ))
            for event in candidates.values()
        ]

        # Sort and return top-k
        scored.sort(key=lambda x: x[1], reverse=True)
        return scored[:limit]

    def retrieve_by_type(
        self,
        event_type: str,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        limit: int = 50
    ) -> list[HeritageEvent]:
        """
        Retrieve events of a specific type within a date range.

        Simpler retrieval for structured queries (no scoring).

        Args:
            event_type: Exact event type to match (e.g. MERGER)
            start_date: Inclusive lower bound on event date, if any
            end_date: Inclusive upper bound on event date, if any
            limit: Max results

        Returns:
            Events ordered by date, with empty participants.
        """
        date_filter = ""
        if start_date:
            date_filter += f'FILTER(?date >= "{start_date.isoformat()}"^^xsd:date) '
        if end_date:
            date_filter += f'FILTER(?date <= "{end_date.isoformat()}"^^xsd:date) '

        # NOTE(review): event_type is interpolated directly into the query
        # string — callers must pass trusted values (SPARQL injection risk
        # otherwise).
        sparql = f"""
        PREFIX hc: <https://nde.nl/ontology/hc/>
        PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
        PREFIX schema: <http://schema.org/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

        SELECT ?event ?eventType ?date ?description WHERE {{
            ?event a hc:OrganizationalChangeEvent ;
                   hc:eventType ?eventType ;
                   hc:eventDate ?date .
            OPTIONAL {{ ?event schema:description ?description }}

            FILTER(?eventType = "{event_type}")
            {date_filter}
        }}
        ORDER BY ?date
        LIMIT {limit}
        """

        results = self.sparql(sparql)
        events = []

        for row in results:
            # Be tolerant of missing/malformed dates (consistent with
            # _results_to_events, which already guarded this parse).
            raw_date = row.get("date")
            try:
                event_date = datetime.fromisoformat(raw_date) if raw_date else datetime.now()
            except (ValueError, TypeError):
                event_date = datetime.now()

            events.append(HeritageEvent(
                event_id=row.get("event", ""),
                event_type=row.get("eventType", event_type),
                event_date=event_date,
                participants={},
                description=row.get("description", "")
            ))

        return events

    def _get_entity_candidates(
        self,
        ghcids: list[str],
        event_type: Optional[str] = None
    ) -> dict[str, HeritageEvent]:
        """Get events involving the specified entities via SPARQL.

        Args:
            ghcids: Entity identifiers the events must involve.
            event_type: Optional exact event-type restriction.

        Returns:
            Mapping of event IRI -> HeritageEvent with participants filled in.
        """
        ghcid_filter = ", ".join(f'"{g}"' for g in ghcids)
        event_type_filter = f'FILTER(?eventType = "{event_type}")' if event_type else ""

        # BUGFIX: ghcid_filter was previously built but never interpolated
        # into the query, so this method returned ALL events rather than
        # events involving the requested entities. The FILTER below applies
        # it. Assumes participants are stored as literal GHCIDs matching the
        # quoted strings — TODO confirm against the data model.
        sparql = f"""
        PREFIX hc: <https://nde.nl/ontology/hc/>
        PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
        PREFIX schema: <http://schema.org/>

        SELECT DISTINCT ?event ?eventType ?date ?description ?participant ?role WHERE {{
            ?event a hc:OrganizationalChangeEvent ;
                   hc:eventType ?eventType ;
                   hc:eventDate ?date .
            OPTIONAL {{ ?event schema:description ?description }}

            # Get participants
            ?event ?role ?participant .
            FILTER(STRSTARTS(STR(?role), "http://www.cidoc-crm.org/cidoc-crm/P") ||
                   STRSTARTS(STR(?role), "https://nde.nl/ontology/hc/"))

            FILTER(STR(?participant) IN ({ghcid_filter}))
            {event_type_filter}
        }}
        """

        results = self.sparql(sparql)
        return self._results_to_events(results)

    def _get_semantic_candidates(
        self,
        query: str,
        limit: int
    ) -> dict[str, HeritageEvent]:
        """Get events via semantic similarity against the vector store.

        Returns an empty dict (with a warning) if the search backend fails.
        """
        try:
            results = self.vector_search(query, limit)
        except Exception as e:
            logger.warning(f"Vector search failed: {e}")
            return {}

        events: dict[str, HeritageEvent] = {}
        for r in results:
            # BUGFIX: the old code guarded isinstance(r, dict) only for the
            # payload, then called r.get("id") unconditionally — crashing on
            # non-dict results. Guard all dict accesses together.
            if isinstance(r, dict):
                payload = r.get("payload", {})
                event_id = r.get("id", str(id(r)))
                confidence = r.get("score", 0.5)
            else:
                payload = {}
                event_id = str(id(r))
                confidence = 0.5

            try:
                event_date = datetime.fromisoformat(
                    payload.get("event_date", datetime.now().isoformat())
                )
            except (ValueError, TypeError):
                event_date = datetime.now()

            events[event_id] = HeritageEvent(
                event_id=event_id,
                event_type=payload.get("event_type", "UNKNOWN"),
                event_date=event_date,
                participants=payload.get("participants", {}),
                description=payload.get("description", ""),
                confidence=confidence
            )

        return events

    def _score_event(
        self,
        event: HeritageEvent,
        query: str,
        query_entities: Optional[list[str]],
        query_time: Optional[datetime],
        weights: dict,
        query_emb: Optional[list[float]] = None
    ) -> float:
        """Compute the multi-factor relevance score for one event.

        Args:
            query_emb: Precomputed query embedding; when None the query is
                embedded here (kept for backward compatibility).

        Returns:
            Weighted sum of the entity/semantic/temporal/graph factors.
        """
        scores = {}

        # Entity overlap: fraction of query entities appearing as participants.
        if query_entities:
            event_entities = set(event.participants.values())
            overlap = len(event_entities.intersection(set(query_entities)))
            scores["entity"] = overlap / max(len(query_entities), 1)
        else:
            scores["entity"] = 0.5  # Neutral

        # Semantic similarity
        try:
            if query_emb is None:
                query_emb = self.embed(query)
            if event.embedding:
                scores["semantic"] = self._cosine_similarity(query_emb, event.embedding)
            elif event.description:
                desc_emb = self.embed(event.description)
                scores["semantic"] = self._cosine_similarity(query_emb, desc_emb)
            else:
                scores["semantic"] = 0.5
        except Exception as e:
            logger.warning(f"Embedding failed: {e}")
            scores["semantic"] = 0.5

        # Temporal relevance: decays with distance in years from the query time.
        if query_time and event.event_date:
            days_diff = abs((query_time - event.event_date).days)
            scores["temporal"] = 1.0 / (1.0 + days_diff / 365.0)
        else:
            scores["temporal"] = 0.5  # Neutral

        # Graph connectivity (placeholder - would use SPARQL for full implementation)
        scores["graph"] = 0.5

        # Weighted sum
        final_score = sum(weights.get(k, 0) * scores.get(k, 0.5) for k in weights)
        return final_score

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Compute cosine similarity between two vectors (0.0 if either is zero)."""
        a_np = np.array(a)
        b_np = np.array(b)
        norm_product = np.linalg.norm(a_np) * np.linalg.norm(b_np)
        if norm_product == 0:
            return 0.0
        return float(np.dot(a_np, b_np) / norm_product)

    def _results_to_events(self, results: list[dict]) -> dict[str, HeritageEvent]:
        """Convert flat SPARQL result rows into HeritageEvent objects.

        Rows sharing an event IRI are grouped so that each (role, participant)
        pair lands on the same event.
        """
        events: dict[str, HeritageEvent] = {}

        # Group by event ID
        by_event: dict[str, dict[str, Any]] = {}
        for row in results:
            event_id = row.get("event")
            if not event_id:
                continue

            if event_id not in by_event:
                by_event[event_id] = {
                    "event_type": row.get("eventType", "UNKNOWN"),
                    "date": row.get("date"),
                    "description": row.get("description", ""),
                    "participants": {}
                }

            role = row.get("role", "")
            if "/" in role:
                role = role.split("/")[-1]  # Extract role from URI
            participant = row.get("participant")
            if role and participant:
                by_event[event_id]["participants"][role] = participant

        # Convert to HeritageEvent objects
        for event_id, data in by_event.items():
            try:
                event_date = datetime.fromisoformat(data["date"]) if data["date"] else datetime.now()
            except (ValueError, TypeError):
                event_date = datetime.now()

            events[event_id] = HeritageEvent(
                event_id=event_id,
                event_type=data["event_type"],
                event_date=event_date,
                participants=data["participants"],
                description=data["description"]
            )

        return events
|
|
|
|
|
|
# Factory function for creating EventRetriever with default dependencies
|
|
def create_event_retriever(
    oxigraph_endpoint: str = "http://localhost:7878/query",
    qdrant_collection: str = "heritage_events"
) -> EventRetriever:
    """
    Build an EventRetriever wired to the standard GLAM backends.

    Convenience factory: supplies the three callables the retriever needs,
    backed by the default Oxigraph endpoint and Qdrant collection.
    """
    # Import here to avoid circular dependencies
    import requests

    def sparql_query(query: str) -> list[dict]:
        """Execute SPARQL query against Oxigraph."""
        resp = requests.post(
            oxigraph_endpoint,
            data=query,
            headers={
                "Content-Type": "application/sparql-query",
                "Accept": "application/json"
            },
            timeout=30
        )
        resp.raise_for_status()
        bindings = resp.json().get("results", {}).get("bindings", [])
        # Flatten each SPARQL binding into a plain {variable: value} dict.
        return [
            {name: cell.get("value") for name, cell in binding.items()}
            for binding in bindings
        ]

    def qdrant_search(query: str, limit: int) -> list[dict]:
        """Search Qdrant events collection."""
        # Placeholder - would use actual Qdrant client
        logger.warning("Qdrant search not implemented - using empty results")
        return []

    def embed(text: str) -> list[float]:
        """Embed text using default embedding model."""
        # Placeholder - would use actual embedding model
        logger.warning("Embedding not implemented - using random vector")
        return list(np.random.randn(384))

    return EventRetriever(
        oxigraph_query_fn=sparql_query,
        qdrant_search_fn=qdrant_search,
        embed_fn=embed
    )
|