"""
Heritage Event Retrieval using Hypergraph Patterns

Retrieves organizational change events (mergers, foundings, etc.) using
multi-factor scoring: entity overlap + semantic similarity + temporal relevance.

Based on: docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md
"""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Callable, Any
import logging

import numpy as np

logger = logging.getLogger(__name__)

# Shared PREFIX header for every SPARQL query built in this module.
# The hc/crm IRIs are the same namespaces the participant-role filter below
# tests for; xsd is the standard W3C namespace.
# NOTE(review): the schema: IRI is assumed to be http://schema.org/ -- confirm
# against the triples actually stored (https://schema.org/ is also in use).
_SPARQL_PREFIXES = (
    "PREFIX hc: <https://nde.nl/ontology/hc/>\n"
    "PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n"
    "PREFIX schema: <http://schema.org/>\n"
    "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>"
)

# Default contribution of each scoring factor in EventRetriever.retrieve().
_DEFAULT_WEIGHTS = {
    "entity": 0.3,
    "semantic": 0.4,
    "temporal": 0.2,
    "graph": 0.1,
}


def _sparql_string(value: Any) -> str:
    """Return *value* as a double-quoted SPARQL string literal.

    Backslashes, quotes and line breaks are escaped so caller-supplied text
    cannot terminate the literal and inject query syntax.
    """
    escaped = (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def _xsd_date(value: datetime) -> str:
    """Return the ``YYYY-MM-DD`` lexical form required by an xsd:date literal."""
    # datetime.isoformat() would append a time part, which is not a valid
    # xsd:date and makes the FILTER comparison fail.
    day = value.date() if isinstance(value, datetime) else value
    return day.isoformat()


def _parse_event_date(raw: Any) -> datetime:
    """Parse an ISO-8601 date/datetime value, falling back to the current time.

    A missing value silently yields ``datetime.now()``; a value that is
    present but cannot be parsed does the same after logging a warning.
    """
    if isinstance(raw, datetime):
        return raw
    if raw is None or raw == "":
        return datetime.now()
    if isinstance(raw, str):
        text = raw.strip()
        attempts = [text]
        if text[-1:] in ("Z", "z"):
            # Older fromisoformat() implementations reject the "Z" suffix.
            attempts.append(text[:-1] + "+00:00")
        # Last resort: the bare date, e.g. for "2001-01-01Z".
        attempts.append(text[:10])
        for candidate in attempts:
            try:
                return datetime.fromisoformat(candidate)
            except ValueError:
                continue
    logger.warning("Unparseable event date %r; using current time", raw)
    return datetime.now()


def _days_between(first: datetime, second: datetime) -> int:
    """Return the absolute number of whole days between two datetimes.

    Subtracting a timezone-aware datetime from a naive one raises TypeError,
    so when exactly one side carries an offset both are compared as naive
    wall-clock values (at most about a day of error).
    """
    if (first.utcoffset() is None) != (second.utcoffset() is None):
        first = first.replace(tzinfo=None)
        second = second.replace(tzinfo=None)
    return abs((first - second).days)


def _hit_field(hit: Any, name: str) -> Any:
    """Read *name* from a vector-search hit given as a dict or as an object."""
    if isinstance(hit, dict):
        return hit.get(name)
    return getattr(hit, name, None)


@dataclass
class HeritageEvent:
    """Hyperedge representing a heritage organizational event."""

    event_id: str
    event_type: str
    event_date: datetime
    participants: dict[str, str]  # role -> GHCID
    description: str
    affected_collections: list[str] = field(default_factory=list)
    resulting_entities: list[str] = field(default_factory=list)
    confidence: float = 1.0
    embedding: Optional[list[float]] = None


class EventRetriever:
    """
    Retrieve heritage events using hypergraph patterns.

    Uses multi-factor scoring:
    - Entity overlap (entities mentioned in query match event participants)
    - Semantic similarity (query embedding vs event description)
    - Temporal relevance (how close event date is to query date)
    - Graph connectivity (how connected the event is in the knowledge graph)
    """

    def __init__(
        self,
        oxigraph_query_fn: Callable[[str], list[dict]],
        qdrant_search_fn: Callable[[str, int], list[dict]],
        embed_fn: Callable[[str], list[float]]
    ):
        """
        Args:
            oxigraph_query_fn: Function to execute SPARQL queries
            qdrant_search_fn: Function to search Qdrant events collection
            embed_fn: Function to embed text
        """
        self.sparql = oxigraph_query_fn
        self.vector_search = qdrant_search_fn
        self.embed = embed_fn

    def retrieve(
        self,
        query: str,
        query_entities: Optional[list[str]] = None,
        query_time: Optional[datetime] = None,
        event_type: Optional[str] = None,
        limit: int = 10,
        weights: Optional[dict] = None
    ) -> list[tuple[HeritageEvent, float]]:
        """
        Retrieve events using multi-factor scoring.

        Args:
            query: Natural language query
            query_entities: GHCIDs mentioned in query
            query_time: Temporal constraint
            event_type: Filter by event type (MERGER, FOUNDING, CLOSURE, etc.)
            limit: Max results
            weights: Scoring weights for each factor

        Returns:
            List of (event, score) tuples ordered by relevance

        Raises:
            Whatever the SPARQL callable raises; vector-search and embedding
            failures are logged and degrade to neutral values instead.
        """
        if weights is None:
            weights = dict(_DEFAULT_WEIGHTS)

        # Phase 1: Candidate generation
        candidates: dict[str, HeritageEvent] = {}

        # Entity-based candidates from SPARQL
        if query_entities:
            candidates.update(
                self._get_entity_candidates(query_entities, event_type)
            )

        # Semantic candidates from Qdrant. A SPARQL candidate with the same
        # id is kept: it carries the participants the entity score relies on.
        vector_candidates = self._get_semantic_candidates(query, limit * 2)
        for event_id, event in vector_candidates.items():
            candidates.setdefault(event_id, event)

        # The SPARQL query already filters on type; apply the same
        # constraint to the semantic hits so the filter holds for all results.
        if event_type:
            candidates = {
                event_id: event
                for event_id, event in candidates.items()
                if event.event_type == event_type
            }

        if not candidates:
            logger.info("No event candidates found for query: %s", query)
            return []

        # Phase 2: Score all candidates. The query is embedded once here
        # rather than once per candidate.
        try:
            query_embedding = self.embed(query)
        except Exception as e:  # embed_fn is injected; failure modes unknown
            logger.warning("Embedding failed: %s", e)
            query_embedding = []  # empty -> neutral semantic score, no retries

        scored = [
            (
                event,
                self._score_event(
                    event, query, query_entities, query_time, weights,
                    query_embedding
                ),
            )
            for event in candidates.values()
        ]

        # Sort and return top-k
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:limit]

    def retrieve_by_type(
        self,
        event_type: str,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        limit: int = 50
    ) -> list[HeritageEvent]:
        """
        Retrieve events of a specific type within a date range.

        Simpler retrieval for structured queries (no scoring).

        Args:
            event_type: Event type the results must match exactly
            start_date: Inclusive lower bound on the event date
            end_date: Inclusive upper bound on the event date
            limit: Max results

        Returns:
            Events ordered by date; participants are not populated.
        """
        date_filter = ""
        if start_date:
            date_filter += (
                f'FILTER(?date >= "{_xsd_date(start_date)}"^^xsd:date) '
            )
        if end_date:
            date_filter += (
                f'FILTER(?date <= "{_xsd_date(end_date)}"^^xsd:date) '
            )

        sparql = f"""
{_SPARQL_PREFIXES}

SELECT ?event ?eventType ?date ?description WHERE {{
    ?event a hc:OrganizationalChangeEvent ;
           hc:eventType ?eventType ;
           hc:eventDate ?date .
    OPTIONAL {{ ?event schema:description ?description }}
    FILTER(?eventType = {_sparql_string(event_type)})
    {date_filter}
}}
ORDER BY ?date
LIMIT {int(limit)}
"""

        results = self.sparql(sparql)

        return [
            HeritageEvent(
                event_id=row.get("event", ""),
                event_type=row.get("eventType", event_type),
                event_date=_parse_event_date(row.get("date")),
                participants={},
                description=row.get("description") or ""
            )
            for row in results
        ]

    def _get_entity_candidates(
        self,
        ghcids: list[str],
        event_type: Optional[str] = None
    ) -> dict[str, HeritageEvent]:
        """Get events involving specified entities via SPARQL.

        Returns a mapping of event id to event; every participant of a
        matching event is included, not only the requested entities.
        """
        if not ghcids:
            return {}

        ghcid_filter = ", ".join(_sparql_string(g) for g in ghcids)
        event_type_filter = (
            f"FILTER(?eventType = {_sparql_string(event_type)})"
            if event_type else ""
        )

        # NOTE(review): the entity restriction compares STR(value) with the
        # GHCIDs, i.e. it assumes participant values equal the GHCID strings
        # (as the scoring code does) -- confirm against the stored data.
        sparql = f"""
{_SPARQL_PREFIXES}

SELECT DISTINCT ?event ?eventType ?date ?description ?participant ?role WHERE {{
    ?event a hc:OrganizationalChangeEvent ;
           hc:eventType ?eventType ;
           hc:eventDate ?date .
    OPTIONAL {{ ?event schema:description ?description }}

    # Keep only events involving at least one requested entity
    ?event ?matchedRole ?matchedEntity .
    FILTER(STR(?matchedEntity) IN ({ghcid_filter}))

    # Get participants
    ?event ?role ?participant .
    FILTER(STRSTARTS(STR(?role), "http://www.cidoc-crm.org/cidoc-crm/P") ||
           STRSTARTS(STR(?role), "https://nde.nl/ontology/hc/"))
    {event_type_filter}
}}
"""

        results = self.sparql(sparql)
        return self._results_to_events(results)

    def _get_semantic_candidates(
        self,
        query: str,
        limit: int
    ) -> dict[str, HeritageEvent]:
        """Get events via semantic similarity.

        Best effort: a failing vector search is logged and yields no
        candidates. The hit score is stored as the event's confidence.
        """
        try:
            results = self.vector_search(query, limit)
        except Exception as e:  # search fn is injected; failure modes unknown
            logger.warning("Vector search failed: %s", e)
            return {}

        events = {}
        for hit in results or []:
            payload = _hit_field(hit, "payload") or {}
            raw_id = _hit_field(hit, "id")
            # Fall back to the object identity when the hit carries no id.
            event_id = str(raw_id) if raw_id is not None else str(id(hit))
            score = _hit_field(hit, "score")

            events[event_id] = HeritageEvent(
                event_id=event_id,
                event_type=payload.get("event_type", "UNKNOWN"),
                event_date=_parse_event_date(payload.get("event_date")),
                participants=payload.get("participants") or {},
                description=payload.get("description") or "",
                confidence=0.5 if score is None else score
            )

        return events

    def _score_event(
        self,
        event: HeritageEvent,
        query: str,
        query_entities: Optional[list[str]],
        query_time: Optional[datetime],
        weights: dict,
        query_embedding: Optional[list[float]] = None
    ) -> float:
        """Compute multi-factor relevance score.

        Args:
            event: Candidate event
            query: Natural language query
            query_entities: GHCIDs mentioned in query
            query_time: Temporal constraint
            weights: Weight per factor name; unknown factors score 0.5
            query_embedding: Precomputed embedding of ``query``. ``None``
                embeds the query here; an empty sequence means no embedding
                is available and gives a neutral semantic score.

        Returns:
            Weighted sum of the factor scores.
        """
        scores = {}

        # Entity overlap: share of the distinct query entities that appear
        # among the event participants
        if query_entities:
            wanted = set(query_entities)
            overlap = len(wanted.intersection(event.participants.values()))
            scores["entity"] = overlap / len(wanted)
        else:
            scores["entity"] = 0.5  # Neutral

        # Semantic similarity
        scores["semantic"] = self._semantic_score(query, event, query_embedding)

        # Temporal relevance: 1.0 on the same day, 0.5 one year apart
        if query_time and event.event_date:
            days_diff = _days_between(query_time, event.event_date)
            scores["temporal"] = 1.0 / (1.0 + days_diff / 365.0)
        else:
            scores["temporal"] = 0.5  # Neutral

        # Graph connectivity (placeholder - would use SPARQL for full implementation)
        scores["graph"] = 0.5

        # Weighted sum
        return sum(weights.get(k, 0) * scores.get(k, 0.5) for k in weights)

    def _semantic_score(
        self,
        query: str,
        event: HeritageEvent,
        query_embedding: Optional[list[float]]
    ) -> float:
        """Return query/event cosine similarity, or 0.5 when not computable."""
        try:
            if query_embedding is None:
                query_embedding = self.embed(query)
            # len() rather than truthiness: embeddings may be numpy arrays.
            if len(query_embedding) == 0:
                return 0.5
            if event.embedding is not None and len(event.embedding) > 0:
                return self._cosine_similarity(query_embedding, event.embedding)
            if event.description:
                desc_emb = self.embed(event.description)
                return self._cosine_similarity(query_embedding, desc_emb)
            return 0.5
        except Exception as e:  # embed_fn is injected; failure modes unknown
            logger.warning("Embedding failed: %s", e)
            return 0.5

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Compute cosine similarity between two vectors.

        Returns 0.0 when either vector has zero norm.
        """
        a_np = np.asarray(a, dtype=float)
        b_np = np.asarray(b, dtype=float)

        norm_product = np.linalg.norm(a_np) * np.linalg.norm(b_np)
        if norm_product == 0:
            return 0.0

        return float(np.dot(a_np, b_np) / norm_product)

    def _results_to_events(self, results: list[dict]) -> dict[str, HeritageEvent]:
        """Convert SPARQL results to HeritageEvent objects.

        Rows are grouped by event id, one row per (role, participant) pair.
        When several participants share a role, later ones are stored under
        ``role#2``, ``role#3``, ... so none of them is lost.
        """
        # Group by event ID
        by_event: dict[str, dict[str, Any]] = {}

        for row in results:
            event_id = row.get("event")
            if not event_id:
                continue

            if event_id not in by_event:
                by_event[event_id] = {
                    "event_type": row.get("eventType", "UNKNOWN"),
                    "date": row.get("date"),
                    "description": row.get("description") or "",
                    "participants": {},
                    "seen": set(),
                }
            record = by_event[event_id]

            role = row.get("role") or ""
            if "/" in role:
                role = role.rsplit("/", 1)[-1]  # Extract role from URI
            participant = row.get("participant")
            if not (role and participant):
                continue
            if (role, participant) in record["seen"]:
                continue  # duplicate row for an already recorded pair
            record["seen"].add((role, participant))

            participants = record["participants"]
            key = role
            suffix = 2
            while key in participants:
                key = f"{role}#{suffix}"
                suffix += 1
            participants[key] = participant

        # Convert to HeritageEvent objects
        return {
            event_id: HeritageEvent(
                event_id=event_id,
                event_type=data["event_type"],
                event_date=_parse_event_date(data["date"]),
                participants=data["participants"],
                description=data["description"]
            )
            for event_id, data in by_event.items()
        }


# Factory function for creating EventRetriever with default dependencies
def create_event_retriever(
    oxigraph_endpoint: str = "http://localhost:7878/query",
    qdrant_collection: str = "heritage_events"
) -> EventRetriever:
    """
    Create EventRetriever with standard GLAM dependencies.

    This is a convenience factory that wires up the retriever with
    default Oxigraph and Qdrant connections.

    Args:
        oxigraph_endpoint: URL that SPARQL queries are POSTed to
        qdrant_collection: Currently unused; the Qdrant search below is a
            placeholder that always returns no results
    """
    # Import here to avoid circular dependencies
    import requests

    def sparql_query(query: str) -> list[dict]:
        """Execute SPARQL query against Oxigraph."""
        response = requests.post(
            oxigraph_endpoint,
            data=query,
            headers={
                "Content-Type": "application/sparql-query",
                "Accept": "application/json"
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()

        # Convert bindings to simple dict format
        return [
            {key: val.get("value") for key, val in binding.items()}
            for binding in data.get("results", {}).get("bindings", [])
        ]

    def qdrant_search(query: str, limit: int) -> list[dict]:
        """Search Qdrant events collection."""
        # Placeholder - would use actual Qdrant client
        logger.warning("Qdrant search not implemented - using empty results")
        return []

    def embed(text: str) -> list[float]:
        """Embed text using default embedding model."""
        # Placeholder - would use actual embedding model
        logger.warning("Embedding not implemented - using random vector")
        return list(np.random.randn(384))

    return EventRetriever(
        oxigraph_query_fn=sparql_query,
        qdrant_search_fn=qdrant_search,
        embed_fn=embed
    )