# DSPy Signatures for Heritage Custodian Extraction
|
|
|
|
## Overview
|
|
|
|
This document defines the DSPy signatures (module interfaces) for the Heritage Custodian RAG pipeline. Each signature maps to LinkML classes and follows the CH-Annotator v1.7.0 convention.
|
|
|
|
## Core Signatures
|
|
|
|
### 1. CustodianTypeClassifier
|
|
|
|
Classifies text chunks into GLAMORCUBESFIXPHDNT categories.
|
|
|
|
```python
|
|
import dspy
|
|
from typing import List, Optional
|
|
from pydantic import BaseModel, Field
|
|
|
|
class CustodianTypeOutput(BaseModel):
    """Output schema for custodian type classification."""

    # Single-letter GLAMORCUBESFIXPHDNT code; primary_type_label carries the
    # human-readable expansion so consumers need not map codes themselves.
    primary_type: str = Field(description="Primary GLAMORCUBESFIXPHDNT type code (G, L, A, M, O, R, C, U, B, E, S, F, I, X, P, H, D, N, T)")
    primary_type_label: str = Field(description="Human-readable type label")
    confidence: float = Field(ge=0.0, le=1.0, description="Confidence score")
    # default_factory is the idiomatic list default for pydantic fields
    # (avoids declaring a shared mutable default instance).
    secondary_types: List[str] = Field(default_factory=list, description="Additional types if MIXED")
    reasoning: str = Field(description="Explanation for classification")
|
|
|
|
class CustodianTypeClassifier(dspy.Signature):
    """Classify heritage institution text into GLAMORCUBESFIXPHDNT taxonomy.

    The 19 types are:
    - G: GALLERY (art gallery, exhibition space)
    - L: LIBRARY (public, academic, national library)
    - A: ARCHIVE (government, corporate, personal archive)
    - M: MUSEUM (art, history, science museum)
    - O: OFFICIAL_INSTITUTION (government heritage agency)
    - R: RESEARCH_CENTER (research institute, documentation center)
    - C: COMMERCIAL (corporate heritage, company archive)
    - U: UNSPECIFIED (cannot determine type)
    - B: BIO_CUSTODIAN (botanical garden, zoo, aquarium)
    - E: EDUCATION_PROVIDER (university with collections)
    - S: HERITAGE_SOCIETY (historical society, heemkundige kring)
    - F: FEATURE_CUSTODIAN (monument, historic mansion)
    - I: INTANGIBLE_HERITAGE_GROUP (folklore, oral tradition)
    - X: MIXED (multiple types simultaneously)
    - P: PERSONAL_COLLECTION (private collector)
    - H: HOLY_SACRED_SITE (church archive, monastery library)
    - D: DIGITAL_PLATFORM (online archive, virtual museum)
    - N: NON_PROFIT (heritage NGO)
    - T: TASTE_SCENT_HERITAGE (culinary, perfumery heritage)
    """

    # NOTE: in a dspy.Signature the docstring and field `desc` strings become
    # part of the LLM prompt — editing them changes runtime behavior.
    text: str = dspy.InputField(desc="Text describing a heritage institution")
    context: Optional[str] = dspy.InputField(desc="Additional context (country, source)", default=None)

    # Structured output parsed into the pydantic model above.
    classification: CustodianTypeOutput = dspy.OutputField(desc="Classification result")
|
|
|
|
|
|
# Example usage
|
|
class CustodianTypeModule(dspy.Module):
    """Wraps CustodianTypeClassifier in a ChainOfThought predictor."""

    def __init__(self):
        super().__init__()
        # ChainOfThought inserts an intermediate reasoning step before the
        # structured classification output.
        self.classifier = dspy.ChainOfThought(CustodianTypeClassifier)

    def forward(self, text: str, context: Optional[str] = None) -> CustodianTypeOutput:
        """Classify *text*, optionally using *context* (country, source) hints.

        Returns only the structured CustodianTypeOutput, discarding the raw
        prediction object (and its chain-of-thought trace).
        """
        result = self.classifier(text=text, context=context)
        return result.classification
|
|
```
|
|
|
|
### 2. CustodianEntityExtractor
|
|
|
|
Extracts heritage institution entities from text.
|
|
|
|
```python
|
|
class CustodianEntity(BaseModel):
    """Extracted custodian entity conforming to LinkML Custodian class."""

    name: str = Field(description="Official emic name of institution")
    custodian_type: str = Field(description="GLAMORCUBESFIXPHDNT type code")
    description: Optional[str] = Field(default=None, description="Institution description")
    # default_factory avoids declaring a shared mutable default instance.
    alternative_names: List[str] = Field(default_factory=list, description="Alternative names, abbreviations")
    founding_date: Optional[str] = Field(default=None, description="Founding date (ISO 8601)")

    # Location (maps to CustodianPlace)
    city: Optional[str] = Field(default=None)
    country_code: Optional[str] = Field(default=None, description="ISO 3166-1 alpha-2")
    address: Optional[str] = Field(default=None)

    # Identifiers
    wikidata_id: Optional[str] = Field(default=None, description="Wikidata Q-number")
    isil_code: Optional[str] = Field(default=None, description="ISIL identifier")
    website: Optional[str] = Field(default=None)

    # Extraction metadata
    text_span: str = Field(description="Original text span mentioning entity")
    confidence: float = Field(ge=0.0, le=1.0)
|
|
|
|
class CustodianExtractorOutput(BaseModel):
    # Result container returned by CustodianEntityExtractor.
    entities: List[CustodianEntity] = Field(description="Extracted custodian entities")
    # NOTE(review): redundant with len(entities) — presumably asked for so the
    # LLM self-checks its output; confirm it is reconciled downstream.
    entity_count: int = Field(description="Number of entities extracted")
|
|
|
|
class CustodianEntityExtractor(dspy.Signature):
    """Extract heritage institution entities from text.

    Focus on:
    - Institution names (museums, archives, libraries, etc.)
    - Associated locations
    - Identifiers mentioned (ISIL, Wikidata, etc.)
    - Organizational details (founding date, type)

    Return structured entities conforming to the Heritage Custodian ontology.
    """

    # Docstring and `desc` strings are prompt text — edits change behavior.
    text: str = dspy.InputField(desc="Text potentially containing heritage institution mentions")
    # NOTE(review): default=[] is a shared mutable default — pydantic copies
    # field defaults, but confirm dspy's InputField does the same.
    expected_types: List[str] = dspy.InputField(
        desc="Expected custodian types to look for (e.g., ['MUSEUM', 'ARCHIVE'])",
        default=[]
    )
    country_hint: Optional[str] = dspy.InputField(
        desc="Country context for better extraction",
        default=None
    )

    extracted: CustodianExtractorOutput = dspy.OutputField(desc="Extracted entities")
|
|
```
|
|
|
|
### 3. IdentifierExtractor
|
|
|
|
Extracts and validates external identifiers.
|
|
|
|
```python
|
|
class Identifier(BaseModel):
    """Extracted identifier conforming to LinkML Identifier class."""

    scheme: str = Field(description="Identifier scheme (ISIL, Wikidata, VIAF, KvK, ISNI)")
    value: str = Field(description="Identifier value")
    url: Optional[str] = Field(default=None, description="Resolvable URL")
    confidence: float = Field(ge=0.0, le=1.0)
    # Overwritten by IdentifierExtractorModule.forward via regex
    # post-validation; the LLM-supplied value is not trusted.
    valid: bool = Field(description="Whether identifier passes format validation")
|
|
|
|
class IdentifierExtractorOutput(BaseModel):
    # Flat result container for IdentifierExtractor.
    identifiers: List[Identifier]
|
|
|
|
class IdentifierExtractor(dspy.Signature):
    """Extract external identifiers from text.

    Identifier patterns:
    - ISIL: XX-XXXXX (e.g., NL-AmRM, US-DLC)
    - Wikidata: Q followed by digits (Q190804)
    - VIAF: viaf.org/viaf/NNNNNN
    - KvK: 8-digit Dutch chamber of commerce number
    - ISNI: 16-digit ISNI (0000 0001 2345 6789)
    - ROR: 0xxxxxxxx (Research Organization Registry)

    Validate format and return structured identifiers.
    """

    # Docstring doubles as the prompt; patterns listed above mirror
    # IdentifierExtractorModule.PATTERNS, which does the real validation.
    text: str = dspy.InputField(desc="Text containing potential identifiers")

    extracted: IdentifierExtractorOutput = dspy.OutputField(desc="Extracted identifiers")
|
|
|
|
|
|
class IdentifierExtractorModule(dspy.Module):
    """Module with validation logic for identifiers."""

    # Per-scheme format regexes, checked with re.fullmatch so the whole
    # value must conform (re.match only anchors at the start, which let
    # values like "Q190804xyz" pass Wikidata validation).
    PATTERNS = {
        "ISIL": r"[A-Z]{2}-[A-Za-z0-9]+",
        "Wikidata": r"Q\d+",
        "VIAF": r"viaf\.org/viaf/(\d+)",
        "KvK": r"\b\d{8}\b",
        "ISNI": r"\d{4}\s?\d{4}\s?\d{4}\s?\d{4}",
        "ROR": r"0[a-z0-9]{8}",
    }

    def __init__(self):
        super().__init__()
        self.extractor = dspy.ChainOfThought(IdentifierExtractor)

    def validate_identifier(self, scheme: str, value: str) -> bool:
        """Return True iff *value* fully matches the pattern for *scheme*.

        Unknown schemes validate as False.
        """
        import re
        pattern = self.PATTERNS.get(scheme)
        if not pattern:
            return False
        # fullmatch rejects trailing/leading garbage that re.match allowed.
        return re.fullmatch(pattern, value) is not None

    def forward(self, text: str) -> IdentifierExtractorOutput:
        result = self.extractor(text=text)
        # Post-validate identifiers: overwrite the LLM's own `valid` flag
        # with the deterministic regex check.
        for ident in result.extracted.identifiers:
            ident.valid = self.validate_identifier(ident.scheme, ident.value)
        return result.extracted
|
|
```
|
|
|
|
### 4. CollectionExtractor
|
|
|
|
Extracts collection metadata.
|
|
|
|
```python
|
|
class Collection(BaseModel):
    """Extracted collection conforming to LinkML CustodianCollection class."""

    name: str = Field(description="Collection name")
    description: Optional[str] = Field(default=None)
    collection_type: Optional[str] = Field(
        default=None,
        description="archival, bibliographic, museum_objects, audio_visual, etc."
    )
    # default_factory is the idiomatic list default for pydantic fields.
    subject_areas: List[str] = Field(default_factory=list, description="Subject/topic areas")
    temporal_extent: Optional[str] = Field(
        default=None,
        description="Time period covered (e.g., '1800-1900', 'medieval')"
    )
    extent: Optional[str] = Field(
        default=None,
        description="Size/extent (e.g., '10,000 items', '500 linear meters')"
    )
    custodian_name: Optional[str] = Field(
        default=None,
        description="Name of institution holding collection"
    )
|
|
|
|
class CollectionExtractorOutput(BaseModel):
    # Flat result container for CollectionExtractor.
    collections: List[Collection]
|
|
|
|
class CollectionExtractor(dspy.Signature):
    """Extract heritage collection information from text.

    Look for:
    - Named collections (e.g., "Van Gogh collection")
    - Collection descriptions
    - Subject areas and themes
    - Temporal coverage (centuries, periods)
    - Extent/size information
    - Digital availability

    Map to CIDOC-CRM E78_Curated_Holding / PREMIS vocabulary.
    """

    # Docstring is prompt text — edits change extraction behavior.
    text: str = dspy.InputField(desc="Text describing collections")
    # Optional: name/context of the institution holding the collections.
    custodian_context: Optional[str] = dspy.InputField(
        desc="Institution context",
        default=None
    )

    extracted: CollectionExtractorOutput = dspy.OutputField(desc="Extracted collections")
|
|
```
|
|
|
|
### 5. RelationshipExtractor
|
|
|
|
Extracts relationships between entities.
|
|
|
|
```python
|
|
class Relationship(BaseModel):
    """Extracted relationship between heritage entities."""

    # Endpoints are referenced by name (string), not by identifier.
    source_entity: str = Field(description="Source entity name")
    relationship_type: str = Field(
        description="Relationship type: member_of, part_of, collaborated_with, merged_with, etc."
    )
    target_entity: str = Field(description="Target entity name")
    temporal: Optional[str] = Field(default=None, description="When relationship held")
    confidence: float = Field(ge=0.0, le=1.0)
|
|
|
|
class RelationshipExtractorOutput(BaseModel):
    # Flat result container for RelationshipExtractor.
    relationships: List[Relationship]
|
|
|
|
class RelationshipExtractor(dspy.Signature):
    """Extract relationships between heritage institutions.

    Relationship types (from ontology):
    - member_of: Institution is member of EncompassingBody
    - part_of: Institution is organizational unit of another
    - collaborated_with: Partnership/project collaboration
    - merged_with: Historical merger (ChangeEvent)
    - split_from: Historical split (ChangeEvent)
    - succeeded_by: Institution succession
    - participated_in_project: Project participation
    - manages_collection: Collection management responsibility

    Map to W3C ORG ontology (org:memberOf, org:subOrganizationOf, etc.)
    """

    # Docstring is prompt text — edits change extraction behavior.
    text: str = dspy.InputField(desc="Text describing institutional relationships")
    # NOTE(review): default=[] is a shared mutable default — pydantic copies
    # field defaults, but confirm dspy's InputField does the same.
    known_entities: List[str] = dspy.InputField(
        desc="Already-extracted entity names for reference",
        default=[]
    )

    extracted: RelationshipExtractorOutput = dspy.OutputField(desc="Extracted relationships")
|
|
```
|
|
|
|
### 6. ChangeEventExtractor
|
|
|
|
Extracts organizational change events.
|
|
|
|
```python
|
|
class ChangeEvent(BaseModel):
    """Extracted change event conforming to LinkML ChangeEvent class."""

    event_type: str = Field(
        description="FOUNDING, CLOSURE, MERGER, SPLIT, ACQUISITION, RELOCATION, NAME_CHANGE, TYPE_CHANGE, STATUS_CHANGE, RESTRUCTURING, LEGAL_CHANGE"
    )
    event_date: Optional[str] = Field(default=None, description="ISO 8601 date")
    description: str = Field(description="Event description")
    affected_entities: List[str] = Field(description="Entities affected by event")
    # default_factory avoids declaring a shared mutable default instance.
    resulting_entities: List[str] = Field(default_factory=list, description="Entities resulting from event")
    confidence: float = Field(ge=0.0, le=1.0)
|
|
|
|
class ChangeEventExtractorOutput(BaseModel):
    # Flat result container for ChangeEventExtractor.
    events: List[ChangeEvent]
|
|
|
|
class ChangeEventExtractor(dspy.Signature):
    """Extract organizational change events from text.

    Event types (from ChangeTypeEnum):
    - FOUNDING: "established", "founded", "created", "opened"
    - CLOSURE: "closed", "dissolved", "ceased operations"
    - MERGER: "merged with", "combined with", "absorbed"
    - SPLIT: "split into", "divided into", "spun off"
    - ACQUISITION: "acquired", "took over"
    - RELOCATION: "moved to", "relocated to"
    - NAME_CHANGE: "renamed to", "formerly known as"
    - TYPE_CHANGE: "became a museum", "converted to archive"
    - STATUS_CHANGE: "reopened", "temporarily closed"
    - RESTRUCTURING: "reorganized", "restructured"
    - LEGAL_CHANGE: "incorporated as", "became a foundation"

    Map to CIDOC-CRM E5_Event / PROV-O Activity.
    """

    # Docstring (trigger-phrase catalogue) is prompt text — edits change behavior.
    text: str = dspy.InputField(desc="Text describing organizational changes")

    extracted: ChangeEventExtractorOutput = dspy.OutputField(desc="Extracted events")
|
|
```
|
|
|
|
### 7. WebClaimExtractor
|
|
|
|
Extracts claims from web pages with XPath provenance.
|
|
|
|
```python
|
|
class WebClaim(BaseModel):
    """Extracted web claim conforming to CanonicalClaimTypes enum."""

    claim_type: str = Field(
        description="Canonical claim type: full_name, description, email, phone, address, etc."
    )
    claim_value: str = Field(description="Extracted value")
    # Mandatory provenance pointer; per the extractor's contract, claims
    # without an XPath are treated as fabricated.
    xpath: str = Field(description="XPath to source element")
    confidence: float = Field(ge=0.0, le=1.0)
    tier: int = Field(ge=1, le=3, description="Reliability tier (1=structural, 2=pattern, 3=NLP)")
|
|
|
|
class WebClaimExtractorOutput(BaseModel):
    claims: List[WebClaim]
    # Page-level provenance for the whole claim set.
    source_url: str
    # NOTE(review): format unspecified — presumably an ISO 8601 timestamp;
    # confirm against the producer.
    retrieved_on: str
|
|
|
|
class WebClaimExtractor(dspy.Signature):
    """Extract claims from web page content with XPath provenance.

    CRITICAL: Every claim MUST have an XPath pointer to the source element.
    Claims without XPath are considered fabricated per AGENTS.md Rule 6.

    Claim tiers:
    - Tier 1 (STRUCTURAL): page_title, page_count, image_count (from HTML structure)
    - Tier 2 (PATTERN): social_facebook, isil_code, kvk_number (regex patterns)
    - Tier 3 (NLP): full_name, description, address (requires XPath verification)

    Use Docling output for structured extraction.
    """

    # Docstring (incl. the XPath rule) is prompt text — edits change behavior.
    html_content: str = dspy.InputField(desc="HTML content or Docling markdown")
    source_url: str = dspy.InputField(desc="URL of source page")

    extracted: WebClaimExtractorOutput = dspy.OutputField(desc="Extracted claims with XPath")
|
|
```
|
|
|
|
## Composite Modules
|
|
|
|
### HeritageEntityPipeline
|
|
|
|
Combines extractors into a full pipeline.
|
|
|
|
```python
|
|
class HeritageEntityPipeline(dspy.Module):
    """Full heritage entity extraction pipeline."""

    def __init__(self):
        super().__init__()
        # Custom modules (type/identifier) wrap their own predictors and add
        # post-processing; the rest are plain ChainOfThought over a Signature.
        self.type_classifier = CustodianTypeModule()
        self.entity_extractor = dspy.ChainOfThought(CustodianEntityExtractor)
        self.identifier_extractor = IdentifierExtractorModule()
        self.collection_extractor = dspy.ChainOfThought(CollectionExtractor)
        self.relationship_extractor = dspy.ChainOfThought(RelationshipExtractor)
        self.change_event_extractor = dspy.ChainOfThought(ChangeEventExtractor)

    def forward(
        self,
        text: str,
        expected_types: Optional[List[str]] = None,
        country_hint: Optional[str] = None
    ) -> dict:
        """Run all six extraction stages over *text* and bundle the results.

        Returned dict keys: document_type, entities, identifiers,
        collections, relationships, events.
        """
        # 1. Classify document type
        type_result = self.type_classifier(text=text, context=country_hint)

        # 2. Extract entities — when no expected_types were given, use the
        # classified primary type as the hint
        entities = self.entity_extractor(
            text=text,
            expected_types=expected_types or [type_result.primary_type],
            country_hint=country_hint
        )

        # 3. Extract identifiers (module returns regex-post-validated output)
        identifiers = self.identifier_extractor(text=text)

        # 4. Extract collections
        collections = self.collection_extractor(text=text)

        # 5. Extract relationships, seeded with the entity names found in step 2
        entity_names = [e.name for e in entities.extracted.entities]
        relationships = self.relationship_extractor(
            text=text,
            known_entities=entity_names
        )

        # 6. Extract change events
        events = self.change_event_extractor(text=text)

        # ChainOfThought results are unwrapped via .extracted; the two custom
        # modules already return their output models directly.
        return {
            "document_type": type_result,
            "entities": entities.extracted,
            "identifiers": identifiers,
            "collections": collections.extracted,
            "relationships": relationships.extracted,
            "events": events.extracted,
        }
|
|
```
|
|
|
|
### QueryRouter
|
|
|
|
Routes queries to appropriate retrieval strategies.
|
|
|
|
```python
|
|
class QueryIntent(BaseModel):
    """Classified query intent."""

    intent_type: str = Field(
        description="factual, comparative, exploratory, relationship, temporal"
    )
    custodian_types: List[str] = Field(
        description="Relevant GLAMORCUBESFIXPHDNT types"
    )
    geographic_scope: Optional[str] = Field(default=None, description="Country/region filter")
    temporal_scope: Optional[str] = Field(default=None, description="Time period filter")
    # default_factory is the idiomatic list default for pydantic fields.
    entity_mentions: List[str] = Field(default_factory=list, description="Mentioned entity names")
|
|
|
|
class QueryRouter(dspy.Signature):
    """Route heritage queries to appropriate retrieval strategies.

    Intent types:
    - factual: "What is the ISIL code for Rijksmuseum?"
    - comparative: "Compare Dutch and Belgian archive systems"
    - exploratory: "What museums exist in Limburg?"
    - relationship: "Which institutions are members of NDE?"
    - temporal: "How has the Noord-Hollands Archief changed since 2000?"

    Route to:
    - Vector retrieval for semantic similarity
    - Graph traversal for relationships
    - SPARQL for structured queries
    - Hybrid for complex questions
    """

    # Docstring is prompt text — edits change routing behavior.
    query: str = dspy.InputField(desc="User query")

    intent: QueryIntent = dspy.OutputField(desc="Classified query intent")
|
|
```
|
|
|
|
## Optimizers and Metrics
|
|
|
|
### Extraction Accuracy Metric
|
|
|
|
```python
|
|
def extraction_accuracy(pred: CustodianExtractorOutput, gold: CustodianExtractorOutput) -> float:
    """Measure extraction accuracy against gold standard.

    Scores name-level F1; names are compared case-insensitively. An empty
    gold set scores 1.0 only when the prediction is also empty, otherwise
    every predicted name is a false positive and the score is 0.0.
    """

    predicted = {entity.name.lower() for entity in pred.entities}
    expected = {entity.name.lower() for entity in gold.entities}

    if not expected:
        # Nothing to find: perfect iff nothing was predicted.
        return 1.0 if not predicted else 0.0

    hits = len(predicted & expected)
    p = hits / len(predicted) if predicted else 0.0
    r = hits / len(expected)

    return 0.0 if p + r == 0 else 2 * p * r / (p + r)
|
|
```
|
|
|
|
### Type Classification Metric
|
|
|
|
```python
|
|
def type_classification_accuracy(pred: CustodianTypeOutput, gold: CustodianTypeOutput) -> float:
    """Measure type classification accuracy.

    Blends primary-type exact match (70%) with secondary-type overlap (30%).
    Secondary types matter for MIXED classifications: when the gold record
    lists them, overlap is scored as Jaccard similarity; when it does not,
    spurious predicted secondaries earn only half credit.
    """

    primary_score = float(pred.primary_type == gold.primary_type)

    predicted = set(pred.secondary_types)
    expected = set(gold.secondary_types)

    if expected:
        union = predicted | expected
        secondary_score = len(predicted & expected) / len(union) if union else 1.0
    elif predicted:
        # Gold says "no secondaries" but some were predicted: half credit.
        secondary_score = 0.5
    else:
        secondary_score = 1.0

    return 0.7 * primary_score + 0.3 * secondary_score
|
|
```
|
|
|
|
### DSPy Optimizer Configuration
|
|
|
|
```python
|
|
from dspy.teleprompt import BootstrapFewShot
|
|
|
|
# Training data
|
|
# Training data
# NOTE(review): this example carries inputs only; BootstrapFewShot scores
# against gold labels, so labeled outputs are presumably attached in the
# full trainset — confirm before compiling.
trainset = [
    dspy.Example(
        text="The Rijksmuseum in Amsterdam is one of the most famous art museums in the world...",
        expected_types=["MUSEUM"],
        country_hint="NL"
    ).with_inputs("text", "expected_types", "country_hint"),
    # ... more examples
]

# Optimizer
# NOTE(review): extraction_accuracy takes (pred, gold), but dspy optimizers
# typically call metric(example, prediction, trace=None) — verify against
# the installed dspy version.
optimizer = BootstrapFewShot(
    metric=extraction_accuracy,
    max_bootstrapped_demos=4,
    max_labeled_demos=16,
)

# Compile: bootstraps few-shot demos for the pipeline's predictors.
compiled_pipeline = optimizer.compile(
    HeritageEntityPipeline(),
    trainset=trainset
)
|
|
```
|
|
|
|
## Example Invocations
|
|
|
|
### Extract from Conversation JSON
|
|
|
|
```python
|
|
import json

# Load conversation
with open("data/conversations/brazilian_glam.json") as f:
    conv = json.load(f)

# Extract from assistant messages
pipeline = HeritageEntityPipeline()
for msg in conv["chat_messages"]:
    if msg["sender"] == "assistant":
        text = msg["text"]
        # country_hint biases classification/extraction toward Brazil (BR)
        result = pipeline(text=text, country_hint="BR")

        # result["entities"] is a CustodianExtractorOutput
        for entity in result["entities"].entities:
            print(f"Found: {entity.name} ({entity.custodian_type})")
|
|
```
|
|
|
|
### Extract from Web Archive
|
|
|
|
```python
|
|
from docling.document_converter import DocumentConverter

# Convert HTML to markdown
converter = DocumentConverter()
doc = converter.convert("data/web/rijksmuseum.html")
markdown = doc.document.export_to_markdown()

# Extract with web claims
claim_extractor = dspy.ChainOfThought(WebClaimExtractor)
claims = claim_extractor(
    html_content=markdown,
    source_url="https://www.rijksmuseum.nl/"
)

# Every claim carries an XPath pointer back to its source element
for claim in claims.extracted.claims:
    print(f"{claim.claim_type}: {claim.claim_value} (xpath: {claim.xpath})")
|
|
```
|