# Schema-Aware Chunking Strategy
|
|
|
|
## Overview
|
|
|
|
Traditional chunking strategies (fixed-size, recursive split) lose ontological context. This document describes a schema-aware chunking strategy that respects the Heritage Custodian ontology structure.
|
|
|
|
## Chunking Hierarchy
|
|
|
|
```
Document
│
├── Custodian Chunks (entity boundaries)
│   ├── Name mentions
│   ├── Type indicators
│   └── Description blocks
│
├── Observation Chunks (evidence units)
│   ├── Source attribution
│   ├── Claim statements
│   └── XPath pointers
│
├── Collection Chunks (holdings descriptions)
│   ├── Collection names
│   ├── Subject areas
│   └── Extent information
│
├── Relationship Chunks (inter-entity links)
│   ├── Membership statements
│   ├── Collaboration mentions
│   └── Historical events
│
└── Context Chunks (background information)
    ├── Country/region context
    ├── Domain background
    └── Temporal context
```
|
|
|
|
## Chunking Strategies by Source Type
|
|
|
|
### 1. Conversation JSON Chunking
|
|
|
|
Conversations are chunked by message boundaries with entity co-reference:
|
|
|
|
```python
|
|
class ConversationChunker:
    """Chunk conversation JSON respecting entity boundaries.

    Each sufficiently long message becomes one chunk. Entity mentions are
    tracked across messages so that later chunks carry coreference-resolved
    (canonical) entity names rather than surface mentions.
    """

    # Messages shorter than this are skipped as too thin to embed usefully.
    MIN_MESSAGE_LENGTH = 100

    def __init__(self, entity_extractor):
        # Callable: text -> list of entity objects exposing .name and
        # .custodian_type (assumed from usage below — TODO confirm contract).
        self.entity_extractor = entity_extractor
        self.coref_resolver = EntityCorefResolver()

    def chunk(self, conversation: dict) -> List[Chunk]:
        """Return one Chunk per qualifying message in *conversation*.

        Expects the export shape ``{"uuid": ..., "chat_messages":
        [{"text": ..., "sender": ...}, ...]}``.
        """
        chunks = []
        entity_mentions = {}  # resolved entity name -> entity, across messages

        for i, message in enumerate(conversation["chat_messages"]):
            text = message["text"]
            sender = message["sender"]

            # Skip short messages
            if len(text) < self.MIN_MESSAGE_LENGTH:
                continue

            # Extract entities from this message
            entities = self.entity_extractor(text)

            # Resolve coreferences with previous mentions. FIX: keep the
            # resolved entities and record THOSE in metadata — the original
            # resolved them but then stored the raw, unresolved mentions,
            # defeating the coreference pass.
            resolved_entities = []
            for entity in entities:
                resolved = self.coref_resolver.resolve(entity, entity_mentions)
                entity_mentions[resolved.name] = resolved
                resolved_entities.append(resolved)

            # Create chunk with entity metadata
            chunk = Chunk(
                text=text,
                metadata={
                    "source_type": "conversation",
                    "conversation_id": conversation["uuid"],
                    "message_index": i,
                    "sender": sender,
                    "mentioned_entities": [e.name for e in resolved_entities],
                    "entity_types": list(
                        {e.custodian_type for e in resolved_entities}
                    ),
                }
            )
            chunks.append(chunk)

        return chunks
|
|
```
|
|
|
|
**Chunk boundaries:**
- Message boundaries (natural conversation turns)
- Long messages split at paragraph breaks
- Entity mention continuity preserved

**Metadata added:**
- `conversation_id`: Source conversation UUID
- `message_index`: Position in conversation
- `sender`: human/assistant
- `mentioned_entities`: Entity names in chunk
- `entity_types`: GLAMORCUBESFIXPHDNT types
|
|
|
|
### 2. Website HTML Chunking
|
|
|
|
Websites are chunked by semantic sections detected via Docling:
|
|
|
|
```python
|
|
class WebsiteChunker:
    """Chunk website HTML using Docling structure analysis."""

    def __init__(self):
        self.docling = DocumentConverter()
        self.section_classifier = SectionClassifier()

    def chunk(self, html_path: str, source_url: str) -> List[Chunk]:
        """Emit one Chunk per content-bearing section of the converted page."""
        # Convert to structured document
        converted = self.docling.convert(html_path)

        result = []
        for section in converted.document.sections:
            # Classify the section, then drop boilerplate (navigation,
            # footer, sidebar) that carries no heritage content.
            kind = self.section_classifier.classify(section)
            if kind in ("navigation", "footer", "sidebar"):
                continue

            # XPath is recorded for provenance of every extracted claim.
            result.append(
                Chunk(
                    text=section.text,
                    metadata={
                        "source_type": "website",
                        "source_url": source_url,
                        "section_type": kind,  # about, contact, collection, etc.
                        "xpath": self._get_xpath(section),
                        "has_images": section.has_pictures,
                        "html_tag": section.original_tag,
                    }
                )
            )

        return result

    def _get_xpath(self, section) -> str:
        """Generate XPath for section element."""
        # Implementation depends on Docling structure
        pass
|
|
```
|
|
|
|
**Section types detected:**
- `about`: Organization description (maps to Custodian.description)
- `contact`: Contact information (maps to CustodianPlace)
- `collection`: Collection descriptions (maps to CustodianCollection)
- `visit`: Visitor information
- `news`: Recent updates
- `history`: Historical information (maps to ChangeEvent)
|
|
|
|
### 3. CSV Registry Chunking
|
|
|
|
CSV rows become individual chunks with schema mapping:
|
|
|
|
```python
|
|
class CSVRegistryChunker:
    """Chunk CSV registry data with schema mapping."""

    SCHEMA_MAPPINGS = {
        # ISIL registry columns → LinkML slots
        "Instelling": "preferred_label",
        "Plaats": "settlement",
        "ISIL code": "isil_code",
        "Toegekend op": "registration_date",
        "Opmerking": "provenance_note",

        # Dutch organizations CSV columns
        "naam_organisatie": "preferred_label",
        "postadres_plaats": "settlement",
        "ISIL-code": "isil_code",
        "KvK-nummer": "registration_number",
        "Collectie Nederland": "aggregator_membership",
    }

    def chunk(self, csv_path: str, registry_type: str) -> List[Chunk]:
        """Emit one Chunk per CSV row, mapping known columns to LinkML slots."""
        frame = pd.read_csv(csv_path, encoding='utf-8-sig')

        result = []
        for row_idx, row in frame.iterrows():
            meta = {
                "source_type": "csv_registry",
                "registry_type": registry_type,
                "row_index": row_idx,
                # Registries are authoritative sources by definition.
                "data_tier": "TIER_1_AUTHORITATIVE",
            }

            # Flatten each mapped, non-empty cell into "slot: value" text
            # and mirror it into the metadata for structured filtering.
            lines = []
            for column, slot in self.SCHEMA_MAPPINGS.items():
                if column not in row or not pd.notna(row[column]):
                    continue
                cell = str(row[column])
                lines.append(f"{slot}: {cell}")
                meta[slot] = cell

            result.append(Chunk(text="\n".join(lines), metadata=meta))

        return result
|
|
```
|
|
|
|
**CSV sources:**
- ISIL registry (`ISIL-codes_2025-08-01.csv`)
- Dutch organizations (`voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv`)
- Country-specific registries
|
|
- Country-specific registries
|
|
|
|
### 4. YAML Instance Chunking
|
|
|
|
LinkML instance files are chunked by entity:
|
|
|
|
```python
|
|
class YAMLInstanceChunker:
    """Chunk YAML instance files by entity.

    Each top-level entity (or each item of a top-level list) becomes one
    Chunk whose text is a flattened, searchable rendering of its fields.
    """

    def chunk(self, yaml_path: str) -> List[Chunk]:
        """Load *yaml_path* and return one Chunk per entity.

        Returns an empty list for an empty/comment-only file
        (yaml.safe_load yields None), which previously crashed.
        """
        # Explicit encoding; YAML instance files are expected to be UTF-8.
        with open(yaml_path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # Empty file: nothing to chunk.
        if data is None:
            return []

        # Handle single entity or list
        entities = data if isinstance(data, list) else [data]

        chunks = []
        for entity in entities:
            # Build text from entity fields
            text = self._entity_to_text(entity)

            # Extract metadata from entity
            metadata = {
                "source_type": "yaml_instance",
                "yaml_path": yaml_path,
                "hc_id": entity.get("hc_id"),
                "ghcid": entity.get("ghcid"),
                "custodian_type": entity.get("custodian_type"),
                "country_code": self._extract_country(entity),
            }

            chunk = Chunk(
                text=text,
                metadata=metadata
            )
            chunks.append(chunk)

        return chunks

    def _extract_country(self, entity: dict):
        """Best-effort country code from the entity's custodian_place.

        Added because the original called self._extract_country without
        defining it, raising AttributeError at runtime. Returns None when
        no place/country is present.
        """
        place = entity.get("custodian_place")
        if isinstance(place, dict):
            return place.get("country")
        return None

    def _entity_to_text(self, entity: dict) -> str:
        """Convert entity dict to searchable text."""
        parts = []

        # Core identity
        if "preferred_label" in entity:
            parts.append(f"Name: {entity['preferred_label']}")
        if "description" in entity:
            parts.append(f"Description: {entity['description']}")

        # Location
        if "custodian_place" in entity:
            place = entity["custodian_place"]
            parts.append(f"Location: {place.get('settlement', '')} {place.get('country', '')}")

        # Collections
        if "collections" in entity:
            for coll in entity["collections"]:
                parts.append(f"Collection: {coll.get('collection_name', '')}")

        # Identifiers
        if "identifiers" in entity:
            for ident in entity["identifiers"]:
                parts.append(f"{ident['identifier_scheme']}: {ident['identifier_value']}")

        return "\n".join(parts)
|
|
```
|
|
|
|
## Chunk Metadata Schema
|
|
|
|
All chunks share a common metadata schema for consistent retrieval:
|
|
|
|
```python
|
|
class ChunkMetadata(BaseModel):
    """Universal chunk metadata schema.

    Every chunker emits this same shape so retrieval can filter uniformly
    by source, ontology class, entity, geography, and provenance tier.
    """

    # Source identification
    source_type: Literal["conversation", "website", "csv_registry", "yaml_instance", "wikidata"]
    source_id: str  # Unique source identifier

    # Ontology mapping
    primary_class: Optional[str] = None  # LinkML class name
    # NOTE: pydantic deep-copies list defaults per instance, so the
    # mutable-default pitfall of plain classes does not apply here.
    mentioned_classes: List[str] = []

    # Entity information
    custodian_type: Optional[str] = None  # GLAMORCUBESFIXPHDNT code
    mentioned_entities: List[str] = []
    ghcid: Optional[str] = None

    # Geographic scope
    country_code: Optional[str] = None  # ISO 3166-1 alpha-2
    region_code: Optional[str] = None
    settlement: Optional[str] = None

    # Provenance — defaults to the lowest tier unless a chunker overrides.
    data_tier: str = "TIER_4_INFERRED"
    extraction_date: str  # required: no default, must be supplied by the chunker
    confidence_score: float = 0.5

    # Retrieval hints
    language: str = "en"
    has_identifiers: bool = False
    has_collections: bool = False
    has_relationships: bool = False
|
|
```
|
|
|
|
## Embedding Strategy
|
|
|
|
### Multilingual Embeddings
|
|
|
|
Use BGE-M3 for multilingual support (NL, EN, DE, FR, ES, PT, etc.):
|
|
|
|
```python
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
class HeritageEmbedder:
    """Domain-aware embedding generator."""

    def __init__(self):
        self.model = SentenceTransformer("BAAI/bge-m3")

        # Heritage domain prefixes for query/passage distinction
        self.QUERY_PREFIX = "Represent this heritage institution query: "
        self.PASSAGE_PREFIX = "Represent this heritage institution text: "

    def embed_passage(self, text: str, metadata: ChunkMetadata) -> List[float]:
        """Embed passage with domain context."""
        # Prepend "[type] [country] " tags so institutions of different
        # kinds discriminate better in embedding space.
        tags = []
        if metadata.custodian_type:
            label = CUSTODIAN_TYPE_LABELS.get(metadata.custodian_type, "")
            tags.append(f"[{label}] ")
        if metadata.country_code:
            tags.append(f"[{metadata.country_code}] ")

        full_text = self.PASSAGE_PREFIX + "".join(tags) + text
        return self.model.encode(full_text).tolist()

    def embed_query(self, query: str, type_filter: str = None) -> List[float]:
        """Embed query with optional type context."""
        if type_filter:
            label = CUSTODIAN_TYPE_LABELS.get(type_filter, "")
            prefix_tag = f"[{label}] "
        else:
            prefix_tag = ""

        full_query = self.QUERY_PREFIX + prefix_tag + query
        return self.model.encode(full_query).tolist()
|
|
```
|
|
|
|
### Hybrid Embeddings
|
|
|
|
Combine dense and sparse embeddings for better retrieval:
|
|
|
|
```python
|
|
class HybridEmbedder:
    """Combine dense and sparse embeddings."""

    def __init__(self):
        self.dense_model = SentenceTransformer("BAAI/bge-m3")
        self.sparse_model = BM25Encoder()  # Or SPLADE

    def embed(self, text: str, metadata: ChunkMetadata) -> dict:
        """Return dense and sparse vectors for *text* alongside its metadata."""
        dense_vector = self.dense_model.encode(text).tolist()
        sparse_vector = self.sparse_model.encode(text)
        return {
            "dense": dense_vector,
            "sparse": sparse_vector,
            "metadata": metadata.dict(),
        }
|
|
```
|
|
|
|
## Chunk Size Guidelines
|
|
|
|
| Source Type | Target Size | Max Size | Overlap |
|-------------|-------------|----------|---------|
| Conversation | 500 tokens | 1000 tokens | 100 tokens |
| Website | 300 tokens | 800 tokens | 50 tokens |
| CSV Registry | 1 row | 1 row | None |
| YAML Instance | 1 entity | 1 entity | None |
|
|
|
|
## Deduplication
|
|
|
|
Prevent duplicate chunks from multiple sources:
|
|
|
|
```python
|
|
class ChunkDeduplicator:
    """Deduplicate chunks using content hashing and entity matching."""

    def __init__(self):
        self.seen_hashes = set()   # md5 digests of chunk texts already kept
        self.entity_chunks = {}    # entity_name → best (highest-tier) chunk

    def deduplicate(self, chunks: List[Chunk]) -> List[Chunk]:
        """Return *chunks* minus exact duplicates and lower-tier entity chunks.

        Per chunk: (1) drop it if its text hash was already seen;
        (2) drop it if any mentioned entity is already covered by a
        strictly higher- or equal-tier chunk, removing any superseded
        lower-tier chunk it beats.

        Fixes vs. the original: a chunk mentioning several entities is
        appended at most once (it used to be appended once per entity);
        a chunk with no mentioned entities is still kept (it used to be
        silently dropped); and removing a superseded chunk no longer
        raises ValueError when that chunk is not in the output list.
        """
        unique_chunks = []

        for chunk in chunks:
            # Content hash deduplication
            content_hash = hashlib.md5(chunk.text.encode()).hexdigest()
            if content_hash in self.seen_hashes:
                continue
            self.seen_hashes.add(content_hash)

            # Entity-based deduplication (keep highest tier)
            keep = True
            for entity in chunk.metadata.get("mentioned_entities", []):
                existing = self.entity_chunks.get(entity)
                if existing is None:
                    self.entity_chunks[entity] = chunk
                elif self._tier_priority(chunk) > self._tier_priority(existing):
                    # This chunk supersedes the previous best for the entity;
                    # drop the old one if it is still queued for output.
                    if existing in unique_chunks:
                        unique_chunks.remove(existing)
                    self.entity_chunks[entity] = chunk
                else:
                    # A better (or equal-tier) chunk already covers this entity.
                    keep = False

            if keep:
                unique_chunks.append(chunk)

        return unique_chunks

    def _tier_priority(self, chunk: Chunk) -> int:
        """Map the chunk's data tier to a comparable rank (higher = better)."""
        tier = chunk.metadata.get("data_tier", "TIER_4_INFERRED")
        return {
            "TIER_1_AUTHORITATIVE": 4,
            "TIER_2_VERIFIED": 3,
            "TIER_3_CROWD_SOURCED": 2,
            "TIER_4_INFERRED": 1,
        }.get(tier, 0)
|
|
```
|
|
|
|
## Implementation
|
|
|
|
### Full Chunking Pipeline
|
|
|
|
```python
|
|
class HeritageChunkingPipeline:
    """Complete chunking pipeline for heritage documents.

    Routes a source file to the matching chunker, deduplicates the
    resulting chunks, and attaches passage embeddings.
    """

    def __init__(self, schema_path: str):
        self.schema = load_linkml_schema(schema_path)
        # NOTE(review): `entity_extractor` is assumed to be provided at
        # module level — confirm where it is defined.
        self.conversation_chunker = ConversationChunker(entity_extractor)
        self.website_chunker = WebsiteChunker()
        self.csv_chunker = CSVRegistryChunker()
        self.yaml_chunker = YAMLInstanceChunker()
        self.embedder = HeritageEmbedder()
        self.deduplicator = ChunkDeduplicator()

    def process(self, source_path: str, source_type: str,
                registry_type: str = "unknown",
                source_url: str = "") -> List[Chunk]:
        """Chunk, deduplicate, and embed one source file.

        Args:
            source_path: Path to the source file.
            source_type: One of "conversation", "website", "csv", "yaml".
            registry_type: Label forwarded to CSVRegistryChunker.chunk,
                whose signature requires it (the original call omitted it
                and would raise TypeError).
            source_url: Original URL forwarded to WebsiteChunker.chunk,
                likewise required by its signature.

        Raises:
            ValueError: If *source_type* is not recognized.
        """
        # Route to appropriate chunker
        if source_type == "conversation":
            # Use a context manager so the file handle is closed
            # (the original `json.load(open(...))` leaked it).
            with open(source_path, encoding="utf-8") as f:
                chunks = self.conversation_chunker.chunk(json.load(f))
        elif source_type == "website":
            chunks = self.website_chunker.chunk(source_path, source_url)
        elif source_type == "csv":
            chunks = self.csv_chunker.chunk(source_path, registry_type)
        elif source_type == "yaml":
            chunks = self.yaml_chunker.chunk(source_path)
        else:
            raise ValueError(f"Unknown source type: {source_type}")

        # Deduplicate
        chunks = self.deduplicator.deduplicate(chunks)

        # Generate embeddings
        for chunk in chunks:
            chunk.embedding = self.embedder.embed_passage(
                chunk.text,
                ChunkMetadata(**chunk.metadata)
            )

        return chunks
|
|
```
|