1041 lines
39 KiB
Python
1041 lines
39 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Enrich KB Netherlands library entries with claim-level provenance tracking.
|
||
|
||
This script implements proper data provenance following the web_enrichment_provenance.yaml schema:
|
||
- Each factual claim has precise source references
|
||
- Character offsets in source markdown
|
||
- Markdown heading paths for structural context
|
||
- SHA-256 hashes for content verification
|
||
- Exa highlight indices when available
|
||
|
||
Usage:
|
||
python scripts/enrich_kb_libraries_exa_provenance.py [--dry-run] [--limit N] [--file FILENAME]
|
||
|
||
Schema: schemas/web_enrichment_provenance.yaml
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import yaml
|
||
import time
|
||
import hashlib
|
||
import re
|
||
import uuid
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
from typing import Dict, List, Optional, Any, Tuple
|
||
from dataclasses import dataclass, field, asdict
|
||
import logging
|
||
import argparse
|
||
|
||
# Set up logging: timestamped INFO-level messages to stderr via the root handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths
# NOTE(review): absolute, user-specific paths — consider moving to CLI args or
# environment variables so the script runs on other machines.
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")

# Rate limiting
REQUEST_DELAY = 1.5  # seconds between Exa requests
||
|
||
|
||
# =============================================================================
|
||
# DATA CLASSES (matching web_enrichment_provenance.yaml schema)
|
||
# =============================================================================
|
||
|
||
@dataclass
class SourceReference:
    """Precise reference to source text supporting a claim.

    char_start/char_end are offsets into the owning WebSource's raw
    markdown; both are -1 when the excerpt could not be located exactly.
    """
    source_id: str                      # ID of the WebSource the excerpt came from
    text_excerpt: str                   # supporting text (may be truncated)
    char_start: int                     # excerpt start offset in raw markdown, or -1
    char_end: int                       # excerpt end offset in raw markdown, or -1
    markdown_heading_path: Optional[str] = None   # e.g. "# H1 > ## H2 > ### H3"
    sentence_index: Optional[int] = None          # index of the sentence containing char_start
    exa_highlight_index: Optional[int] = None     # index into WebSource.exa_highlights, if derived from one
    relevance_score: Optional[float] = None       # Exa highlight relevance score, if any
||
|
||
|
||
@dataclass
class Claim:
    """A single factual assertion extracted from web sources."""
    claim_id: str                       # e.g. "claim-contact_phone-1"
    claim_type: str                     # ClaimTypeEnum value (e.g. DESCRIPTIVE, CONTACT, SERVICE)
    field_path: str                     # dotted target field, e.g. "contact.phone"
    value: Any                          # the asserted value (string, list, or dict)
    value_type: str                     # ValueTypeEnum value (e.g. STRING, LIST_STRING, OBJECT)
    source_references: List[SourceReference]  # where in the sources this claim is supported
    confidence_score: float             # extractor confidence (nominally 0.0-1.0)
    verified: bool = False              # True once the claim has been checked
    verified_by: Optional[str] = None   # who verified the claim
    verified_date: Optional[str] = None # when it was verified
    claim_notes: Optional[str] = None   # free-form notes about the claim
||
|
||
|
||
@dataclass
class WebSource:
    """A web page fetched and used as source for claims."""
    source_id: str                      # "src-" + first 12 hex chars of SHA-256(url)
    url: str                            # page URL
    fetch_timestamp: str                # ISO-8601 timestamp of the fetch
    http_status: Optional[int] = None   # HTTP status; Exa does not report one (200 is assumed)
    content_type: Optional[str] = None  # MIME type of the fetched content
    title: Optional[str] = None         # page title, if provided
    author: Optional[str] = None        # page author, if provided
    published_date: Optional[str] = None  # publication date, if provided
    raw_markdown: Optional[str] = None    # full page content as markdown
    raw_markdown_hash: Optional[str] = None  # SHA-256 of raw_markdown for later verification
    exa_highlights: List[str] = field(default_factory=list)          # Exa-selected passages
    exa_highlight_scores: List[float] = field(default_factory=list)  # relevance scores, parallel to exa_highlights
||
|
||
|
||
@dataclass
class WebEnrichment:
    """Container for all web-enriched data with full provenance tracking."""
    enrichment_id: str                  # "enrich-YYYYMMDDTHHMMSS-xxxxxxxx"
    search_query: str                   # the query sent to the search engine
    search_timestamp: str               # ISO-8601 timestamp of the search
    search_engine: str                  # e.g. "exa"
    claims: List[Claim]                 # all claims extracted from the sources
    raw_sources: List[WebSource]        # the fetched pages the claims came from
    enrichment_status: str              # EnrichmentStatusEnum value (SUCCESS / PARTIAL / NO_RESULTS)
    enrichment_notes: Optional[str] = None  # human-readable summary of the run
||
|
||
|
||
# =============================================================================
|
||
# UTILITY FUNCTIONS
|
||
# =============================================================================
|
||
|
||
def generate_enrichment_id() -> str:
    """Generate unique enrichment ID: enrich-YYYYMMDDTHHMMSS-xxxxxxxx"""
    now_utc = datetime.now(timezone.utc)
    random_part = uuid.uuid4().hex[:8]
    return "enrich-{}-{}".format(now_utc.strftime('%Y%m%dT%H%M%S'), random_part)
|
||
|
||
|
||
def generate_source_id(url: str) -> str:
    """Generate source ID from URL hash."""
    digest = hashlib.sha256(url.encode()).hexdigest()
    return "src-" + digest[:12]
|
||
|
||
|
||
def generate_claim_id(field_path: str, index: int = 1) -> str:
    """Generate claim ID from field path."""
    # Lowercase, then collapse anything outside [a-z0-9_] to underscores
    normalized = re.sub(r'[^a-z0-9_]', '_', field_path.lower())
    return f"claim-{normalized}-{index}"
|
||
|
||
|
||
def compute_content_hash(content: str) -> str:
    """Compute SHA-256 hash of content."""
    encoded = content.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()
|
||
|
||
|
||
def find_text_position(haystack: str, needle: str) -> Tuple[int, int]:
    """Find character positions of needle in haystack.

    Tries an exact match first, then a case-insensitive one.
    Returns (-1, -1) when the needle is not found at all.
    """
    position = haystack.find(needle)
    if position < 0:
        position = haystack.lower().find(needle.lower())
    if position < 0:
        return (-1, -1)
    return (position, position + len(needle))
|
||
|
||
|
||
def extract_markdown_heading_path(markdown: str, char_position: int) -> Optional[str]:
    """
    Extract the markdown heading path for a given character position.
    Returns format: "# H1 > ## H2 > ### H3"
    """
    if char_position < 0 or char_position >= len(markdown):
        return None

    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    # Stack of (level, title) ancestors for the current position
    ancestors: List[Tuple[int, str]] = []
    offset = 0

    for line in markdown.split('\n'):
        # Stop once the current line starts past the target position
        if offset > char_position:
            break

        match = heading_re.match(line)
        if match:
            depth = len(match.group(1))
            # Drop siblings and deeper headings; keep only shallower ancestors
            ancestors = [(lvl, title) for lvl, title in ancestors if lvl < depth]
            ancestors.append((depth, match.group(2).strip()))

        offset += len(line) + 1  # +1 for the newline

    if not ancestors:
        return None

    return " > ".join(f"{'#' * lvl} {title}" for lvl, title in ancestors)
|
||
|
||
|
||
def get_sentence_index(text: str, char_position: int) -> Optional[int]:
    """Get the sentence index for a character position."""
    if char_position < 0:
        return None

    # Split only a bounded window; +100 leaves slack past the target position
    window = text[:char_position + 100]
    pieces = re.split(r'(?<=[.!?])\s+', window)

    offset = 0
    for idx, piece in enumerate(pieces):
        if char_position <= offset + len(piece):
            return idx
        offset += len(piece) + 1

    # Position fell beyond the window; attribute it to the last sentence
    return len(pieces) - 1
|
||
|
||
|
||
# =============================================================================
|
||
# CLAIM EXTRACTION WITH PROVENANCE
|
||
# =============================================================================
|
||
|
||
class ClaimExtractor:
    """Extract claims from a single WebSource's markdown.

    Every extracted Claim carries one or more SourceReference objects that
    point at the exact character span in the source markdown supporting it.
    Extraction is regex-driven and tailored to Dutch public-library pages
    (including the Schiedam "green library" case).
    """

    def __init__(self, source: WebSource):
        self.source = source
        self.markdown = source.raw_markdown or ""
        self.claims: List[Claim] = []
        # Per-field-path counter so repeated claims get distinct IDs
        self.claim_counter: Dict[str, int] = {}

    def _next_claim_id(self, field_path: str) -> str:
        """Get next claim ID for a field path (1-based per-path counter)."""
        if field_path not in self.claim_counter:
            self.claim_counter[field_path] = 0
        self.claim_counter[field_path] += 1
        return generate_claim_id(field_path, self.claim_counter[field_path])

    def _create_source_reference(
        self,
        excerpt: str,
        exa_highlight_index: Optional[int] = None,
        relevance_score: Optional[float] = None
    ) -> Optional[SourceReference]:
        """Create a source reference with character offsets.

        Always returns a SourceReference (despite the Optional annotation):
        when the excerpt cannot be located in the markdown, char_start and
        char_end are -1 and the excerpt is truncated to 200 characters.
        """
        char_start, char_end = find_text_position(self.markdown, excerpt)

        if char_start == -1:
            # Text not found - use approximate reference without offsets
            return SourceReference(
                source_id=self.source.source_id,
                text_excerpt=(excerpt[:200] + "...") if len(excerpt) > 200 else excerpt,
                char_start=-1,
                char_end=-1,
                markdown_heading_path=None,
                sentence_index=None,
                exa_highlight_index=exa_highlight_index,
                relevance_score=relevance_score
            )

        heading_path = extract_markdown_heading_path(self.markdown, char_start)
        sentence_idx = get_sentence_index(self.markdown, char_start)

        return SourceReference(
            source_id=self.source.source_id,
            text_excerpt=excerpt[:500] if len(excerpt) > 500 else excerpt,
            char_start=char_start,
            char_end=char_end,
            markdown_heading_path=heading_path,
            sentence_index=sentence_idx,
            exa_highlight_index=exa_highlight_index,
            relevance_score=relevance_score
        )

    def extract_description(self) -> Optional[Claim]:
        """Extract description claim from first meaningful paragraph.

        Skips short paragraphs, list/table/heading lines, and obvious
        navigation text ("skip to" / "jump to").
        """
        paragraphs = re.split(r'\n\n+', self.markdown)

        for para in paragraphs:
            # Skip short paragraphs, navigation, lists
            if len(para) < 100:
                continue
            if para.strip().startswith(('- ', '* ', '|', '#')):
                continue
            if 'skip to' in para.lower() or 'jump to' in para.lower():
                continue

            # Found a good paragraph; normalize whitespace before excerpting
            clean_para = re.sub(r'\s+', ' ', para).strip()
            if len(clean_para) > 50:
                excerpt = clean_para[:500]
                ref = self._create_source_reference(excerpt)

                if ref:
                    return Claim(
                        claim_id=self._next_claim_id("description"),
                        claim_type="DESCRIPTIVE",
                        field_path="description",
                        value=(excerpt + "...") if len(clean_para) > 500 else clean_para,
                        value_type="STRING",
                        source_references=[ref],
                        confidence_score=0.8,
                        verified=False
                    )

        return None

    def extract_green_library_features(self) -> List[Claim]:
        """Extract claims about 'green library' features (specific to Schiedam case)."""
        claims = []
        text_lower = self.markdown.lower()

        # Check for "green library" mention before running the pattern set
        if "'green' library" in text_lower or "green library" in text_lower:
            # (pattern, target field path); weight/height patterns capture two numbers
            patterns = [
                (r"first 'green' library in the Netherlands", "notable_features.green_library.distinction"),
                (r"first green library", "notable_features.green_library.distinction"),
                (r"trees? (?:that )?(?:are |weigh(?:ing)? )?(\d+)[- ]to[- ](\d+)\s*(?:kilos?|kg)", "notable_features.green_library.tree_weights"),
                (r"(\d+)[- ](?:to|-)[- ](\d+)\s*met(?:er|re)s?\s*high", "notable_features.green_library.tree_heights"),
                (r"large trees?\s+(\d+)[- ](?:to|-)[- ](\d+)\s*met", "notable_features.green_library.tree_heights"),
            ]

            for pattern, field_path in patterns:
                match = re.search(pattern, self.markdown, re.IGNORECASE)
                if match:
                    excerpt = match.group(0)
                    ref = self._create_source_reference(excerpt)

                    if ref:
                        # Numeric fields get a normalized "A-B unit" value;
                        # distinction claims keep the matched text verbatim
                        if "weights" in field_path:
                            value = f"{match.group(1)}-{match.group(2)} kg"
                        elif "heights" in field_path:
                            value = f"{match.group(1)}-{match.group(2)} meters"
                        else:
                            value = excerpt

                        claims.append(Claim(
                            claim_id=self._next_claim_id(field_path),
                            claim_type="ARCHITECTURAL",
                            field_path=field_path,
                            value=value,
                            value_type="STRING",
                            source_references=[ref],
                            confidence_score=0.9,
                            verified=False
                        ))

        return claims

    def extract_sustainability_features(self) -> List[Claim]:
        """Extract sustainability/green design features."""
        claims = []

        # (pattern, field path, canonical value recorded for the claim)
        sustainability_patterns = [
            (r"recycled bookcases?.*?cardboard", "notable_features.sustainability", "Recycled bookcases made from industrial cardboard"),
            (r"LED lighting", "notable_features.sustainability", "LED lighting"),
            (r"climate control.*?planters?", "notable_features.sustainability", "Climate control system in planters"),
            (r"chairs?.*?PET bottles?", "notable_features.sustainability", "Chairs made from recycled PET bottles"),
        ]

        for pattern, field_path, default_value in sustainability_patterns:
            match = re.search(pattern, self.markdown, re.IGNORECASE)
            if match:
                excerpt = match.group(0)
                ref = self._create_source_reference(excerpt)

                if ref:
                    claims.append(Claim(
                        claim_id=self._next_claim_id(field_path),
                        claim_type="ARCHITECTURAL",
                        field_path=field_path,
                        value=default_value,
                        value_type="STRING",
                        source_references=[ref],
                        confidence_score=0.85,
                        verified=False
                    ))

        return claims

    def extract_tree_species(self) -> List[Claim]:
        """Extract tree species mentions."""
        claims = []

        # Pattern for tree species (Bucida, Tamarinde, Ficus, etc.).
        # Longest alternative first: re alternation is leftmost-wins, so
        # "Ficus benjamina" must precede "Ficus" to ever match in full.
        species_pattern = r'\b(Ficus benjamina|Bucida|Tamarinde?|Ficus)\b'

        matches = list(re.finditer(species_pattern, self.markdown, re.IGNORECASE))
        if matches:
            # Deduplicate, then sort for deterministic output ordering
            species_list = sorted({m.group(1).title() for m in matches})

            # Use first match for the source reference, with surrounding context
            first_match = matches[0]
            start = max(0, first_match.start() - 20)
            end = min(len(self.markdown), first_match.end() + 20)
            excerpt = self.markdown[start:end]

            ref = self._create_source_reference(excerpt)
            if ref:
                claims.append(Claim(
                    claim_id=self._next_claim_id("notable_features.green_library.tree_species"),
                    claim_type="DESCRIPTIVE",
                    field_path="notable_features.green_library.tree_species",
                    value=species_list,
                    value_type="LIST_STRING",
                    source_references=[ref],
                    confidence_score=0.95,
                    verified=False
                ))

        return claims

    def extract_accessibility_features(self) -> List[Claim]:
        """Extract accessibility information."""
        claims = []

        # (pattern, field path, canonical value recorded for the claim)
        accessibility_patterns = [
            (r"wheelchair", "notable_features.accessibility", "Wheelchair accessible"),
            (r"ramps?.*?disabled access", "notable_features.accessibility", "Ramps for disabled access"),
            (r"lift|elevator", "notable_features.accessibility", "Lift/elevator available"),
            (r"wheelchairs? available", "notable_features.accessibility", "Wheelchairs available for use"),
        ]

        for pattern, field_path, default_value in accessibility_patterns:
            match = re.search(pattern, self.markdown, re.IGNORECASE)
            if match:
                excerpt = match.group(0)
                ref = self._create_source_reference(excerpt)

                if ref:
                    claims.append(Claim(
                        claim_id=self._next_claim_id(field_path),
                        claim_type="SERVICE",
                        field_path=field_path,
                        value=default_value,
                        value_type="STRING",
                        source_references=[ref],
                        confidence_score=0.85,
                        verified=False
                    ))

        return claims

    def extract_historic_building_info(self) -> List[Claim]:
        """Extract historic building information (architect, monument status)."""
        claims = []

        # Architect: "designed by Name" or "architect: Name" (up to two capitalized words)
        architect_match = re.search(
            r'(?:designed by|architect[:\s]+)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
            self.markdown
        )
        if architect_match:
            ref = self._create_source_reference(architect_match.group(0))
            if ref:
                claims.append(Claim(
                    claim_id=self._next_claim_id("historic_building.architect"),
                    claim_type="DESCRIPTIVE",
                    field_path="historic_building.architect",
                    value=architect_match.group(1),
                    value_type="STRING",
                    source_references=[ref],
                    confidence_score=0.85,
                    verified=False
                ))

        # Monument status: any mention of "monument", with surrounding context
        if re.search(r'\bmonument\b', self.markdown, re.IGNORECASE):
            match = re.search(r'monument', self.markdown, re.IGNORECASE)
            if match:
                start = max(0, match.start() - 30)
                end = min(len(self.markdown), match.end() + 30)
                excerpt = self.markdown[start:end]

                ref = self._create_source_reference(excerpt)
                if ref:
                    claims.append(Claim(
                        claim_id=self._next_claim_id("historic_building.status"),
                        claim_type="DESCRIPTIVE",
                        field_path="historic_building.status",
                        value="Monument",
                        value_type="STRING",
                        source_references=[ref],
                        confidence_score=0.8,
                        verified=False
                    ))

        return claims

    def extract_services(self) -> List[Claim]:
        """Extract services offered by the library.

        All detected services are aggregated into one LIST_STRING claim;
        at most 3 source references are kept to bound output size.
        """
        claims = []

        # Canonical service name -> detection pattern (Dutch and English terms)
        service_patterns = {
            "Large book collection": r'large (?:collection of )?books?|book collection',
            "DVDs and Blu-rays": r'DVDs?|Blu-?rays?',
            "Comics collection": r'comics?|graphic novels?',
            "Study spaces": r'study (?:spaces?|room)|reading (?:room|table)|leestafel',
            "Free WiFi": r'free Wi-?Fi|gratis (?:wifi|internet)|Wifi',
            "Coffee service": r'roasted coffee|koffie|coffee',
            "DigiTaalhuis": r'DigiTaalhuis|digitaalhuis',
            "Digicafé": r'digicaf[eé]',
            "Taalcafé": r'taalcaf[eé]',
            "Tax filing help": r'invulhulp|belastingaangifte',
            "Digital skills training": r'klik\s*[&+]\s*tik|digitale vaardigheden',
            "Internet access": r'internettoegang',
            "Interlibrary loan": r'interbibliothecair|boeken afhalen',
        }

        services_found = []
        refs_found = []

        for service, pattern in service_patterns.items():
            match = re.search(pattern, self.markdown, re.IGNORECASE)
            if match:
                services_found.append(service)
                ref = self._create_source_reference(match.group(0))
                if ref:
                    refs_found.append(ref)

        if services_found and refs_found:
            claims.append(Claim(
                claim_id=self._next_claim_id("services"),
                claim_type="SERVICE",
                field_path="services",
                value=services_found,
                value_type="LIST_STRING",
                source_references=refs_found[:3],  # Limit to 3 references
                confidence_score=0.8,
                verified=False
            ))

        return claims

    def extract_contact_info(self) -> List[Claim]:
        """Extract contact information (phone, email, address)."""
        claims = []

        # Phone numbers (Dutch format: 0xx-xxx-xxx(x) or +31 variants)
        phone_match = re.search(r'\b(0\d{2,3}[\s-]?\d{3}[\s-]?\d{3,4}|\+31\s?\d{2,3}[\s-]?\d{3}[\s-]?\d{3,4})\b', self.markdown)
        if phone_match:
            ref = self._create_source_reference(phone_match.group(0))
            if ref:
                claims.append(Claim(
                    claim_id=self._next_claim_id("contact.phone"),
                    claim_type="CONTACT",
                    field_path="contact.phone",
                    value=phone_match.group(1),
                    value_type="STRING",
                    source_references=[ref],
                    confidence_score=0.9,
                    verified=False
                ))

        # Email addresses
        email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', self.markdown)
        if email_match:
            ref = self._create_source_reference(email_match.group(0))
            if ref:
                claims.append(Claim(
                    claim_id=self._next_claim_id("contact.email"),
                    claim_type="CONTACT",
                    field_path="contact.email",
                    value=email_match.group(0),
                    value_type="STRING",
                    source_references=[ref],
                    confidence_score=0.9,
                    verified=False
                ))

        # Dutch postal addresses: "Xstraat 12, 1234 AB City"
        address_match = re.search(r'([A-Z][a-z]+(?:straat|weg|plein|laan)\s+\d+)\s*(?:,\s*)?\n?(\d{4}\s*[A-Z]{2})\s+([A-Z][a-z]+)', self.markdown)
        if address_match:
            full_address = f"{address_match.group(1)}, {address_match.group(2)} {address_match.group(3)}"
            ref = self._create_source_reference(address_match.group(0))
            if ref:
                claims.append(Claim(
                    claim_id=self._next_claim_id("contact.address"),
                    claim_type="GEOGRAPHIC",
                    field_path="contact.address",
                    value=full_address,
                    value_type="STRING",
                    source_references=[ref],
                    confidence_score=0.85,
                    verified=False
                ))

        return claims

    def extract_organizational_info(self) -> List[Claim]:
        """Extract organizational structure information."""
        claims = []

        # Legal entity names (Stichting = Dutch foundation)
        stichting_match = re.search(r'Stichting\s+(?:de\s+)?([A-Z][a-zA-Z\s]+?)(?:\s+is|\s+in|\.|\,)', self.markdown)
        if stichting_match:
            ref = self._create_source_reference(stichting_match.group(0))
            if ref:
                claims.append(Claim(
                    claim_id=self._next_claim_id("organization.legal_name"),
                    claim_type="ORGANIZATIONAL",
                    field_path="organization.legal_name",
                    value=f"Stichting {stichting_match.group(1).strip()}",
                    value_type="STRING",
                    source_references=[ref],
                    confidence_score=0.9,
                    verified=False
                ))

        # Fusion/merger info ("fusie" / "ontstaan uit ... en ...")
        fusie_match = re.search(r'(?:fusie|ontstaan uit)\s+(?:tussen\s+)?(?:de\s+)?(.+?)\s+en\s+(.+?)(?:\s+en|\s+is|\.|,)', self.markdown, re.IGNORECASE)
        if fusie_match:
            ref = self._create_source_reference(fusie_match.group(0))
            if ref:
                claims.append(Claim(
                    claim_id=self._next_claim_id("organization.fusion_components"),
                    claim_type="ORGANIZATIONAL",
                    field_path="organization.fusion_components",
                    value=[fusie_match.group(1).strip(), fusie_match.group(2).strip()],
                    value_type="LIST_STRING",
                    source_references=[ref],
                    confidence_score=0.85,
                    verified=False
                ))

        # Municipalities served ("werkzaam in A, B en C")
        gemeenten_match = re.search(r'(?:werkzaam in|gemeenten?)\s+(.+?(?:,\s*.+?)*\s+en\s+[A-Z][a-z]+)', self.markdown, re.IGNORECASE)
        if gemeenten_match:
            municipalities_text = gemeenten_match.group(1)
            # Parse comma-separated list with "en" before the last item
            municipalities = re.split(r',\s*|\s+en\s+', municipalities_text)
            municipalities = [m.strip() for m in municipalities if m.strip()]

            ref = self._create_source_reference(gemeenten_match.group(0))
            if ref and municipalities:
                claims.append(Claim(
                    claim_id=self._next_claim_id("organization.municipalities_served"),
                    claim_type="GEOGRAPHIC",
                    field_path="organization.municipalities_served",
                    value=municipalities,
                    value_type="LIST_STRING",
                    source_references=[ref],
                    confidence_score=0.85,
                    verified=False
                ))

        return claims

    def extract_opening_hours(self) -> List[Claim]:
        """Extract opening hours information (Dutch day names with HH:MM ranges)."""
        claims = []

        days_pattern = r'(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)[:\s]+(\d{1,2}[:.]\d{2})\s*[-–]\s*(\d{1,2}[:.]\d{2})'
        matches = list(re.finditer(days_pattern, self.markdown, re.IGNORECASE))

        if matches:
            hours = {}
            refs = []
            for match in matches:
                day = match.group(1).lower()
                hours[day] = f"{match.group(2)} - {match.group(3)}"
                ref = self._create_source_reference(match.group(0))
                if ref:
                    refs.append(ref)

            if hours and refs:
                claims.append(Claim(
                    claim_id=self._next_claim_id("opening_hours"),
                    claim_type="TEMPORAL",
                    field_path="opening_hours",
                    value=hours,
                    value_type="OBJECT",
                    source_references=refs[:3],
                    confidence_score=0.9,
                    verified=False
                ))

        return claims

    def extract_from_exa_highlights(self) -> List[Claim]:
        """Extract claims specifically from Exa highlights.

        Only highlights with a relevance score >= 0.7 become claims; the
        score is reused as the claim's confidence.
        """
        claims = []

        for i, highlight in enumerate(self.source.exa_highlights):
            # Scores list may be shorter than highlights; default to 0.0
            score = self.source.exa_highlight_scores[i] if i < len(self.source.exa_highlight_scores) else 0.0

            # Find position of highlight in markdown
            ref = self._create_source_reference(highlight, exa_highlight_index=i, relevance_score=score)

            if ref and score >= 0.7:  # Only high-relevance highlights
                claims.append(Claim(
                    claim_id=self._next_claim_id("exa_highlight"),
                    claim_type="DESCRIPTIVE",
                    field_path=f"highlights[{i}]",
                    value=highlight,
                    value_type="STRING",
                    source_references=[ref],
                    confidence_score=score,
                    verified=False,
                    claim_notes=f"Exa highlight with relevance score {score:.2f}"
                ))

        return claims

    def extract_all_claims(self) -> List[Claim]:
        """Extract all claims from the source by running every extractor."""
        all_claims = []

        # Description (always try)
        desc_claim = self.extract_description()
        if desc_claim:
            all_claims.append(desc_claim)

        # Specific feature extractions
        all_claims.extend(self.extract_green_library_features())
        all_claims.extend(self.extract_sustainability_features())
        all_claims.extend(self.extract_tree_species())
        all_claims.extend(self.extract_accessibility_features())
        all_claims.extend(self.extract_historic_building_info())
        all_claims.extend(self.extract_services())

        # Contact / organizational / temporal extractors
        all_claims.extend(self.extract_contact_info())
        all_claims.extend(self.extract_organizational_info())
        all_claims.extend(self.extract_opening_hours())
        all_claims.extend(self.extract_from_exa_highlights())

        return all_claims
|
||
|
||
|
||
# =============================================================================
|
||
# EXA INTEGRATION (placeholder for MCP-based calls)
|
||
# =============================================================================
|
||
|
||
def process_exa_result(result: Dict[str, Any], fetch_timestamp: str) -> WebSource:
    """Convert an Exa search result dict into a WebSource with full content."""
    url = result.get('url', '')
    markdown_text = result.get('text', '')

    return WebSource(
        source_id=generate_source_id(url),
        url=url,
        fetch_timestamp=fetch_timestamp,
        http_status=200,  # Exa doesn't return this
        content_type="text/markdown",
        title=result.get('title'),
        author=result.get('author'),
        published_date=result.get('publishedDate'),
        raw_markdown=markdown_text,
        raw_markdown_hash=compute_content_hash(markdown_text) if markdown_text else None,
        exa_highlights=result.get('highlights', []),
        exa_highlight_scores=result.get('highlightScores', []),
    )
|
||
|
||
|
||
def create_web_enrichment_from_exa_results(
    results: List[Dict[str, Any]],
    search_query: str
) -> WebEnrichment:
    """Create WebEnrichment from Exa search results with claim-level provenance."""
    fetch_timestamp = datetime.now(timezone.utc).isoformat()

    # Convert each raw result into a WebSource and mine it for claims
    sources: List[WebSource] = []
    all_claims: List[Claim] = []
    for raw_result in results:
        source = process_exa_result(raw_result, fetch_timestamp)
        sources.append(source)
        all_claims.extend(ClaimExtractor(source).extract_all_claims())

    # Status: claims found > sources but no claims > nothing at all
    if all_claims:
        status = "SUCCESS"
    elif sources:
        status = "PARTIAL"
    else:
        status = "NO_RESULTS"

    return WebEnrichment(
        enrichment_id=generate_enrichment_id(),
        search_query=search_query,
        search_timestamp=fetch_timestamp,
        search_engine="exa",
        claims=all_claims,
        raw_sources=sources,
        enrichment_status=status,
        enrichment_notes=f"Extracted {len(all_claims)} claims from {len(sources)} sources"
    )
|
||
|
||
|
||
# =============================================================================
|
||
# YAML CONVERSION
|
||
# =============================================================================
|
||
|
||
def source_reference_to_dict(ref: SourceReference) -> Dict[str, Any]:
    """Convert SourceReference to dict for YAML; optional fields only when set."""
    out: Dict[str, Any] = {
        'source_id': ref.source_id,
        'text_excerpt': ref.text_excerpt,
        'char_start': ref.char_start,
        'char_end': ref.char_end,
    }
    if ref.markdown_heading_path:
        out['markdown_heading_path'] = ref.markdown_heading_path
    # These may legitimately be 0/0.0, so compare against None explicitly
    for key, val in (('sentence_index', ref.sentence_index),
                     ('exa_highlight_index', ref.exa_highlight_index),
                     ('relevance_score', ref.relevance_score)):
        if val is not None:
            out[key] = val
    return out
|
||
|
||
|
||
def claim_to_dict(claim: Claim) -> Dict[str, Any]:
    """Convert Claim to dict for YAML; verification fields only when set."""
    out: Dict[str, Any] = {
        'claim_id': claim.claim_id,
        'claim_type': claim.claim_type,
        'field_path': claim.field_path,
        'value': claim.value,
        'value_type': claim.value_type,
        'source_references': [source_reference_to_dict(r) for r in claim.source_references],
        'confidence_score': claim.confidence_score,
        'verified': claim.verified,
    }
    for key, val in (('verified_by', claim.verified_by),
                     ('verified_date', claim.verified_date),
                     ('claim_notes', claim.claim_notes)):
        if val:
            out[key] = val
    return out
|
||
|
||
|
||
def web_source_to_dict(source: WebSource, include_raw: bool = False) -> Dict[str, Any]:
    """Convert WebSource to dict for YAML; raw markdown only on request."""
    out: Dict[str, Any] = {
        'source_id': source.source_id,
        'url': source.url,
        'fetch_timestamp': source.fetch_timestamp,
    }

    # Optional metadata: emit only truthy values, preserving key order
    optional_fields = (
        ('http_status', source.http_status),
        ('title', source.title),
        ('author', source.author),
        ('published_date', source.published_date),
        ('raw_markdown_hash', source.raw_markdown_hash),
        ('exa_highlights', source.exa_highlights),
        ('exa_highlight_scores', source.exa_highlight_scores),
    )
    for key, val in optional_fields:
        if val:
            out[key] = val

    # Full raw content can be large; include only when explicitly requested
    if include_raw and source.raw_markdown:
        out['raw_markdown'] = source.raw_markdown

    return out
|
||
|
||
|
||
def web_enrichment_to_dict(enrichment: WebEnrichment, include_raw: bool = False) -> Dict[str, Any]:
    """Convert WebEnrichment to dict for YAML storage."""
    serialized_claims = [claim_to_dict(c) for c in enrichment.claims]
    serialized_sources = [web_source_to_dict(s, include_raw) for s in enrichment.raw_sources]
    return {
        'enrichment_id': enrichment.enrichment_id,
        'search_query': enrichment.search_query,
        'search_timestamp': enrichment.search_timestamp,
        'search_engine': enrichment.search_engine,
        'enrichment_status': enrichment.enrichment_status,
        'enrichment_notes': enrichment.enrichment_notes,
        'claims': serialized_claims,
        'raw_sources': serialized_sources,
    }
|
||
|
||
|
||
# =============================================================================
|
||
# FILE OPERATIONS
|
||
# =============================================================================
|
||
|
||
def load_kb_library_file(filepath: Path) -> Dict[str, Any]:
    """Load a single KB library YAML file."""
    with filepath.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
|
||
|
||
|
||
def save_kb_library_file(filepath: Path, data: Dict[str, Any]):
    """Save KB library data to YAML file."""
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, default_flow_style=False,
                  allow_unicode=True, sort_keys=False)
|
||
|
||
|
||
def get_library_website(entry: Dict[str, Any]) -> Optional[str]:
    """Extract website URL from entry.

    Prefers the Google Maps enrichment, then falls back to the Wikidata
    enrichment's 'Website' identifier. Returns None when neither is set.
    """
    # `or {}` guards against explicit nulls in the YAML (key present,
    # value None), which would otherwise raise AttributeError on .get()
    google_enrichment = entry.get('google_maps_enrichment') or {}
    if google_enrichment.get('website'):
        return google_enrichment['website']

    wikidata_enrichment = entry.get('wikidata_enrichment') or {}
    wikidata_ids = wikidata_enrichment.get('wikidata_identifiers') or {}
    if wikidata_ids.get('Website'):
        return wikidata_ids['Website']

    return None
|
||
|
||
|
||
def get_library_name(entry: Dict[str, Any]) -> str:
    """Extract library name from entry (falls back to 'Unknown Library')."""
    return entry.get('original_entry', {}).get('organisatie', 'Unknown Library')
|
||
|
||
|
||
def get_library_city(entry: Dict[str, Any]) -> str:
    """Extract library city from entry (empty string when absent)."""
    return entry.get('original_entry', {}).get('plaatsnaam_bezoekadres', '')
|
||
|
||
|
||
# =============================================================================
|
||
# MAIN PROCESSING
|
||
# =============================================================================
|
||
|
||
def process_single_file(
    filepath: Path,
    exa_results: List[Dict[str, Any]],
    search_query: str,
    dry_run: bool = False
) -> bool:
    """
    Process a single file with Exa results and add claim-level provenance.

    Args:
        filepath: Path to YAML file
        exa_results: Results from Exa search
        search_query: The search query used
        dry_run: If True, don't write changes

    Returns:
        True if successful, False otherwise
    """
    try:
        data = load_kb_library_file(filepath)
        library_name = get_library_name(data)
        logger.info(f"Processing: {library_name}")

        # Build the provenance-tracked enrichment from the Exa results
        enrichment = create_web_enrichment_from_exa_results(exa_results, search_query)
        logger.info(f" - Extracted {len(enrichment.claims)} claims from {len(enrichment.raw_sources)} sources")

        # Drop superseded enrichment formats before attaching the new one
        for legacy_key in ('exa_enrichment', 'website_enrichment'):
            data.pop(legacy_key, None)

        data['web_enrichment'] = web_enrichment_to_dict(enrichment, include_raw=False)

        if dry_run:
            logger.info(f" - [DRY RUN] Would save to {filepath.name}")
        else:
            save_kb_library_file(filepath, data)
            logger.info(f" - Saved to {filepath.name}")

        return True

    except Exception as e:
        logger.error(f"Error processing {filepath}: {e}")
        return False
|
||
|
||
|
||
def main():
    """Main entry point: parse CLI args, list target files, preview status."""
    parser = argparse.ArgumentParser(
        description='Enrich KB libraries with claim-level provenance tracking'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of entries to process')
    parser.add_argument('--file', type=str, default=None,
                        help='Process a specific file (e.g., 1377_kb_isil.yaml)')
    args = parser.parse_args()

    banner = "=" * 70
    logger.info(banner)
    logger.info("KB Netherlands Libraries - Claim-Level Provenance Enrichment")
    logger.info(banner)
    logger.info("Schema: schemas/web_enrichment_provenance.yaml")
    logger.info("")

    # Resolve the set of files to process
    if args.file:
        target = ENTRIES_DIR / args.file
        if not target.exists():
            logger.error(f"File not found: {target}")
            return 1
        files = [target]
    else:
        files = sorted(ENTRIES_DIR.glob("*_kb_isil.yaml"))

    logger.info(f"Found {len(files)} KB library files")

    if args.limit:
        files = files[:args.limit]
        logger.info(f"Limited to {len(files)} files")

    # Print instructions for MCP-based enrichment
    logger.info("")
    logger.info(banner)
    logger.info("This script processes Exa results into claim-level provenance.")
    logger.info("To use: Pass Exa search results as JSON to stdin, or integrate with MCP.")
    logger.info(banner)
    logger.info("")

    # Preview: show the first few entries and their enrichment status
    for entry_path in files[:5]:
        data = load_kb_library_file(entry_path)
        name = get_library_name(data)
        city = get_library_city(data)
        website = get_library_website(data)

        has_legacy = 'exa_enrichment' in data or 'website_enrichment' in data
        if 'web_enrichment' in data:
            status = "NEW"
        elif has_legacy:
            status = "LEGACY"
        else:
            status = "NONE"

        logger.info(f" {entry_path.name}: {name} ({city})")
        logger.info(f" Website: {website or 'Not found'}")
        logger.info(f" Status: {status}")

    if len(files) > 5:
        logger.info(f" ... and {len(files) - 5} more files")

    return 0
|
||
|
||
|
||
if __name__ == "__main__":
|
||
sys.exit(main())
|