# glam/src/glam_extractor/annotators/llm_annotator.py
# Last modified: 2025-12-14 17:09:55 +01:00
# 1992 lines, 80 KiB, Python
"""
LLM-Based Agentic Annotator for GLAM Documents.
This module provides LLM-only entity annotation following GLAM-NER v1.7.0.
NO HEURISTIC/PATTERN-BASED METHODS - all recognition is done via LLM inference.
Supported LLM Providers:
- Z.AI (Zhipu AI) GLM-4 (default)
- Anthropic Claude
- OpenAI GPT-4
Based on GLAM-NER v1.7.0-unified Entity Annotation Convention.
Features:
- Exponential backoff retry for rate limits (429)
- Automatic provider fallback (Z.AI → Claude → OpenAI)
- Configurable retry attempts and delays
"""
import asyncio
import json
import logging
import os
import random
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from enum import Enum
# Configure logging
logger = logging.getLogger(__name__)
# Import base classes
from .base import (
AnnotationSession,
EntityClaim,
LayoutClaim,
AggregateClaim,
ImageClaim,
Provenance,
EntityHypernym,
LayoutRegion,
SemanticRole,
RelationshipClaim,
RelationshipSubject,
RelationshipObject,
RelationshipPredicate,
RelationshipHypernym,
TemporalScope,
SpatialScope,
RelationshipQualifier,
get_ontology_class,
validate_relationship_constraints,
)
from .html_parser import HTMLDocument
from .schema_builder import GLAMSchema, FieldSpec, heritage_custodian_schema
class LLMProvider(str, Enum):
    """Supported LLM providers.

    Mixes in ``str`` so members compare equal to their string values and
    serialize directly (e.g. ``provider == "zai"`` and JSON dumps work).
    """
    ZAI = "zai"              # Zhipu AI GLM-4
    ANTHROPIC = "anthropic"  # Claude
    OPENAI = "openai"        # GPT-4
@dataclass
class RetryConfig:
    """Configuration for retry logic with exponential backoff.

    The delay for attempt *n* (0-indexed) is
    ``base_delay * exponential_base ** n``, capped at ``max_delay``,
    optionally multiplied by a random jitter factor.
    """
    max_retries: int = 5            # Maximum retry attempts
    base_delay: float = 1.0         # Initial delay in seconds
    max_delay: float = 60.0         # Maximum delay cap
    exponential_base: float = 2.0   # Exponential backoff base
    jitter: bool = True             # Add random jitter to prevent thundering herd
    retry_on_status: tuple = (429, 500, 502, 503, 504)  # HTTP status codes to retry
@dataclass
class LLMAnnotatorConfig:
    """Configuration for LLM-based annotation."""
    provider: LLMProvider = LLMProvider.ZAI
    model: str = "glm-4.6"           # Z.AI's latest model
    api_key: Optional[str] = None
    temperature: float = 0.1         # Low temp for consistent extraction
    max_tokens: int = 4096
    timeout: int = 120               # Longer timeout for LLM calls
    # Annotation settings
    extract_entities: bool = True
    extract_layout: bool = True
    extract_claims: bool = True
    extract_images: bool = True      # Enable image analysis via vision model
    # Vision model settings (for image analysis)
    vision_model: str = "glm-4.5v"   # Z.AI's vision model
    vision_max_tokens: int = 2048    # Max tokens for vision response
    max_images_per_page: int = 10    # Limit images analyzed per page
    min_image_size: int = 50         # Minimum dimension (width or height) to analyze
    # Provenance settings
    context_convention: str = "GLAM-NER v1.7.0-unified"
    # Retry settings
    retry: RetryConfig = field(default_factory=RetryConfig)
    # Fallback providers (tried in order when primary fails)
    fallback_providers: Optional[List[LLMProvider]] = None

    def __post_init__(self):
        """Resolve the API key from the environment and build the fallback chain."""
        # One table drives both the env-var lookup and the fallback ordering.
        provider_env_vars = {
            LLMProvider.ZAI: "ZAI_API_TOKEN",
            LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
            LLMProvider.OPENAI: "OPENAI_API_KEY",
        }
        if self.api_key is None:
            env_name = provider_env_vars.get(self.provider)
            if env_name is not None:
                self.api_key = os.environ.get(env_name)
        # Default fallback chain: every other known provider, in fixed order
        # (dict insertion order: ZAI, ANTHROPIC, OPENAI).
        if self.fallback_providers is None:
            self.fallback_providers = [
                p for p in provider_env_vars if p != self.provider
            ]
# =============================================================================
# GLAM-NER v1.7.0 SYSTEM PROMPT
# =============================================================================
# Sent verbatim as the system message for every text-annotation call.
# NOTE: the parser of the model's reply depends on the JSON schema described
# below — edit this string and _parse_response together.
GLAM_NER_SYSTEM_PROMPT: str = """You are an expert entity annotator following the GLAM-NER v1.7.0-unified Entity Annotation Convention.
Your task is to extract structured claims from heritage institution documents with full provenance.
## HYPERNYMS AND HYPONYMS (10 types with subcategories)
### 1. AGT (Agent): Humans, animals, AI, fictional characters
**Subcategories:**
- AGT.PER: Person - INDIVIDUAL human beings with SPECIFIC NAMES (maps to crm:E21_Person)
⚠️ STRICT EXCLUSIONS - Do NOT tag as AGT.PER:
• Groups/collectives: "staff", "members", "curators", "colleagues", "board", "team", "committee", "participants", "community"
• Plural person references: "archivists", "researchers", "visitors", "filmmakers", "historians"
• Role descriptions without names: "the curator", "a researcher", "museum director"
• Organizations/events with role words: "FIAF Commission members", "conference colleagues", "board members"
• Networks: "VPRO/Tegenlicht network", "ACE member institutions"
• Topic references: "Verhalen van Bolsward" (stories about something)
• Conference/event participants: "Women and Silent Screen Conference participants"
• Fund/foundation board: "Prince Claus Fund board members"
• Festival communities: "Le Giornate del Cinema Muto community"
• Generic collectives: "community", "network", "consortium", "association"
✓ ONLY tag as AGT.PER: Named individuals like "Giovanna Fossati", "Dr. Jan van der Berg", "Martin Scorsese"
- AGT.STF: Staff - personnel in professional roles (maps to pico:PersonObservation)
- AGT.COL: Collective - named collectives without formal structure
- AGT.FIC: Fictional - characters from fiction/mythology
- AGT.MYT: Mythological - gods, deities, legendary figures
- AGT.ANI: Animal - named individual animals with agency
- AGT.ART: Artificial - AI systems, robots, software agents (maps to prov:SoftwareAgent)
Examples: "Dr. Jan van der Berg" → AGT.PER, "Giovanna Fossati" → AGT.PER, "the museum director" → AGT.STF
❌ NOT AGT.PER: "AMIA conference colleagues", "Prince Claus Fund board members", "festival community"
### 2. GRP (Group): Organizations, collectives, formal and informal
**Subcategories:**
- GRP.HER: Heritage institutions - museums, archives, libraries (maps to glam:HeritageCustodian)
- GRP.PAR: Parent/governing bodies (maps to rico:CorporateBody)
- GRP.UNT: Organizational units/departments (maps to org:OrganizationalUnit)
- GRP.COR: Corporations and businesses (maps to schema:Corporation)
- GRP.GOV: Government agencies (maps to schema:GovernmentOrganization)
- GRP.EDU: Educational institutions (maps to schema:EducationalOrganization)
- GRP.REL: Religious organizations (maps to schema:ReligiousOrganization)
- GRP.ASS: Associations and societies (maps to org:FormalOrganization)
- GRP.INF: Informal groups - movements, families, dynasties
- GRP.HIS: Historical organizations - defunct entities
- GRP.ETH: Ethnic groups - Jews, Roma, Sinti, indigenous peoples (maps to crm:E74_Group)
Examples: "Rijksmuseum" → GRP.HER, "Ministry of Culture" → GRP.GOV, "Joden" → GRP.ETH
### 3. TOP (Toponym): Place names, nominal geographic references
**Subcategories:**
- TOP.SET: Settlement - cities, towns, villages (maps to schema:City)
- TOP.REG: Region - provinces, states, counties (maps to schema:AdministrativeArea)
- TOP.CTY: Country - nations, sovereign states (maps to schema:Country)
- TOP.ADR: Address - street addresses (maps to schema:PostalAddress)
- TOP.IAD: Institutional address
- TOP.BLD: Building - named buildings, monuments (maps to crm:E18_Physical_Thing)
- TOP.NAT: Natural features - mountains, rivers
- TOP.HIS: Historical places - concentration camps, transit camps, former territories (maps to crm:E53_Place)
- TOP.LEG: Legendary/fictional places
Examples: "Amsterdam" → TOP.SET, "the Netherlands" → TOP.CTY, "Auschwitz" → TOP.HIS
### 4. GEO (Geometry): Coordinates, shapes, spatial data
**Subcategories:**
- GEO.PNT: Point coordinates (maps to geo:Point)
- GEO.LIN: Line/path (maps to geo:LineString)
- GEO.POL: Polygon/area (maps to geo:Polygon)
- GEO.BOX: Bounding box (maps to geo:Envelope)
Examples: "52.3676° N, 4.9041° E" → GEO.PNT
### 5. TMP (Temporal): Dates, times, durations, periods
**Subcategories:**
- TMP.DAT: Absolute date - specific point (maps to time:Instant) [alias: TMP.DAB]
- TMP.DAB: Date Absolute - specific date "1885-03-22" (maps to time:Instant)
- TMP.DRL: Date Relative - "last year", "recently", "two weeks ago" (maps to time:Instant)
- TMP.TIM: Time of day (maps to time:Instant) [alias: TMP.TAB]
- TMP.TAB: Time Absolute - specific time "14:30:00" (maps to time:Instant)
- TMP.TRL: Time Relative - "later that evening", "soon after" (maps to time:Instant)
- TMP.DUR: Duration/period - "three hours", "from 1885 to 1890" (maps to time:Duration)
- TMP.RNG: Date range - "1885-1890", "March 1-15" (maps to time:Interval)
- TMP.SET: Recurring time - "every Monday", "annually"
- TMP.OPH: Opening hours - "Tuesday-Sunday 10:00-17:00" (maps to schema:OpeningHoursSpecification)
- TMP.REL: Relative time - "before", "after" [deprecated, use TMP.DRL/TMP.TRL]
- TMP.CEN: Century - "17th century", "the 1800s" (maps to crm:E4_Period)
- TMP.ERA: Historical era/period name - "Renaissance", "Bronze Age" (maps to crm:E4_Period)
- TMP.EXP: Exhibition period - "10 February - 4 June 2023" (maps to time:Interval)
Examples: "1885" → TMP.DAB, "18th century" → TMP.CEN, "every Tuesday" → TMP.SET, "10:00-17:00" → TMP.OPH
### 6. APP (Appellation): Identifiers, codes, reference numbers
**Subcategories:**
- APP.ISL: ISIL code (maps to crm:E42_Identifier)
- APP.WKD: Wikidata ID (maps to crm:E42_Identifier)
- APP.VIF: VIAF ID (maps to crm:E42_Identifier)
- APP.DOI: DOI
- APP.URL: URL/URI (maps to schema:URL)
- APP.ISBN: ISBN
- APP.ISSN: ISSN
- APP.KVK: Dutch Chamber of Commerce number
- APP.TTL: Title of work (maps to crm:E35_Title) [alias: APP.TIT]
- APP.TIT: Title of work (maps to crm:E35_Title)
- APP.NAM: Personal name - structured (maps to pnv:PersonName) [alias: APP.PNM]
- APP.PNM: Personal name - structured (maps to pnv:PersonName)
- APP.AWD: Award name
- APP.COL: Collection name
- APP.EXH: Exhibition name/title (maps to crm:E35_Title)
Examples: "ISIL NL-AmRM" → APP.ISL, "Q190804" → APP.WKD, "Rembrandt and His Era" → APP.EXH
### 7. ROL (Role): Titles, positions, honorifics, occupations
**Subcategories:**
- ROL.OCC: Occupation - profession, trade, job title (maps to schema:Occupation)
- ROL.TTL: Title/honorific - "Dr.", "Prof.", academic/professional titles (maps to schema:Role)
- ROL.HON: Honorific - "Sir", "Dame", "The Honorable" (maps to schema:honorificPrefix)
- ROL.NOB: Nobility title - "Duke", "Baron", "Count", hereditary titles (maps to schema:honorificSuffix)
- ROL.POS: Position/office - "Director", "Chairman" (maps to org:Post)
- ROL.REL: Relational role - father, mother, kinship (maps to bio:Relationship)
- ROL.REL.REL: Religious role - "Bishop", "Rabbi", "Imam" (maps to schema:Role)
Examples: "Director" → ROL.POS, "Prof. Dr." → ROL.TTL, "Duke of Wellington" → ROL.NOB, "Rabbi" → ROL.REL.REL
### 8. WRK (Work): Works following FRBR model
**Subcategories:**
- WRK.WRK: FRBR Work - abstract (maps to frbroo:F1_Work) [alias: WRK.ABS]
- WRK.ABS: Abstract work (maps to frbroo:F1_Work)
- WRK.EXP: FRBR Expression (maps to frbroo:F2_Expression)
- WRK.MAN: FRBR Manifestation (maps to frbroo:F3_Manifestation)
- WRK.ITM: FRBR Item (maps to frbroo:F5_Item)
- WRK.MSS: Manuscript - handwritten/unpublished work (maps to rico:Record)
- WRK.ARC: Archival record/document (maps to rico:Record)
- WRK.TXT: Textual work (maps to schema:Book)
- WRK.VIS: Visual work (maps to schema:VisualArtwork)
- WRK.MUS: Musical work (maps to schema:MusicComposition)
- WRK.PER: Performance (maps to schema:PerformingArtsEvent)
- WRK.CIN: Cinematic work (maps to schema:Movie)
- WRK.OBJ: Physical object/artifact (maps to crm:E22_Human-Made_Object)
- WRK.COL: Collection (maps to crm:E78_Curated_Holding)
- WRK.SER: Series (maps to schema:CreativeWorkSeries)
- WRK.WEB: Web resource/page (maps to schema:WebPage)
- WRK.URL: URL reference to work/link (maps to schema:URL)
- WRK.EML: Email message (maps to schema:Message)
- WRK.SOC: Social media post/content (maps to schema:SocialMediaPosting)
- WRK.CIT: Citation/bibliographic reference (maps to schema:Citation)
Examples: "The Night Watch" → WRK.VIS, "Annual Report 2023" → WRK.TXT, "15th-century codex" → WRK.MSS
### 9. QTY (Quantity): Measurements, counts, numeric values
**Subcategories:**
- QTY.CNT: Count (maps to crm:E54_Dimension)
- QTY.MSR: Measurement (maps to crm:E54_Dimension)
- QTY.PCT: Percentage
- QTY.CUR: Currency/monetary (maps to schema:MonetaryAmount)
- QTY.ORD: Ordinal (maps to crm:E60_Number)
- QTY.RNG: Range
Examples: "over 8,000 artworks" → QTY.CNT, "€2.5 million" → QTY.CUR
### 10. THG (Thing): Physical objects, artifacts, concepts, events
**Subcategories:**
- THG.ART: Artwork (maps to crm:E22_Human-Made_Object)
- THG.AFT: Artifact - human-made object of historical significance (maps to crm:E22_Human-Made_Object)
- THG.SPC: Specimen - natural history specimen, scientific sample (maps to crm:E20_Biological_Object)
- THG.DOC: Document (maps to foaf:Document)
- THG.PHO: Photograph (maps to schema:Photograph)
- THG.OBJ: Physical object - generic (maps to crm:E19_Physical_Object)
- THG.EVT: Historical event - deportation, persecution, liberation, war (maps to crm:E5_Event)
- THG.CON: Concept/abstract thing - stories, memories, heritage, mission (maps to crm:E28_Conceptual_Object)
- THG.TAX: Taxonomic term - species (maps to crm:E55_Type)
- THG.LNG: Language (maps to crm:E56_Language)
- THG.MAT: Material - bronze, marble, paper, etc. (maps to crm:E57_Material)
Examples: "17th-century painting" → THG.ART, "deportation" → THG.EVT, "the stories" → THG.CON, "Dutch" → THG.LNG
## RELATIONSHIP TYPES AND CONSTRAINTS
Relationships connect two entities. Each relationship has domain (subject) and range (object) constraints.
**⚠️ CRITICAL: COMPREHENSIVE SEMANTIC TRIPLE EXTRACTION ⚠️**
You MUST extract ALL semantic relationships from narrative text, not just named entity relationships.
Decompose every sentence into its constituent semantic triples (subject-predicate-object).
Example text: "In het Herinneringscentrum Kamp Westerbork vertellen we de verhalen van meer dan honderdduizend Joden en Sinti en Roma die vanuit Nederland naar vernietigings- en concentratiekampen werden gedeporteerd"
This SINGLE sentence contains these triples:
1. REL.ORG.ACT: Herinneringscentrum Kamp Westerbork → performs activity → tell stories
2. REL.SUB.ABT: the stories → are about → Joden (Jews)
3. REL.SUB.ABT: the stories → are about → Sinti and Roma
4. REL.QTY.CNT: Jews/Sinti/Roma → quantity → more than 100,000
5. REL.SPA.ORG: deportees → originated from → Nederland
6. REL.SPA.DST: deportees → destination → concentration camps
7. REL.SPA.DST: deportees → destination → extermination camps
8. REL.EVT.PAR: Jews/Sinti/Roma → participated in → deportation (forced)
### REL.CRE (Creation) - Agent creates Work
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.CRE.AUT | AGT.PER, AGT.GRP | WRK.TXT | "Martin Luther authored 95 Theses" |
| REL.CRE.ART | AGT.PER | WRK.VIS, THG.ART | "Rembrandt painted The Night Watch" |
| REL.CRE.COM | AGT.PER | WRK.MUS | "Beethoven composed Symphony No. 9" |
| REL.CRE.PHO | AGT.PER | THG.PHO | "Photographer captured portrait" |
| REL.CRE.DES | AGT.PER, AGT.GRP | WRK.OBJ | "Architect designed building" |
### REL.SPA (Spatial) - Located in / Contains / Origin / Destination
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SPA.LOC | AGT, EVT, GRP, WRK | TOP | "Museum located in Amsterdam" |
| REL.SPA.WTH | TOP | TOP | "Amsterdam within North Holland" |
| REL.SPA.CON | TOP | TOP | "Netherlands contains Amsterdam" |
| REL.SPA.ORG | AGT.PER, WRK, GRP.ETH | TOP | "Jews came from Netherlands" |
| REL.SPA.DST | AGT, EVT, GRP | TOP | "Deported to concentration camps" |
### REL.SOC (Social) - Person-to-person relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SOC.FAM.SPO | AGT.PER | AGT.PER | "Martin Luther married Katharina von Bora" |
| REL.SOC.FAM.PAR | AGT.PER | AGT.PER | "Parent of child" |
| REL.SOC.PRO.STU | AGT.PER | AGT.PER | "Student studied under master" |
| REL.SOC.MEM | AGT.PER, GRP | GRP | "Person/org member of organization" |
| REL.SOC.EMP | AGT.PER | GRP | "Employee works for company" |
### REL.ORG (Organizational) - Group activities and relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.ORG.PAR | GRP | GRP | "Parent organization" |
| REL.ORG.SUB | GRP | GRP | "Subsidiary organization" |
| REL.ORG.SUC | GRP | GRP | "Successor organization" |
| REL.ORG.FND | AGT.PER, GRP | GRP | "Founder established organization" |
| REL.ORG.ACT | GRP.HER, GRP | THG.CON, WRK | "Museum tells stories" / "Archive preserves documents" |
| REL.ORG.MIS | GRP.HER, GRP | THG.CON | "Organization's mission is..." |
| REL.ORG.SRV | GRP.HER, GRP | GRP, AGT | "Museum serves researchers" |
### REL.CUS (Custodial) - Ownership/Keeping
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.CUS.KEP | WRK, THG | GRP.HER | "Artwork kept by Rijksmuseum" |
| REL.CUS.OWN | WRK, THG | AGT.PER, GRP | "Collector owns painting" |
| REL.CUS.COL | WRK, THG | WRK.COL | "Item in collection" |
| REL.CUS.DNT | WRK, THG | AGT.PER | "Donated by benefactor" |
### REL.WRK (Work/FRBR) - Work relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.WRK.EXP | WRK.EXP | WRK.WRK | "Expression of work" |
| REL.WRK.PRT | WRK | WRK | "Part of larger work" |
| REL.WRK.SER | WRK | WRK.SER | "Volume in series" |
| REL.WRK.TRN | WRK.EXP | WRK.WRK | "Translation of work" |
### REL.SUB (Subject/About) - Topics and content
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SUB.ABT | WRK, GRP.HER, THG | AGT, GRP, EVT, TOP, THG | "Stories about Jews and Roma" |
| REL.SUB.DEP | WRK.VIS, THG.PHO | AGT, TOP, EVT | "Photo depicts memorial" |
| REL.SUB.THM | GRP.HER, WRK.COL | THG.CON | "Collection themes: WWII, Holocaust" |
### REL.EVT (Event) - Participation and historical events
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.EVT.PAR | AGT, GRP, GRP.ETH | EVT, THG.EVT | "Jews participated in deportation" |
| REL.EVT.ORG | AGT, GRP | EVT | "Nazis organized deportations" |
| REL.EVT.LOC | EVT | TOP | "Deportations from Netherlands" |
| REL.EVT.VIC | AGT, GRP, GRP.ETH | EVT | "Jews were victims of persecution" |
| REL.EVT.TIM | EVT | TMP | "Deportations in 1942-1944" |
### REL.QTY (Quantity) - Numeric relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.QTY.CNT | GRP, GRP.ETH, WRK.COL | QTY.CNT | "More than 100,000 people" |
| REL.QTY.MSR | THG, TOP | QTY.MSR | "Building is 500 sqm" |
| REL.QTY.YRS | GRP.HER, AGT | QTY.CNT, TMP | "Museum operating for 50 years" |
### REL.ROL (Role) - Occupation/Position
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.ROL.OCC | AGT.PER | ROL.OCC | "Person has occupation" |
| REL.ROL.HLD | AGT.PER | ROL.POS | "Person holds position" |
**IMPORTANT: Always include entity_type in relationship subject/object for validation!**
### ENTITY TYPES FOR RELATIONSHIP EXTRACTION
When extracting relationships, use these additional entity types:
- **GRP.ETH**: Ethnic groups (Joden, Sinti, Roma, etc.)
- **THG.CON**: Abstract concepts (stories, memories, heritage, mission)
- **THG.EVT**: Historical events (deportation, persecution, liberation)
- **TOP.HIS**: Historical places (concentration camps, transit camps)
## LAYOUT REGIONS (DOC hypernym)
Primary: HDR (heading), PAR (paragraph), SEN (sentence), LST (list), TBL (table)
Media: GAL (gallery), MAP (map), AUD (audio), VID (video), EMB (embedded)
Navigation: NAV (navigation), TOC (table of contents), IDX (index)
Front/Back: TTP (title page), DED (dedication), COL (colophon), BIB (bibliography), APP (appendix), GLO (glossary)
Commercial: ADV (advertisement), LOG (logo)
## OUTPUT FORMAT
Return a JSON object with this structure:
```json
{
"entities": [
{
"hypernym": "GRP",
"hyponym": "GRP.HER",
"text": "Rijksmuseum",
"xpath": "/html/body/div[1]/h1",
"confidence": 0.95,
"class_uri": "glam:HeritageCustodian",
"notes": "Main heritage institution name"
}
],
"layout_regions": [
{
"region": "HDR",
"level": 1,
"semantic_role": "PRIM",
"xpath": "/html/body/div[1]/h1",
"text_preview": "Rijksmuseum Amsterdam",
"contains_entities": ["GRP.HER:Rijksmuseum", "TOP.SET:Amsterdam"]
}
],
"claims": [
{
"claim_type": "full_name",
"claim_value": "Rijksmuseum Amsterdam",
"xpath": "/html/body/div[1]/h1",
"confidence": 0.95,
"source_entities": ["GRP.HER:Rijksmuseum"]
}
],
"relationships": [
{
"relationship_type": "REL.SPA.LOC",
"subject": {"entity_type": "GRP.HER", "text": "Rijksmuseum"},
"object": {"entity_type": "TOP.SET", "text": "Amsterdam"},
"predicate_uri": "schema:location",
"confidence": 0.90
}
]
}
```
## RULES
1. Every claim MUST have an XPath location in the source document
2. Use HYPONYM codes (e.g., GRP.HER, AGT.PER) not just hypernyms (e.g., GRP, AGT)
3. Include class_uri ontology mapping for each entity
4. Confidence scores: 0.9-1.0 (explicit), 0.7-0.9 (clear), 0.5-0.7 (inferred)
5. Entities within layout regions should be cross-referenced
6. Claims without XPath provenance are FABRICATED and must not be included
7. Extract relationships between entities (especially REL.SPA.LOC, REL.ORG.*, REL.CRE.*)
## ⚠️ CRITICAL: COMPREHENSIVE SEMANTIC EXTRACTION ⚠️
8. **DECOMPOSE EVERY NARRATIVE SENTENCE INTO TRIPLES** - A single sentence often contains 5-10 semantic relationships
9. **Extract ALL entities** - not just named entities, but also:
- Quantities (QTY.CNT: "more than 100,000")
- Ethnic groups (GRP.ETH: "Jews", "Sinti", "Roma")
- Abstract concepts (THG.CON: "stories", "memories", "heritage")
- Historical events (THG.EVT: "deportation", "persecution")
- Historical places (TOP.HIS: "concentration camps", "transit camps")
10. **Extract organizational activities** (REL.ORG.ACT): What does the institution DO? (preserve, tell, exhibit, research)
11. **Extract subject matter** (REL.SUB.ABT): What is the institution/collection ABOUT?
12. **Extract quantities** (REL.QTY.CNT): Numbers of visitors, items, people affected
13. **Extract spatial origins and destinations** (REL.SPA.ORG, REL.SPA.DST): Where did things/people come FROM and go TO?
14. **Extract event participation** (REL.EVT.PAR, REL.EVT.VIC): Who was involved in historical events?
### Example: Deep Semantic Parsing
Text: "Het museum bewaart meer dan 5000 voorwerpen uit de Tweede Wereldoorlog"
**INCORRECT** (shallow extraction):
- 1 entity: "Het museum" (GRP.HER)
- 0 relationships
**CORRECT** (deep semantic extraction):
- Entities:
- "Het museum" (GRP.HER)
- "meer dan 5000" (QTY.CNT)
- "voorwerpen" (THG.AFT - artifacts)
- "Tweede Wereldoorlog" (TMP.ERA)
- Relationships:
- REL.ORG.ACT: museum → performs → preservation (bewaart)
- REL.CUS.KEP: voorwerpen → kept by → museum
- REL.QTY.CNT: voorwerpen → quantity → meer dan 5000
- REL.TMP.DUR: voorwerpen → from period → Tweede Wereldoorlog
## CLAIM TYPES FOR HERITAGE INSTITUTIONS
- full_name: Official institution name
- short_name: Abbreviated name or acronym
- description: Institution description
- email: Contact email
- phone: Contact phone
- address: Physical address
- website: Official website URL
- social_media: Social media links (facebook, twitter, instagram, linkedin, youtube)
- opening_hours: Visitor hours
- admission_info: Ticket/entry information
- founding_date: When institution was established
- collection_count: Number of items in collection
- kvk_number: Dutch Chamber of Commerce number
- isil_code: International Standard Identifier for Libraries
- wikidata_id: Wikidata Q-number
- parent_organization: Parent/umbrella organization
"""
class LLMAnnotator:
"""
LLM-based document annotator.
Uses LLM inference for all entity recognition and claim extraction.
NO heuristic or pattern-based methods.
Example:
>>> config = LLMAnnotatorConfig(provider=LLMProvider.ZAI, model="glm-4")
>>> annotator = LLMAnnotator(config)
>>> session = await annotator.annotate(document)
>>> print(f"Found {len(session.entity_claims)} entities")
"""
def __init__(self, config: Optional[LLMAnnotatorConfig] = None):
"""
Initialize LLM annotator.
Args:
config: LLM configuration (defaults to Z.AI GLM-4)
"""
self.config = config or LLMAnnotatorConfig()
self._client = None
if not self.config.api_key:
raise ValueError(
f"API key not found for {self.config.provider.value}. "
f"Set environment variable or pass api_key in config."
)
    async def annotate(
        self,
        document: Union[HTMLDocument, str, Path],
        source_url: Optional[str] = None,
        image_dir: Optional[Path] = None,
    ) -> AnnotationSession:
        """
        Annotate a document using LLM inference.

        Accepts three input forms: a parsed ``HTMLDocument``, a raw HTML
        string, or a path (``Path`` or short path-like string) to an HTML
        file. Text annotation failures and image-analysis failures are
        recorded in ``session.errors`` rather than raised, so a session
        object is always returned.

        Args:
            document: HTMLDocument, HTML string, or path to HTML file
            source_url: Optional source URL for provenance
            image_dir: Optional directory containing downloaded images for
                vision analysis; auto-detected as the file's parent dir
                when the document is given as a path.

        Returns:
            AnnotationSession with extracted claims (and any errors).
        """
        # --- Resolve the input into raw HTML plus provenance fields ------
        html_content: str
        source_file: Optional[str] = None
        if isinstance(document, Path):
            with open(document, 'r', encoding='utf-8') as f:
                html_content = f.read()
            source_url = source_url or str(document)
            source_file = str(document)
            # Auto-detect image directory if not provided
            if image_dir is None:
                image_dir = document.parent
        elif isinstance(document, str):
            # Heuristic: a short string with no leading '<' is treated as a
            # candidate file path rather than inline HTML.
            is_file_path = len(document) < 500 and not document.strip().startswith('<')
            if is_file_path:
                try:
                    path = Path(document)
                    if path.exists():
                        with open(path, 'r', encoding='utf-8') as f:
                            html_content = f.read()
                        source_url = source_url or document
                        source_file = document
                        if image_dir is None:
                            image_dir = path.parent
                    else:
                        # Looks like a path but doesn't exist: treat as HTML.
                        html_content = document
                except OSError:
                    # Path too long or invalid — fall back to inline HTML.
                    html_content = document
            else:
                html_content = document
        elif isinstance(document, HTMLDocument):
            html_content = document.raw_html
            source_url = source_url or document.source_url
            source_file = document.source_file
        else:
            raise TypeError(f"Unsupported document type: {type(document)}")
        # --- Create session with a timestamp-based id --------------------
        session = AnnotationSession(
            session_id=f"llm-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}",
            source_url=source_url or "unknown",
            source_file=source_file,
        )
        # Prepare prompt (may truncate very long documents)
        user_prompt = self._prepare_prompt(html_content)
        # Call LLM for text annotation; errors are collected, not raised.
        try:
            response = await self._call_llm(user_prompt)
            # Parse response
            annotations = self._parse_response(response)
            # Convert to claims
            self._populate_session(session, annotations, source_url)
        except Exception as e:
            session.errors.append(f"LLM annotation failed: {e}")
        # Image analysis (if enabled) — best-effort, runs even when the
        # text annotation above failed.
        if self.config.extract_images:
            try:
                image_claims = await self.analyze_images_in_html(
                    html_content=html_content,
                    base_url=source_url,
                    image_dir=image_dir,
                )
                for claim in image_claims:
                    session.add_image_claim(claim)
                if image_claims:
                    logger.info(f"Analyzed {len(image_claims)} images from document")
            except Exception as e:
                session.errors.append(f"Image analysis failed: {e}")
                logger.warning(f"Image analysis failed: {e}")
        session.completed_at = datetime.now(timezone.utc).isoformat()
        return session
def _prepare_prompt(self, html_content: str) -> str:
"""Prepare the user prompt with document content."""
# Truncate if too long (LLM context limits)
max_chars = 30000
if len(html_content) > max_chars:
html_content = html_content[:max_chars] + "\n... [truncated]"
return f"""Analyze the following HTML document and extract all entities, layout regions, claims, and relationships.
Return a JSON object following the schema in the system prompt.
HTML DOCUMENT:
```html
{html_content}
```
## ⚠️ CRITICAL EXTRACTION REQUIREMENTS ⚠️
### 1. COMPREHENSIVE ENTITY EXTRACTION
Extract ALL entities, not just named entities:
- Heritage institutions (GRP.HER)
- Ethnic groups (GRP.ETH): Jews, Roma, Sinti, etc.
- Quantities (QTY.CNT): "more than 100,000", "5000 objects"
- Historical events (THG.EVT): deportation, persecution, liberation
- Abstract concepts (THG.CON): stories, memories, heritage, mission
- Historical places (TOP.HIS): concentration camps, transit camps
- Time periods (TMP.ERA): World War II, Holocaust
### 2. COMPREHENSIVE RELATIONSHIP EXTRACTION
Decompose EVERY narrative sentence into semantic triples:
- REL.ORG.ACT: What activities does the organization perform? (preserve, tell, exhibit, research, commemorate)
- REL.SUB.ABT: What is the collection/institution/story ABOUT?
- REL.QTY.CNT: Quantities of people, objects, visitors
- REL.SPA.ORG: Where did people/things come FROM?
- REL.SPA.DST: Where did people/things go TO?
- REL.EVT.PAR: Who participated in events (voluntary or forced)?
- REL.EVT.VIC: Who were victims of events?
### 3. EXAMPLE - WHAT WE EXPECT
For text: "In het Herinneringscentrum vertellen we de verhalen van meer dan honderdduizend Joden"
Extract:
- **Entities**:
- Herinneringscentrum (GRP.HER)
- de verhalen (THG.CON - stories/narratives)
- meer dan honderdduizend (QTY.CNT - >100,000)
- Joden (GRP.ETH - Jews as ethnic group)
- **Relationships**:
- REL.ORG.ACT: Herinneringscentrum → tells → verhalen
- REL.SUB.ABT: verhalen → about → Joden
- REL.QTY.CNT: Joden → quantity → meer dan honderdduizend
### 4. DO NOT:
- Skip abstract concepts or quantities
- Extract only named entities
- Ignore the semantic relationships within sentences
- Produce shallow extractions with few relationships
IMPORTANT: The richness of semantic extraction is critical. A single paragraph may contain 10-20 relationships.
"""
def _calculate_backoff_delay(self, attempt: int) -> float:
"""
Calculate delay for exponential backoff.
Args:
attempt: Current retry attempt number (0-indexed)
Returns:
Delay in seconds
"""
retry = self.config.retry
delay = retry.base_delay * (retry.exponential_base ** attempt)
delay = min(delay, retry.max_delay)
# Add jitter to prevent thundering herd
if retry.jitter:
delay = delay * (0.5 + random.random())
return delay
def _get_api_key_for_provider(self, provider: LLMProvider) -> Optional[str]:
"""Get API key for a specific provider from environment."""
env_vars = {
LLMProvider.ZAI: "ZAI_API_TOKEN",
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
LLMProvider.OPENAI: "OPENAI_API_KEY",
}
return os.environ.get(env_vars.get(provider, ""))
async def _call_provider(
self,
provider: LLMProvider,
user_prompt: str,
api_key: Optional[str] = None,
) -> str:
"""
Call a specific LLM provider.
Args:
provider: Which provider to call
user_prompt: The user prompt to send
api_key: Optional API key override
Returns:
LLM response string
"""
# Use provided key or get from environment
key = api_key or self._get_api_key_for_provider(provider)
if not key:
raise ValueError(f"No API key available for {provider.value}")
if provider == LLMProvider.ZAI:
return await self._call_zai(user_prompt, key)
elif provider == LLMProvider.ANTHROPIC:
return await self._call_anthropic(user_prompt, key)
elif provider == LLMProvider.OPENAI:
return await self._call_openai(user_prompt, key)
else:
raise ValueError(f"Unsupported provider: {provider}")
    async def _call_llm(self, user_prompt: str) -> str:
        """
        Call the LLM API with retry logic and provider fallback.

        Implements:
        1. Exponential backoff with jitter for rate limits
        2. Automatic fallback to alternative providers on failure

        Outer loop walks the provider chain (primary first, then each
        fallback); inner loop retries a single provider up to
        ``retry.max_retries`` times. Retryable HTTP statuses and timeouts
        sleep-then-retry; any other error breaks to the next provider.

        Returns:
            LLM response string

        Raises:
            Exception: If all retries and fallbacks are exhausted (raised
                as a RuntimeError chained to the last underlying error).
        """
        # Imported lazily so the module loads without httpx installed.
        import httpx
        # Build provider chain: primary + fallbacks
        providers_to_try = [self.config.provider]
        if self.config.fallback_providers:
            providers_to_try.extend(self.config.fallback_providers)
        last_exception: Optional[Exception] = None
        for provider in providers_to_try:
            # The configured key belongs to the primary provider only;
            # fallbacks read their own key from the environment.
            api_key = (
                self.config.api_key
                if provider == self.config.provider
                else self._get_api_key_for_provider(provider)
            )
            if not api_key:
                logger.info(f"Skipping {provider.value}: no API key available")
                continue
            logger.info(f"Trying provider: {provider.value}")
            for attempt in range(self.config.retry.max_retries):
                try:
                    return await self._call_provider(provider, user_prompt, api_key)
                except httpx.HTTPStatusError as e:
                    status_code = e.response.status_code
                    if status_code in self.config.retry.retry_on_status:
                        delay = self._calculate_backoff_delay(attempt)
                        logger.warning(
                            f"Provider {provider.value} returned {status_code} "
                            f"(attempt {attempt + 1}/{self.config.retry.max_retries}). "
                            f"Retrying in {delay:.2f}s..."
                        )
                        await asyncio.sleep(delay)
                        last_exception = e
                    else:
                        # Non-retryable error, try next provider
                        logger.error(
                            f"Provider {provider.value} returned non-retryable "
                            f"status {status_code}: {e}"
                        )
                        last_exception = e
                        break
                except httpx.TimeoutException as e:
                    # Timeouts are always retried with backoff.
                    delay = self._calculate_backoff_delay(attempt)
                    logger.warning(
                        f"Provider {provider.value} timed out "
                        f"(attempt {attempt + 1}/{self.config.retry.max_retries}). "
                        f"Retrying in {delay:.2f}s..."
                    )
                    await asyncio.sleep(delay)
                    last_exception = e
                except Exception as e:
                    # Unexpected failure (network, parsing, auth): give up
                    # on this provider immediately.
                    logger.error(f"Provider {provider.value} failed: {e}")
                    last_exception = e
                    break
            # All retries exhausted for this provider, try next
            # NOTE(review): this also logs after a non-retryable break, so the
            # "exhausted all retries" wording can be misleading in that case.
            logger.warning(f"Provider {provider.value} exhausted all retries")
        # All providers failed
        raise RuntimeError(
            f"All LLM providers failed. Last error: {last_exception}"
        ) from last_exception
async def _call_zai(self, user_prompt: str, api_key: str) -> str:
    """
    Send *user_prompt* to Z.AI's Anthropic-compatible messages endpoint.

    Z.AI's GLM Coding Plan exposes an Anthropic-compatible API at
    https://api.z.ai/api/anthropic/v1/messages, using the same message
    format as the Anthropic Claude API.
    """
    import httpx

    endpoint = "https://api.z.ai/api/anthropic/v1/messages"
    # Anthropic-style authentication headers.
    request_headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }
    # The Anthropic-compatible endpoint expects Claude model identifiers,
    # so translate any configured GLM model name.
    model_name = self.config.model
    if model_name.startswith("glm-"):
        model_name = "claude-sonnet-4-20250514"
    body = {
        "model": model_name,
        "max_tokens": self.config.max_tokens,
        "system": GLAM_NER_SYSTEM_PROMPT,
        "messages": [
            {"role": "user", "content": user_prompt},
        ],
    }
    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        resp = await client.post(endpoint, headers=request_headers, json=body)
        resp.raise_for_status()
        # Anthropic-style responses carry the text under content[0].text.
        return resp.json()["content"][0]["text"]
async def _call_anthropic(self, user_prompt: str, api_key: str) -> str:
    """Send *user_prompt* to the Anthropic Claude messages API."""
    import httpx

    # When Anthropic serves as a fallback, self.config.model names some other
    # provider's model, so substitute a known Claude model name.
    if self.config.provider == LLMProvider.ANTHROPIC:
        model_name = self.config.model
    else:
        model_name = "claude-3-5-sonnet-20241022"
    request_headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }
    body = {
        "model": model_name,
        "max_tokens": self.config.max_tokens,
        "system": GLAM_NER_SYSTEM_PROMPT,
        "messages": [
            {"role": "user", "content": user_prompt},
        ],
    }
    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        resp = await client.post(
            "https://api.anthropic.com/v1/messages",
            headers=request_headers,
            json=body,
        )
        resp.raise_for_status()
        return resp.json()["content"][0]["text"]
async def _call_openai(self, user_prompt: str, api_key: str) -> str:
    """Send *user_prompt* to the OpenAI chat-completions API."""
    import httpx

    # When OpenAI serves as a fallback, self.config.model names some other
    # provider's model, so substitute a known OpenAI model name.
    if self.config.provider == LLMProvider.OPENAI:
        model_name = self.config.model
    else:
        model_name = "gpt-4o"
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    # OpenAI has no dedicated "system" field; the system prompt rides along
    # as the first chat message.
    body = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": GLAM_NER_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": self.config.temperature,
        "max_tokens": self.config.max_tokens,
    }
    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        resp = await client.post(
            "https://api.openai.com/v1/chat/completions",
            headers=request_headers,
            json=body,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
# =========================================================================
# IMAGE ANALYSIS METHODS (Z.AI GLM-4.5V Vision API)
# =========================================================================
async def _analyze_image(
    self,
    image_url: Optional[str] = None,
    image_base64: Optional[str] = None,
    image_path: Optional[str] = None,
    alt_text: Optional[str] = None,
    context: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Analyze an image using Z.AI GLM-4.5V vision model.

    Extracts visual descriptions, entities, OCR text, and heritage relevance.
    Exactly one image source is used, preferred in this order: base64 data,
    local file path, absolute URL. Relative URLs are rejected.

    Args:
        image_url: URL of the image (absolute or relative)
        image_base64: Base64-encoded image data
        image_path: Local file path to image
        alt_text: HTML alt text for context
        context: Surrounding text context from the page

    Returns:
        Dict with analysis results on success; on any failure (bad input,
        rate limiting, HTTP/transport errors) a dict containing an "error"
        key instead of raising:
        {
            "description": "Natural language description",
            "detected_entities": [{"type": "AGT.PER", "text": "...", "confidence": 0.9}],
            "extracted_text": "OCR text if present",
            "heritage_relevance": "Why this matters for heritage",
            "image_type": "photograph|painting|document|map|artifact|other",
            "era_estimate": "Estimated time period",
            "style": "Photographic/artistic style",
            "analysis_confidence": 0.85
        }
    """
    import httpx
    import base64
    # Prepare image content for API (OpenAI-style "image_url" content part).
    image_content = None
    if image_base64:
        # Already base64 encoded; MIME is unknown here, so JPEG is assumed.
        image_content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
    elif image_path:
        # Read and encode local file
        try:
            path = Path(image_path)
            if path.exists():
                with open(path, 'rb') as f:
                    img_data = base64.b64encode(f.read()).decode('utf-8')
                # Detect MIME type from extension
                ext = path.suffix.lower()
                mime_types = {
                    '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
                    '.png': 'image/png', '.gif': 'image/gif',
                    '.webp': 'image/webp', '.bmp': 'image/bmp'
                }
                mime_type = mime_types.get(ext, 'image/jpeg')
                image_content = {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_data}"}}
        except Exception as e:
            logger.warning(f"Failed to read image file {image_path}: {e}")
            return {"error": f"Failed to read image: {e}"}
    elif image_url:
        # Use URL directly (works for absolute URLs)
        if image_url.startswith('http://') or image_url.startswith('https://'):
            image_content = {"type": "image_url", "image_url": {"url": image_url}}
        else:
            # Relative URL - can't analyze without base URL
            logger.debug(f"Skipping relative URL image: {image_url}")
            return {"error": "Relative URL - cannot analyze without base URL"}
    if not image_content:
        # Covers: no source given, or a local path that did not exist.
        return {"error": "No valid image source provided"}
    # Build prompt for heritage image analysis
    prompt_parts = [
        "Analyze this image from a heritage institution website.",
        "",
        "Provide a JSON response with the following fields:",
        "- description: Detailed description of what the image shows",
        "- detected_entities: Array of entities visible in the image, each with {type, text, confidence}",
        "  - Use GLAM-NER types: AGT.PER (person), WRK.VIS (artwork), THG.ART (artifact), TOP.BLD (building), GRP.ETH (ethnic group), etc.",
        "- extracted_text: Any text visible in the image (OCR)",
        "- heritage_relevance: Why this image is significant for heritage/cultural preservation",
        "- image_type: One of: photograph, painting, document, map, artifact, museum_object, historical_photo, memorial, building, portrait, group_photo, exhibition, other",
        "- era_estimate: Estimated time period of the content (e.g., '1940s', 'World War II', 'medieval', 'contemporary')",
        "- style: Artistic or photographic style",
        "- analysis_confidence: Your confidence in this analysis (0.0-1.0)",
    ]
    if alt_text:
        prompt_parts.extend(["", f"HTML alt text: {alt_text}"])
    if context:
        # Truncate surrounding page text to keep the prompt compact.
        prompt_parts.extend(["", f"Page context: {context[:500]}..."])
    prompt_parts.extend([
        "",
        "Return ONLY valid JSON, no markdown code blocks."
    ])
    prompt = "\n".join(prompt_parts)
    # Call Z.AI GLM-4.5V Vision API
    # Z.AI uses OpenAI-compatible format for vision at a different endpoint
    url = "https://api.z.ai/api/paas/v4/chat/completions"
    headers = {
        "Authorization": f"Bearer {self.config.api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": self.config.vision_model,  # "glm-4.5v"
        "messages": [
            {
                "role": "user",
                "content": [
                    image_content,
                    {"type": "text", "text": prompt}
                ]
            }
        ],
        "max_tokens": self.config.vision_max_tokens,
        "temperature": 0.1,  # low temperature: favor consistent analysis output
    }
    # Retry logic with exponential backoff for vision API
    # NOTE(review): timeout is hardcoded to 60s here rather than using
    # self.config.timeout as the text-API calls do — confirm intentional.
    max_retries = self.config.retry.max_retries
    base_delay = self.config.retry.base_delay
    for attempt in range(max_retries + 1):
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                response = await client.post(url, headers=headers, json=payload)
                # Check for rate limit
                if response.status_code == 429:
                    if attempt < max_retries:
                        delay = base_delay * (2 ** attempt)
                        if self.config.retry.jitter:
                            delay += random.uniform(0, delay * 0.1)
                        delay = min(delay, self.config.retry.max_delay)
                        logger.info(f"Vision API rate limited, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
                        await asyncio.sleep(delay)
                        continue
                    else:
                        return {"error": "Vision API rate limited after max retries"}
                response.raise_for_status()
                data = response.json()
                # Parse response content
                content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
                # Try to parse as JSON
                try:
                    result = json.loads(content)
                    return result
                except json.JSONDecodeError:
                    # If not valid JSON, return the text as description
                    return {
                        "description": content,
                        "detected_entities": [],
                        "analysis_confidence": 0.5,
                        "error": "Response was not valid JSON"
                    }
        except httpx.HTTPStatusError as e:
            # NOTE(review): unlike the 429 path above, this retry applies no
            # jitter and no max_delay cap — confirm whether that asymmetry
            # is intended.
            if e.response.status_code in self.config.retry.retry_on_status and attempt < max_retries:
                delay = base_delay * (2 ** attempt)
                logger.info(f"Vision API error {e.response.status_code}, retrying in {delay:.1f}s")
                await asyncio.sleep(delay)
                continue
            logger.warning(f"Vision API HTTP error: {e.response.status_code}")
            return {"error": f"Vision API error: {e.response.status_code}"}
        except Exception as e:
            # Transport errors, JSON decode of the HTTP body, etc. — give up.
            logger.warning(f"Vision API call failed: {e}")
            return {"error": str(e)}
    return {"error": "Vision API failed after retries"}
async def analyze_images_in_html(
    self,
    html_content: str,
    base_url: Optional[str] = None,
    image_dir: Optional[Path] = None,
) -> List[ImageClaim]:
    """
    Extract and analyze all images from an HTML document.

    Filters out data URIs and images smaller than the configured minimum,
    caps the number of images per page, resolves each image to a usable
    source (local file or absolute URL), and runs vision analysis on it
    sequentially. Images whose analysis fails are skipped silently.

    Args:
        html_content: Raw HTML content
        base_url: Base URL for resolving relative image paths
        image_dir: Local directory containing downloaded images

    Returns:
        List of ImageClaim objects with analysis results (possibly empty,
        including when the HTML cannot be parsed).
    """
    from lxml import html as lxml_html
    from urllib.parse import urljoin
    image_claims: List[ImageClaim] = []
    # Parse HTML
    try:
        tree = lxml_html.fromstring(html_content)
    except Exception as e:
        logger.warning(f"Failed to parse HTML for image extraction: {e}")
        return []
    # Find all img elements (only those that actually declare a src)
    img_elements = tree.xpath('//img[@src]')
    # Limit number of images per page
    if len(img_elements) > self.config.max_images_per_page:
        logger.info(f"Limiting image analysis to {self.config.max_images_per_page} of {len(img_elements)} images")
        img_elements = img_elements[:self.config.max_images_per_page]
    for img in img_elements:
        src = img.get('src', '')
        alt = img.get('alt', '')
        title = img.get('title', '')
        width_str = img.get('width', '')
        height_str = img.get('height', '')
        # Skip data URIs and tiny images
        if src.startswith('data:'):
            continue
        # Parse dimensions (attributes may be missing or non-numeric)
        try:
            width = int(width_str) if width_str.isdigit() else None
            height = int(height_str) if height_str.isdigit() else None
            # Skip tiny images (likely icons/decorations); only applies when
            # BOTH dimensions are declared in the markup.
            if width and height:
                if width < self.config.min_image_size or height < self.config.min_image_size:
                    continue
        except (ValueError, TypeError):
            width = height = None
        # Get XPath for provenance
        xpath = tree.getroottree().getpath(img)
        # Get surrounding context text
        parent = img.getparent()
        context = ""
        if parent is not None:
            # Get text from parent and siblings (parent.text plus each
            # child's tail text), truncated to 300 chars.
            context_parts = []
            if parent.text:
                context_parts.append(parent.text.strip())
            for sibling in parent:
                if sibling.tail:
                    context_parts.append(sibling.tail.strip())
            context = " ".join(context_parts)[:300]
        # Resolve image URL
        image_url = None
        image_path = None
        image_base64 = None  # NOTE(review): never populated in this method
        if src.startswith('http://') or src.startswith('https://'):
            image_url = src
        elif base_url:
            image_url = urljoin(base_url, src)
        # Check for local copy (preferred by _analyze_image over the URL)
        if image_dir:
            # Try to find the image in the local directory
            src_filename = Path(src).name
            local_candidates = [
                image_dir / src_filename,
                image_dir / src.lstrip('/'),
                image_dir / src,
            ]
            for candidate in local_candidates:
                if candidate.exists():
                    image_path = str(candidate)
                    break
        # Analyze the image (sequential; one vision call per image)
        analysis = await self._analyze_image(
            image_url=image_url,
            image_path=image_path,
            image_base64=image_base64,
            alt_text=alt or title,
            context=context,
        )
        # Skip if analysis failed
        if analysis.get("error"):
            logger.debug(f"Skipping image {src}: {analysis.get('error')}")
            continue
        # Create ImageClaim
        claim = ImageClaim(
            image_url=image_url or src,
            image_path=image_path,
            alt_text=alt if alt else None,
            title=title if title else None,
            width=width,
            height=height,
            xpath=xpath,
            description=analysis.get("description"),
            detected_entities=analysis.get("detected_entities", []),
            extracted_text=analysis.get("extracted_text"),
            heritage_relevance=analysis.get("heritage_relevance"),
            image_type=analysis.get("image_type"),
            era_estimate=analysis.get("era_estimate"),
            style=analysis.get("style"),
            analysis_model=self.config.vision_model,
            analysis_confidence=analysis.get("analysis_confidence"),
            provenance=Provenance(
                namespace="glam-ner",
                path=xpath,
                timestamp=datetime.now(timezone.utc).isoformat(),
                agent=f"LLMAnnotator/{self.config.vision_model}",
                context_convention="GLAM-NER v1.7.0-unified/vision",
                confidence=analysis.get("analysis_confidence", 0.5),
            ),
        )
        image_claims.append(claim)
    return image_claims
def _parse_response(self, response: str) -> Dict[str, Any]:
"""Parse LLM response JSON."""
# Find JSON in response (may be wrapped in markdown code blocks)
import re
# Try to find JSON block
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', response)
if json_match:
json_str = json_match.group(1)
else:
# Try to parse whole response as JSON
json_str = response
try:
return json.loads(json_str)
except json.JSONDecodeError:
# Return empty structure if parsing fails
return {"entities": [], "layout_regions": [], "claims": []}
def _populate_session(
    self,
    session: AnnotationSession,
    annotations: Dict[str, Any],
    source_url: Optional[str],
) -> None:
    """
    Populate *session* in place with parsed LLM annotations.

    Processes, in order: entities, layout regions, relationships (which can
    link back to entities by their text span), and aggregate claims. All
    claims share one timestamp and carry provenance identifying the
    provider/model that produced them.

    Args:
        session: Session to append claims to (mutated).
        annotations: Parsed LLM response dict (see _parse_response).
        source_url: Accepted for interface symmetry; not used here —
            provenance paths come from per-item XPaths.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # Build entity ID lookup for relationship linking
    entity_id_lookup: Dict[str, str] = {}
    # Process entities
    for entity in annotations.get("entities", []):
        # Parse hypernym - can come from explicit field or be inferred from hyponym
        hypernym_str = entity.get("hypernym", "THG")
        hyponym_str = entity.get("hyponym", "")
        # If hyponym has dot notation (e.g., "GRP.HER"), extract hypernym from it
        if hyponym_str and "." in hyponym_str:
            hypernym_str = hyponym_str.split(".")[0]
        try:
            hypernym = EntityHypernym(hypernym_str)
        except ValueError:
            # Unknown hypernym codes default to THG ("thing")
            hypernym = EntityHypernym.THG
        # Generate unique claim ID
        claim_id = f"entity-{len(session.entity_claims)+1}"
        # Store lookup for relationship linking (by text span).
        # NOTE(review): duplicate spans overwrite earlier entries, so a
        # relationship links to the LAST entity with that text — confirm.
        entity_text = entity.get("text", "")
        if entity_text:
            entity_id_lookup[entity_text] = claim_id
        # Get class_uri from hyponym (or hypernym as fallback)
        # hyponym takes precedence for more specific ontology mapping
        class_uri = None
        if hyponym_str:
            class_uri = get_ontology_class(hyponym_str)
        if not class_uri:
            class_uri = get_ontology_class(hypernym_str)
        # Map LLM response fields to EntityClaim fields
        # EntityClaim inherits text_content from Claim base class
        claim = EntityClaim(
            claim_id=claim_id,
            hypernym=hypernym,
            hyponym=hyponym_str if hyponym_str else "unknown",
            text_content=entity_text,  # LLM returns "text", we use text_content
            class_uri=class_uri,  # Auto-populated from hyponym/hypernym
            isil_id=entity.get("isil_id"),  # If present from LLM
            cidoc_class=entity.get("cidoc_class"),  # Backwards compatibility
            recognition_confidence=entity.get("confidence", 0.5),
            provenance=Provenance(
                namespace="glam-ner",
                path=entity.get("xpath", ""),
                timestamp=timestamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
            ),
        )
        session.entity_claims.append(claim)
    # Process layout regions
    for region in annotations.get("layout_regions", []):
        try:
            region_type = LayoutRegion(region.get("region", "PAR"))
        except ValueError:
            # Unknown region codes default to paragraph
            region_type = LayoutRegion.PAR
        try:
            semantic_role = SemanticRole(region.get("semantic_role", "PRIM"))
        except ValueError:
            semantic_role = SemanticRole.PRIM
        # LayoutClaim uses text_content from base Claim class
        claim = LayoutClaim(
            claim_id=f"layout-{len(session.layout_claims)+1}",
            region=region_type,
            semantic_role=semantic_role,
            xpath=region.get("xpath", ""),
            text_content=region.get("text_preview", "")[:200],  # Use text_content
            provenance=Provenance(
                namespace="glam-ner",
                path=region.get("xpath", ""),
                timestamp=timestamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
            ),
        )
        session.layout_claims.append(claim)
    # Process relationships (after entities so the text-span lookup is complete)
    for rel in annotations.get("relationships", []):
        self._process_relationship(
            session=session,
            rel_data=rel,
            entity_id_lookup=entity_id_lookup,
            timestamp=timestamp,
        )
    # Process claims (aggregate)
    for claim_data in annotations.get("claims", []):
        claim = AggregateClaim(
            claim_id=f"claim-{len(session.aggregate_claims)+1}",
            claim_type=claim_data.get("claim_type", "unknown"),
            claim_value=claim_data.get("claim_value", ""),
            text_content=claim_data.get("claim_value", ""),  # Store value in text_content too
            provenance=Provenance(
                namespace="glam-ner",
                path=claim_data.get("xpath", ""),
                timestamp=timestamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
                confidence=claim_data.get("confidence", 0.5),
            ),
        )
        session.aggregate_claims.append(claim)
def _process_relationship(
    self,
    session: AnnotationSession,
    rel_data: Dict[str, Any],
    entity_id_lookup: Dict[str, str],
    timestamp: str,
) -> None:
    """
    Convert one LLM relationship dict into a RelationshipClaim on *session*.

    Links subject/object back to earlier entity claims by exact text span,
    parses optional temporal/spatial scopes and qualifiers, and validates
    domain/range constraints non-strictly (violations are recorded as
    session errors/warnings, never raised).

    Args:
        session: Session to add the claim to (mutated).
        rel_data: One relationship entry from the parsed LLM response.
        entity_id_lookup: Map of entity text span -> entity claim_id.
        timestamp: Shared ISO timestamp for provenance.
    """
    # Parse relationship type
    rel_type = rel_data.get("relationship_type", "REL.CRE")
    rel_hypernym = None
    rel_hyponym = None
    # Extract hypernym and hyponym from relationship type
    if rel_type and "." in rel_type:
        parts = rel_type.split(".")
        if len(parts) >= 2:
            hypernym_str = f"{parts[0]}.{parts[1]}"  # e.g., "REL.CRE"
            try:
                rel_hypernym = RelationshipHypernym(hypernym_str)
            except ValueError:
                # Unknown hypernym: keep None, but still record the full code below
                pass
            rel_hyponym = rel_type  # Full code, e.g., "REL.CRE.AUT"
    # Parse subject
    subject_data = rel_data.get("subject", {})
    subject_text = subject_data.get("text", "")
    subject = RelationshipSubject(
        entity_id=entity_id_lookup.get(subject_text),
        entity_type=subject_data.get("entity_type") or subject_data.get("type"),  # Support both keys
        span_text=subject_text,
        uri=subject_data.get("uri"),
    )
    # Parse object
    object_data = rel_data.get("object", {})
    object_text = object_data.get("text", "")
    obj = RelationshipObject(
        entity_id=entity_id_lookup.get(object_text),
        entity_type=object_data.get("entity_type") or object_data.get("type"),  # Support both keys
        span_text=object_text,
        uri=object_data.get("uri"),
    )
    # Parse predicate (label falls back to the raw relationship type code)
    predicate_data = rel_data.get("predicate", {})
    predicate = RelationshipPredicate(
        uri=predicate_data.get("uri"),
        label=predicate_data.get("label", rel_type),
        direction=predicate_data.get("direction", "FORWARD"),
    )
    # Parse temporal scope (if present)
    temporal_scope = None
    temporal_data = rel_data.get("temporal", {})
    if temporal_data:
        temporal_scope = TemporalScope(
            start_date=temporal_data.get("start_date"),
            end_date=temporal_data.get("end_date"),
            temporal_modifier=temporal_data.get("modifier"),
        )
    # Parse spatial scope (if present)
    spatial_scope = None
    spatial_data = rel_data.get("spatial", {})
    if spatial_data:
        spatial_scope = SpatialScope(
            place_id=spatial_data.get("place_id"),
            place_name=spatial_data.get("place_name"),
            geo_uri=spatial_data.get("geo_uri"),
        )
    # Parse qualifiers (if present)
    qualifiers = []
    for qual_data in rel_data.get("qualifiers", []):
        qualifiers.append(RelationshipQualifier(
            qualifier_type=qual_data.get("type", ""),
            qualifier_value=qual_data.get("value", ""),
            qualifier_uri=qual_data.get("uri"),
        ))
    # Create relationship claim
    claim = RelationshipClaim(
        claim_id=f"rel-{len(session.relationship_claims)+1}",
        relationship_hypernym=rel_hypernym,
        relationship_hyponym=rel_hyponym,
        subject=subject,
        predicate=predicate,
        object=obj,
        temporal_scope=temporal_scope,
        spatial_scope=spatial_scope,
        qualifiers=qualifiers,
        negation=rel_data.get("negation", False),
        hypothetical=rel_data.get("hypothetical", False),
        source_claim=rel_data.get("source_claim", False),
        attributed_to=rel_data.get("attributed_to"),
        extraction_confidence=rel_data.get("confidence", 0.5),
        text_content=rel_data.get("text", ""),  # Original text span
        provenance=Provenance(
            namespace="glam-ner",
            path=rel_data.get("xpath", ""),
            timestamp=timestamp,
            agent=f"{self.config.provider.value}/{self.config.model}",
            context_convention=self.config.context_convention,
            confidence=rel_data.get("confidence", 0.5),
        ),
    )
    # Validate domain/range constraints
    if rel_hyponym:
        validation_result = validate_relationship_constraints(
            relationship_type=rel_hyponym,
            subject_type=subject.entity_type,
            object_type=obj.entity_type,
            strict=False,  # Treat violations as warnings, not errors
        )
        # Add any validation warnings to session errors
        if validation_result.warnings:
            for warning in validation_result.warnings:
                session.errors.append(f"[VALIDATION WARNING] {claim.claim_id}: {warning}")
                logger.warning(f"Relationship validation: {warning}")
        if validation_result.errors:
            for error in validation_result.errors:
                session.errors.append(f"[VALIDATION ERROR] {claim.claim_id}: {error}")
                logger.error(f"Relationship validation: {error}")
    # The claim is added even if validation reported problems.
    session.add_relationship_claim(claim)
async def annotate_batch(
    self,
    documents: List[Union[HTMLDocument, str, Path]],
    source_urls: Optional[List[str]] = None,
    concurrency: int = 3,
) -> List[AnnotationSession]:
    """
    Annotate multiple documents concurrently.

    Args:
        documents: List of documents to annotate
        source_urls: Optional list of source URLs. If shorter than
            ``documents``, missing entries are treated as None (previously,
            excess documents were silently dropped by zip truncation).
        concurrency: Maximum concurrent requests

    Returns:
        List of AnnotationSessions, one per document, in input order
    """
    urls: List[Optional[str]] = list(source_urls) if source_urls else []
    # Pad so every document gets a URL slot; a bare zip() would otherwise
    # silently truncate to the shorter list and skip documents.
    if len(urls) < len(documents):
        urls.extend([None] * (len(documents) - len(urls)))
    semaphore = asyncio.Semaphore(concurrency)

    async def annotate_with_semaphore(doc: Union[HTMLDocument, str, Path], url: Optional[str]) -> AnnotationSession:
        # Bound the number of in-flight LLM calls to `concurrency`.
        async with semaphore:
            return await self.annotate(doc, url)

    tasks = [
        annotate_with_semaphore(doc, url)
        for doc, url in zip(documents, urls)
    ]
    return await asyncio.gather(*tasks)
async def annotate_with_schema(
    self,
    document: Union[HTMLDocument, str, Path],
    schema: Optional[GLAMSchema] = None,
    source_url: Optional[str] = None,
    validate_output: bool = True,
) -> Tuple[AnnotationSession, Dict[str, Any]]:
    """
    Annotate a document using schema-driven extraction.

    This method uses GLAMSchema to:
    1. Generate targeted extraction prompts
    2. Extract structured fields defined in the schema
    3. Optionally validate output against JSON Schema

    Any failure during the LLM call or parsing is captured as a session
    error rather than raised; the returned structured data is then empty.

    Args:
        document: HTMLDocument, HTML string, or path to HTML file
        schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
        source_url: Optional source URL for provenance
        validate_output: Whether to validate extracted data against schema

    Returns:
        Tuple of (AnnotationSession, structured_data dict)

    Raises:
        TypeError: If *document* is not a supported type.

    Example:
        >>> schema = (
        ...     GLAMSchema("custom")
        ...     .entities("GRP", "TOP")
        ...     .structure()
        ...     .field("name::str::Institution name")  # GLiNER2 syntax
        ...     .field("type::[MUSEUM|ARCHIVE]::str::Type")
        ...     .build()
        ... )
        >>> session, data = await annotator.annotate_with_schema(doc, schema)
        >>> print(data["structured"]["name"])
    """
    # Use default schema if not provided
    if schema is None:
        schema = heritage_custodian_schema()
    # Load document: accept a Path, a raw-HTML/file-path string, or an
    # already-parsed HTMLDocument.
    html_content: str
    if isinstance(document, Path):
        with open(document, 'r', encoding='utf-8') as f:
            html_content = f.read()
        source_url = source_url or str(document)
    elif isinstance(document, str):
        # Check if it's a file path (short string, no HTML tags)
        is_file_path = len(document) < 500 and not document.strip().startswith('<')
        if is_file_path:
            try:
                path = Path(document)
                if path.exists():
                    with open(path, 'r', encoding='utf-8') as f:
                        html_content = f.read()
                    source_url = source_url or document
                else:
                    # Looks like a path but doesn't exist: treat as literal HTML
                    html_content = document
            except OSError:
                # Path too long or invalid
                html_content = document
        else:
            html_content = document
    elif isinstance(document, HTMLDocument):
        html_content = document.raw_html
        source_url = source_url or document.source_url
    else:
        raise TypeError(f"Unsupported document type: {type(document)}")
    # Create session
    session = AnnotationSession(
        session_id=f"schema-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}",
        source_url=source_url or "unknown",
    )
    # Generate schema-aware prompt
    schema_prompt = schema.to_llm_prompt(include_examples=True, output_format="json")
    user_prompt = self._prepare_schema_prompt(html_content, schema_prompt)
    structured_data: Dict[str, Any] = {}
    try:
        # Call LLM with schema-aware prompt
        response = await self._call_llm(user_prompt)
        # Parse response
        annotations = self._parse_response(response)
        # Extract structured data from response
        structured_data = annotations.get("structured", {})
        # Also extract classifications if present (stashed under an
        # underscore-prefixed key to avoid clashing with schema fields)
        if "classifications" in annotations:
            structured_data["_classifications"] = annotations["classifications"]
        # Extract relations if present
        if "relations" in annotations:
            structured_data["_relations"] = annotations["relations"]
        # Validate against JSON Schema if requested
        if validate_output and structured_data:
            validation_errors = self._validate_structured_output(structured_data, schema)
            if validation_errors:
                session.errors.extend(validation_errors)
        # Populate session with entity and claim data
        self._populate_session(session, annotations, source_url)
        # Add structured data to session config
        session.config["structured_data"] = structured_data
        session.config["schema_name"] = schema.name
    except Exception as e:
        # Best-effort: record the failure and return what we have
        session.errors.append(f"Schema-driven annotation failed: {e}")
    session.completed_at = datetime.now(timezone.utc).isoformat()
    return session, structured_data
def _prepare_schema_prompt(self, html_content: str, schema_prompt: str) -> str:
"""Prepare prompt with schema instructions and document content."""
# Truncate if too long
max_chars = 25000 # Leave room for schema prompt
if len(html_content) > max_chars:
html_content = html_content[:max_chars] + "\n... [truncated]"
return f"""{schema_prompt}
---
## Document to Analyze
Extract all information following the schema above from this HTML document:
```html
{html_content}
```
## Instructions
1. Extract ALL entities matching the specified hypernyms
2. Fill in ALL structured fields from the schema
3. Include XPath locations for provenance
4. Use confidence scores appropriately
5. Return ONLY a valid JSON object matching the output format
IMPORTANT: The "structured" field in your response must contain the extracted field values.
"""
def _validate_structured_output(
    self,
    data: Dict[str, Any],
    schema: GLAMSchema,
) -> List[str]:
    """
    Validate structured output against schema.

    Two passes (matching the reported error order): first all missing
    required fields, then choice/pattern checks for fields that are present.

    Args:
        data: Extracted structured data
        schema: GLAMSchema used for extraction

    Returns:
        List of validation error messages (empty if valid)
    """
    import re

    problems: List[str] = []
    # Pass 1: every required field must be present.
    problems.extend(
        f"Missing required field: {spec.name}"
        for spec in schema.fields
        if spec.required and spec.name not in data
    )
    # Pass 2: choice and pattern checks for present fields.
    for spec in schema.fields:
        if spec.name not in data:
            continue
        value = data[spec.name]
        if spec.choices and value:
            if spec.dtype == "list":
                invalid_values = [v for v in value if v not in spec.choices]
                if invalid_values:
                    problems.append(
                        f"Invalid values for {spec.name}: {invalid_values}. "
                        f"Valid: {spec.choices}"
                    )
            elif value not in spec.choices:
                problems.append(
                    f"Invalid value for {spec.name}: {value}. "
                    f"Valid: {spec.choices}"
                )
        # Pattern check only makes sense for string values.
        if spec.pattern and value and isinstance(value, str):
            if not re.match(spec.pattern, value):
                problems.append(
                    f"Field {spec.name} does not match pattern {spec.pattern}: {value}"
                )
    return problems
async def extract_structured(
    self,
    document: Union[HTMLDocument, str, Path],
    fields: List[str],
    source_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Quick structured extraction using GLiNER2-style field specs.

    This is a convenience method for simple extractions without
    full annotation session overhead.

    Args:
        document: Document to extract from
        fields: List of GLiNER2-style field specs
                e.g., ["name::str::Institution name",
                       "type::[MUSEUM|ARCHIVE]::str::Type"]
        source_url: Optional source URL

    Returns:
        Dict of extracted field values

    Example:
        >>> data = await annotator.extract_structured(
        ...     html_doc,
        ...     ["name::str::Full name",
        ...      "email::str::Contact email",
        ...      "type::[MUSEUM|ARCHIVE|LIBRARY]::str::Institution type"]
        ... )
        >>> print(data["name"])
    """
    # Assemble an ad-hoc schema from the GLiNER2-style specs.
    builder = GLAMSchema("quick_extraction").structure()
    for raw_spec in fields:
        builder.fields.append(FieldSpec.from_gliner2_syntax(raw_spec))
    quick_schema = builder.build()
    # Delegate to the full schema-driven path, keeping only the data.
    _session, extracted = await self.annotate_with_schema(
        document,
        schema=quick_schema,
        source_url=source_url,
        validate_output=False,  # Skip validation for quick extraction
    )
    return extracted
# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================
def create_llm_annotator(
    provider: str = "zai",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    enable_fallback: bool = True,
    max_retries: int = 5,
) -> LLMAnnotator:
    """
    Create an LLM annotator with the specified provider.

    Args:
        provider: "zai", "anthropic", or "openai"
        model: Optional model name (uses provider default if not specified)
        api_key: Optional API key (uses environment variable if not specified)
        enable_fallback: Enable automatic fallback to other providers on failure
        max_retries: Maximum retry attempts per provider

    Returns:
        Configured LLMAnnotator instance
    """
    selected = LLMProvider(provider)
    default_models = {
        LLMProvider.ZAI: "glm-4.6",
        LLMProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
        LLMProvider.OPENAI: "gpt-4o",
    }
    # Every other provider becomes a fallback candidate (in fixed order).
    if enable_fallback:
        fallbacks = [
            p for p in [LLMProvider.ZAI, LLMProvider.ANTHROPIC, LLMProvider.OPENAI]
            if p != selected
        ]
    else:
        fallbacks = []
    annotator_config = LLMAnnotatorConfig(
        provider=selected,
        model=model or default_models[selected],
        api_key=api_key,
        retry=RetryConfig(max_retries=max_retries),
        fallback_providers=fallbacks,
    )
    return LLMAnnotator(annotator_config)
async def annotate_html_file(
    file_path: Union[str, Path],
    provider: str = "zai",
    model: Optional[str] = None,
) -> AnnotationSession:
    """
    Annotate an HTML file using LLM.

    Convenience wrapper: builds a default annotator and runs it on the file.

    Args:
        file_path: Path to HTML file
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        AnnotationSession with extracted claims
    """
    worker = create_llm_annotator(provider=provider, model=model)
    return await worker.annotate(file_path)
async def annotate_with_schema(
    file_path: Union[str, Path],
    schema: Optional[GLAMSchema] = None,
    provider: str = "zai",
    model: Optional[str] = None,
) -> Tuple[AnnotationSession, Dict[str, Any]]:
    """
    Annotate an HTML file using schema-driven extraction.

    Convenience wrapper around LLMAnnotator.annotate_with_schema.

    Args:
        file_path: Path to HTML file
        schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        Tuple of (AnnotationSession, structured_data dict)

    Example:
        >>> schema = (
        ...     GLAMSchema("museum")
        ...     .entities("GRP", "TOP")
        ...     .structure()
        ...     .field("name::str::Museum name")
        ...     .field("city::str::City location")
        ...     .build()
        ... )
        >>> session, data = await annotate_with_schema("museum.html", schema)
        >>> print(data["name"])
    """
    worker = create_llm_annotator(provider=provider, model=model)
    return await worker.annotate_with_schema(file_path, schema=schema)
# Public API of this module (consumed by `from ... import *` and docs tooling).
__all__ = [
    "LLMProvider",
    "LLMAnnotatorConfig",
    "RetryConfig",
    "LLMAnnotator",
    "GLAM_NER_SYSTEM_PROMPT",
    "create_llm_annotator",
    "annotate_html_file",
    "annotate_with_schema",
]