1992 lines
80 KiB
Python
1992 lines
80 KiB
Python
"""
|
|
LLM-Based Agentic Annotator for GLAM Documents.
|
|
|
|
This module provides LLM-only entity annotation following GLAM-NER v1.7.0.
|
|
NO HEURISTIC/PATTERN-BASED METHODS - all recognition is done via LLM inference.
|
|
|
|
Supported LLM Providers:
|
|
- Z.AI (Zhipu AI) GLM-4 (default)
|
|
- Anthropic Claude
|
|
- OpenAI GPT-4
|
|
|
|
Based on GLAM-NER v1.7.0-unified Entity Annotation Convention.
|
|
|
|
Features:
|
|
- Exponential backoff retry for rate limits (429)
|
|
- Automatic provider fallback (Z.AI → Claude → OpenAI)
|
|
- Configurable retry attempts and delays
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import random
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
from enum import Enum
|
|
|
|
# Configure logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Import base classes
|
|
from .base import (
|
|
AnnotationSession,
|
|
EntityClaim,
|
|
LayoutClaim,
|
|
AggregateClaim,
|
|
ImageClaim,
|
|
Provenance,
|
|
EntityHypernym,
|
|
LayoutRegion,
|
|
SemanticRole,
|
|
RelationshipClaim,
|
|
RelationshipSubject,
|
|
RelationshipObject,
|
|
RelationshipPredicate,
|
|
RelationshipHypernym,
|
|
TemporalScope,
|
|
SpatialScope,
|
|
RelationshipQualifier,
|
|
get_ontology_class,
|
|
validate_relationship_constraints,
|
|
)
|
|
from .html_parser import HTMLDocument
|
|
from .schema_builder import GLAMSchema, FieldSpec, heritage_custodian_schema
|
|
|
|
|
|
class LLMProvider(str, Enum):
    """Supported LLM providers.

    Subclasses ``str`` so members compare equal to, and serialize as, their
    plain string values (e.g. ``LLMProvider.ZAI == "zai"``).
    """
    ZAI = "zai"  # Zhipu AI GLM-4 (the default provider)
    ANTHROPIC = "anthropic"  # Anthropic Claude
    OPENAI = "openai"  # OpenAI GPT-4
|
|
|
|
|
@dataclass
class RetryConfig:
    """Configuration for retry logic with exponential backoff.

    The delay before retry attempt ``n`` (0-indexed) is
    ``min(base_delay * exponential_base ** n, max_delay)``, optionally scaled
    by a random jitter factor (see ``LLMAnnotator._calculate_backoff_delay``).
    """
    max_retries: int = 5  # Maximum retry attempts
    base_delay: float = 1.0  # Initial delay in seconds
    max_delay: float = 60.0  # Maximum delay cap
    exponential_base: float = 2.0  # Exponential backoff base
    jitter: bool = True  # Add random jitter to prevent thundering herd
    retry_on_status: tuple = (429, 500, 502, 503, 504)  # HTTP status codes (ints) considered retryable
|
|
|
|
|
|
@dataclass
class LLMAnnotatorConfig:
    """Configuration for LLM-based annotation.

    Defaults target Z.AI's GLM-4 models. When left unset, the API key and the
    provider fallback chain are filled in by ``__post_init__``.
    """
    provider: LLMProvider = LLMProvider.ZAI
    model: str = "glm-4.6"  # Z.AI's latest model
    api_key: Optional[str] = None
    temperature: float = 0.1  # low temperature for consistent extraction
    max_tokens: int = 4096
    timeout: int = 120  # generous timeout for long LLM calls

    # Which annotation passes to run.
    extract_entities: bool = True
    extract_layout: bool = True
    extract_claims: bool = True
    extract_images: bool = True  # enable image analysis via vision model

    # Vision model settings (for image analysis).
    vision_model: str = "glm-4.5v"  # Z.AI's vision model
    vision_max_tokens: int = 2048  # max tokens for vision response
    max_images_per_page: int = 10  # cap on images analyzed per page
    min_image_size: int = 50  # minimum dimension (width or height) to analyze

    # Provenance settings.
    context_convention: str = "GLAM-NER v1.7.0-unified"

    # Retry settings.
    retry: RetryConfig = field(default_factory=RetryConfig)

    # Fallback providers, tried in order when the primary fails.
    fallback_providers: Optional[List[LLMProvider]] = None

    def __post_init__(self):
        """Resolve the API key from the environment and build the default fallback chain."""
        if self.api_key is None:
            env_var = {
                LLMProvider.ZAI: "ZAI_API_TOKEN",
                LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
                LLMProvider.OPENAI: "OPENAI_API_KEY",
            }.get(self.provider)
            if env_var is not None:
                self.api_key = os.environ.get(env_var)

        # Default fallback chain: every other provider, in preference order.
        if self.fallback_providers is None:
            preference = [LLMProvider.ZAI, LLMProvider.ANTHROPIC, LLMProvider.OPENAI]
            self.fallback_providers = [p for p in preference if p != self.provider]
|
|
|
|
|
|
# =============================================================================
# GLAM-NER v1.7.0 SYSTEM PROMPT
# =============================================================================
# Sent verbatim as the system message for every text-annotation LLM call.
# It defines the entity/relationship taxonomy and the JSON output structure
# the model must return; _parse_response / _populate_session consume that
# JSON, so edit the schema section with care.

GLAM_NER_SYSTEM_PROMPT = """You are an expert entity annotator following the GLAM-NER v1.7.0-unified Entity Annotation Convention.

Your task is to extract structured claims from heritage institution documents with full provenance.

## HYPERNYMS AND HYPONYMS (10 types with subcategories)

### 1. AGT (Agent): Humans, animals, AI, fictional characters
**Subcategories:**
- AGT.PER: Person - INDIVIDUAL human beings with SPECIFIC NAMES (maps to crm:E21_Person)
⚠️ STRICT EXCLUSIONS - Do NOT tag as AGT.PER:
• Groups/collectives: "staff", "members", "curators", "colleagues", "board", "team", "committee", "participants", "community"
• Plural person references: "archivists", "researchers", "visitors", "filmmakers", "historians"
• Role descriptions without names: "the curator", "a researcher", "museum director"
• Organizations/events with role words: "FIAF Commission members", "conference colleagues", "board members"
• Networks: "VPRO/Tegenlicht network", "ACE member institutions"
• Topic references: "Verhalen van Bolsward" (stories about something)
• Conference/event participants: "Women and Silent Screen Conference participants"
• Fund/foundation board: "Prince Claus Fund board members"
• Festival communities: "Le Giornate del Cinema Muto community"
• Generic collectives: "community", "network", "consortium", "association"
✓ ONLY tag as AGT.PER: Named individuals like "Giovanna Fossati", "Dr. Jan van der Berg", "Martin Scorsese"
- AGT.STF: Staff - personnel in professional roles (maps to pico:PersonObservation)
- AGT.COL: Collective - named collectives without formal structure
- AGT.FIC: Fictional - characters from fiction/mythology
- AGT.MYT: Mythological - gods, deities, legendary figures
- AGT.ANI: Animal - named individual animals with agency
- AGT.ART: Artificial - AI systems, robots, software agents (maps to prov:SoftwareAgent)

Examples: "Dr. Jan van der Berg" → AGT.PER, "Giovanna Fossati" → AGT.PER, "the museum director" → AGT.STF
❌ NOT AGT.PER: "AMIA conference colleagues", "Prince Claus Fund board members", "festival community"

### 2. GRP (Group): Organizations, collectives, formal and informal
**Subcategories:**
- GRP.HER: Heritage institutions - museums, archives, libraries (maps to glam:HeritageCustodian)
- GRP.PAR: Parent/governing bodies (maps to rico:CorporateBody)
- GRP.UNT: Organizational units/departments (maps to org:OrganizationalUnit)
- GRP.COR: Corporations and businesses (maps to schema:Corporation)
- GRP.GOV: Government agencies (maps to schema:GovernmentOrganization)
- GRP.EDU: Educational institutions (maps to schema:EducationalOrganization)
- GRP.REL: Religious organizations (maps to schema:ReligiousOrganization)
- GRP.ASS: Associations and societies (maps to org:FormalOrganization)
- GRP.INF: Informal groups - movements, families, dynasties
- GRP.HIS: Historical organizations - defunct entities
- GRP.ETH: Ethnic groups - Jews, Roma, Sinti, indigenous peoples (maps to crm:E74_Group)

Examples: "Rijksmuseum" → GRP.HER, "Ministry of Culture" → GRP.GOV, "Joden" → GRP.ETH

### 3. TOP (Toponym): Place names, nominal geographic references
**Subcategories:**
- TOP.SET: Settlement - cities, towns, villages (maps to schema:City)
- TOP.REG: Region - provinces, states, counties (maps to schema:AdministrativeArea)
- TOP.CTY: Country - nations, sovereign states (maps to schema:Country)
- TOP.ADR: Address - street addresses (maps to schema:PostalAddress)
- TOP.IAD: Institutional address
- TOP.BLD: Building - named buildings, monuments (maps to crm:E18_Physical_Thing)
- TOP.NAT: Natural features - mountains, rivers
- TOP.HIS: Historical places - concentration camps, transit camps, former territories (maps to crm:E53_Place)
- TOP.LEG: Legendary/fictional places

Examples: "Amsterdam" → TOP.SET, "the Netherlands" → TOP.CTY, "Auschwitz" → TOP.HIS

### 4. GEO (Geometry): Coordinates, shapes, spatial data
**Subcategories:**
- GEO.PNT: Point coordinates (maps to geo:Point)
- GEO.LIN: Line/path (maps to geo:LineString)
- GEO.POL: Polygon/area (maps to geo:Polygon)
- GEO.BOX: Bounding box (maps to geo:Envelope)

Examples: "52.3676° N, 4.9041° E" → GEO.PNT

### 5. TMP (Temporal): Dates, times, durations, periods
**Subcategories:**
- TMP.DAT: Absolute date - specific point (maps to time:Instant) [alias: TMP.DAB]
- TMP.DAB: Date Absolute - specific date "1885-03-22" (maps to time:Instant)
- TMP.DRL: Date Relative - "last year", "recently", "two weeks ago" (maps to time:Instant)
- TMP.TIM: Time of day (maps to time:Instant) [alias: TMP.TAB]
- TMP.TAB: Time Absolute - specific time "14:30:00" (maps to time:Instant)
- TMP.TRL: Time Relative - "later that evening", "soon after" (maps to time:Instant)
- TMP.DUR: Duration/period - "three hours", "from 1885 to 1890" (maps to time:Duration)
- TMP.RNG: Date range - "1885-1890", "March 1-15" (maps to time:Interval)
- TMP.SET: Recurring time - "every Monday", "annually"
- TMP.OPH: Opening hours - "Tuesday-Sunday 10:00-17:00" (maps to schema:OpeningHoursSpecification)
- TMP.REL: Relative time - "before", "after" [deprecated, use TMP.DRL/TMP.TRL]
- TMP.CEN: Century - "17th century", "the 1800s" (maps to crm:E4_Period)
- TMP.ERA: Historical era/period name - "Renaissance", "Bronze Age" (maps to crm:E4_Period)
- TMP.EXP: Exhibition period - "10 February - 4 June 2023" (maps to time:Interval)

Examples: "1885" → TMP.DAB, "18th century" → TMP.CEN, "every Tuesday" → TMP.SET, "10:00-17:00" → TMP.OPH

### 6. APP (Appellation): Identifiers, codes, reference numbers
**Subcategories:**
- APP.ISL: ISIL code (maps to crm:E42_Identifier)
- APP.WKD: Wikidata ID (maps to crm:E42_Identifier)
- APP.VIF: VIAF ID (maps to crm:E42_Identifier)
- APP.DOI: DOI
- APP.URL: URL/URI (maps to schema:URL)
- APP.ISBN: ISBN
- APP.ISSN: ISSN
- APP.KVK: Dutch Chamber of Commerce number
- APP.TTL: Title of work (maps to crm:E35_Title) [alias: APP.TIT]
- APP.TIT: Title of work (maps to crm:E35_Title)
- APP.NAM: Personal name - structured (maps to pnv:PersonName) [alias: APP.PNM]
- APP.PNM: Personal name - structured (maps to pnv:PersonName)
- APP.AWD: Award name
- APP.COL: Collection name
- APP.EXH: Exhibition name/title (maps to crm:E35_Title)

Examples: "ISIL NL-AmRM" → APP.ISL, "Q190804" → APP.WKD, "Rembrandt and His Era" → APP.EXH

### 7. ROL (Role): Titles, positions, honorifics, occupations
**Subcategories:**
- ROL.OCC: Occupation - profession, trade, job title (maps to schema:Occupation)
- ROL.TTL: Title/honorific - "Dr.", "Prof.", academic/professional titles (maps to schema:Role)
- ROL.HON: Honorific - "Sir", "Dame", "The Honorable" (maps to schema:honorificPrefix)
- ROL.NOB: Nobility title - "Duke", "Baron", "Count", hereditary titles (maps to schema:honorificSuffix)
- ROL.POS: Position/office - "Director", "Chairman" (maps to org:Post)
- ROL.REL: Relational role - father, mother, kinship (maps to bio:Relationship)
- ROL.REL.REL: Religious role - "Bishop", "Rabbi", "Imam" (maps to schema:Role)

Examples: "Director" → ROL.POS, "Prof. Dr." → ROL.TTL, "Duke of Wellington" → ROL.NOB, "Rabbi" → ROL.REL.REL

### 8. WRK (Work): Works following FRBR model
**Subcategories:**
- WRK.WRK: FRBR Work - abstract (maps to frbroo:F1_Work) [alias: WRK.ABS]
- WRK.ABS: Abstract work (maps to frbroo:F1_Work)
- WRK.EXP: FRBR Expression (maps to frbroo:F2_Expression)
- WRK.MAN: FRBR Manifestation (maps to frbroo:F3_Manifestation)
- WRK.ITM: FRBR Item (maps to frbroo:F5_Item)
- WRK.MSS: Manuscript - handwritten/unpublished work (maps to rico:Record)
- WRK.ARC: Archival record/document (maps to rico:Record)
- WRK.TXT: Textual work (maps to schema:Book)
- WRK.VIS: Visual work (maps to schema:VisualArtwork)
- WRK.MUS: Musical work (maps to schema:MusicComposition)
- WRK.PER: Performance (maps to schema:PerformingArtsEvent)
- WRK.CIN: Cinematic work (maps to schema:Movie)
- WRK.OBJ: Physical object/artifact (maps to crm:E22_Human-Made_Object)
- WRK.COL: Collection (maps to crm:E78_Curated_Holding)
- WRK.SER: Series (maps to schema:CreativeWorkSeries)
- WRK.WEB: Web resource/page (maps to schema:WebPage)
- WRK.URL: URL reference to work/link (maps to schema:URL)
- WRK.EML: Email message (maps to schema:Message)
- WRK.SOC: Social media post/content (maps to schema:SocialMediaPosting)
- WRK.CIT: Citation/bibliographic reference (maps to schema:Citation)

Examples: "The Night Watch" → WRK.VIS, "Annual Report 2023" → WRK.TXT, "15th-century codex" → WRK.MSS

### 9. QTY (Quantity): Measurements, counts, numeric values
**Subcategories:**
- QTY.CNT: Count (maps to crm:E54_Dimension)
- QTY.MSR: Measurement (maps to crm:E54_Dimension)
- QTY.PCT: Percentage
- QTY.CUR: Currency/monetary (maps to schema:MonetaryAmount)
- QTY.ORD: Ordinal (maps to crm:E60_Number)
- QTY.RNG: Range

Examples: "over 8,000 artworks" → QTY.CNT, "€2.5 million" → QTY.CUR

### 10. THG (Thing): Physical objects, artifacts, concepts, events
**Subcategories:**
- THG.ART: Artwork (maps to crm:E22_Human-Made_Object)
- THG.AFT: Artifact - human-made object of historical significance (maps to crm:E22_Human-Made_Object)
- THG.SPC: Specimen - natural history specimen, scientific sample (maps to crm:E20_Biological_Object)
- THG.DOC: Document (maps to foaf:Document)
- THG.PHO: Photograph (maps to schema:Photograph)
- THG.OBJ: Physical object - generic (maps to crm:E19_Physical_Object)
- THG.EVT: Historical event - deportation, persecution, liberation, war (maps to crm:E5_Event)
- THG.CON: Concept/abstract thing - stories, memories, heritage, mission (maps to crm:E28_Conceptual_Object)
- THG.TAX: Taxonomic term - species (maps to crm:E55_Type)
- THG.LNG: Language (maps to crm:E56_Language)
- THG.MAT: Material - bronze, marble, paper, etc. (maps to crm:E57_Material)

Examples: "17th-century painting" → THG.ART, "deportation" → THG.EVT, "the stories" → THG.CON, "Dutch" → THG.LNG

## RELATIONSHIP TYPES AND CONSTRAINTS

Relationships connect two entities. Each relationship has domain (subject) and range (object) constraints.

**⚠️ CRITICAL: COMPREHENSIVE SEMANTIC TRIPLE EXTRACTION ⚠️**

You MUST extract ALL semantic relationships from narrative text, not just named entity relationships.
Decompose every sentence into its constituent semantic triples (subject-predicate-object).

Example text: "In het Herinneringscentrum Kamp Westerbork vertellen we de verhalen van meer dan honderdduizend Joden en Sinti en Roma die vanuit Nederland naar vernietigings- en concentratiekampen werden gedeporteerd"

This SINGLE sentence contains these triples:
1. REL.ORG.ACT: Herinneringscentrum Kamp Westerbork → performs activity → tell stories
2. REL.SUB.ABT: the stories → are about → Joden (Jews)
3. REL.SUB.ABT: the stories → are about → Sinti and Roma
4. REL.QTY.CNT: Jews/Sinti/Roma → quantity → more than 100,000
5. REL.SPA.ORG: deportees → originated from → Nederland
6. REL.SPA.DST: deportees → destination → concentration camps
7. REL.SPA.DST: deportees → destination → extermination camps
8. REL.EVT.PAR: Jews/Sinti/Roma → participated in → deportation (forced)

### REL.CRE (Creation) - Agent creates Work
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.CRE.AUT | AGT.PER, AGT.GRP | WRK.TXT | "Martin Luther authored 95 Theses" |
| REL.CRE.ART | AGT.PER | WRK.VIS, THG.ART | "Rembrandt painted The Night Watch" |
| REL.CRE.COM | AGT.PER | WRK.MUS | "Beethoven composed Symphony No. 9" |
| REL.CRE.PHO | AGT.PER | THG.PHO | "Photographer captured portrait" |
| REL.CRE.DES | AGT.PER, AGT.GRP | WRK.OBJ | "Architect designed building" |

### REL.SPA (Spatial) - Located in / Contains / Origin / Destination
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SPA.LOC | AGT, EVT, GRP, WRK | TOP | "Museum located in Amsterdam" |
| REL.SPA.WTH | TOP | TOP | "Amsterdam within North Holland" |
| REL.SPA.CON | TOP | TOP | "Netherlands contains Amsterdam" |
| REL.SPA.ORG | AGT.PER, WRK, GRP.ETH | TOP | "Jews came from Netherlands" |
| REL.SPA.DST | AGT, EVT, GRP | TOP | "Deported to concentration camps" |

### REL.SOC (Social) - Person-to-person relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SOC.FAM.SPO | AGT.PER | AGT.PER | "Martin Luther married Katharina von Bora" |
| REL.SOC.FAM.PAR | AGT.PER | AGT.PER | "Parent of child" |
| REL.SOC.PRO.STU | AGT.PER | AGT.PER | "Student studied under master" |
| REL.SOC.MEM | AGT.PER, GRP | GRP | "Person/org member of organization" |
| REL.SOC.EMP | AGT.PER | GRP | "Employee works for company" |

### REL.ORG (Organizational) - Group activities and relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.ORG.PAR | GRP | GRP | "Parent organization" |
| REL.ORG.SUB | GRP | GRP | "Subsidiary organization" |
| REL.ORG.SUC | GRP | GRP | "Successor organization" |
| REL.ORG.FND | AGT.PER, GRP | GRP | "Founder established organization" |
| REL.ORG.ACT | GRP.HER, GRP | THG.CON, WRK | "Museum tells stories" / "Archive preserves documents" |
| REL.ORG.MIS | GRP.HER, GRP | THG.CON | "Organization's mission is..." |
| REL.ORG.SRV | GRP.HER, GRP | GRP, AGT | "Museum serves researchers" |

### REL.CUS (Custodial) - Ownership/Keeping
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.CUS.KEP | WRK, THG | GRP.HER | "Artwork kept by Rijksmuseum" |
| REL.CUS.OWN | WRK, THG | AGT.PER, GRP | "Collector owns painting" |
| REL.CUS.COL | WRK, THG | WRK.COL | "Item in collection" |
| REL.CUS.DNT | WRK, THG | AGT.PER | "Donated by benefactor" |

### REL.WRK (Work/FRBR) - Work relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.WRK.EXP | WRK.EXP | WRK.WRK | "Expression of work" |
| REL.WRK.PRT | WRK | WRK | "Part of larger work" |
| REL.WRK.SER | WRK | WRK.SER | "Volume in series" |
| REL.WRK.TRN | WRK.EXP | WRK.WRK | "Translation of work" |

### REL.SUB (Subject/About) - Topics and content
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SUB.ABT | WRK, GRP.HER, THG | AGT, GRP, EVT, TOP, THG | "Stories about Jews and Roma" |
| REL.SUB.DEP | WRK.VIS, THG.PHO | AGT, TOP, EVT | "Photo depicts memorial" |
| REL.SUB.THM | GRP.HER, WRK.COL | THG.CON | "Collection themes: WWII, Holocaust" |

### REL.EVT (Event) - Participation and historical events
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.EVT.PAR | AGT, GRP, GRP.ETH | EVT, THG.EVT | "Jews participated in deportation" |
| REL.EVT.ORG | AGT, GRP | EVT | "Nazis organized deportations" |
| REL.EVT.LOC | EVT | TOP | "Deportations from Netherlands" |
| REL.EVT.VIC | AGT, GRP, GRP.ETH | EVT | "Jews were victims of persecution" |
| REL.EVT.TIM | EVT | TMP | "Deportations in 1942-1944" |

### REL.QTY (Quantity) - Numeric relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.QTY.CNT | GRP, GRP.ETH, WRK.COL | QTY.CNT | "More than 100,000 people" |
| REL.QTY.MSR | THG, TOP | QTY.MSR | "Building is 500 sqm" |
| REL.QTY.YRS | GRP.HER, AGT | QTY.CNT, TMP | "Museum operating for 50 years" |

### REL.ROL (Role) - Occupation/Position
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.ROL.OCC | AGT.PER | ROL.OCC | "Person has occupation" |
| REL.ROL.HLD | AGT.PER | ROL.POS | "Person holds position" |

**IMPORTANT: Always include entity_type in relationship subject/object for validation!**

### ENTITY TYPES FOR RELATIONSHIP EXTRACTION

When extracting relationships, use these additional entity types:
- **GRP.ETH**: Ethnic groups (Joden, Sinti, Roma, etc.)
- **THG.CON**: Abstract concepts (stories, memories, heritage, mission)
- **THG.EVT**: Historical events (deportation, persecution, liberation)
- **TOP.HIS**: Historical places (concentration camps, transit camps)

## LAYOUT REGIONS (DOC hypernym)

Primary: HDR (heading), PAR (paragraph), SEN (sentence), LST (list), TBL (table)
Media: GAL (gallery), MAP (map), AUD (audio), VID (video), EMB (embedded)
Navigation: NAV (navigation), TOC (table of contents), IDX (index)
Front/Back: TTP (title page), DED (dedication), COL (colophon), BIB (bibliography), APP (appendix), GLO (glossary)
Commercial: ADV (advertisement), LOG (logo)

## OUTPUT FORMAT

Return a JSON object with this structure:
```json
{
"entities": [
{
"hypernym": "GRP",
"hyponym": "GRP.HER",
"text": "Rijksmuseum",
"xpath": "/html/body/div[1]/h1",
"confidence": 0.95,
"class_uri": "glam:HeritageCustodian",
"notes": "Main heritage institution name"
}
],
"layout_regions": [
{
"region": "HDR",
"level": 1,
"semantic_role": "PRIM",
"xpath": "/html/body/div[1]/h1",
"text_preview": "Rijksmuseum Amsterdam",
"contains_entities": ["GRP.HER:Rijksmuseum", "TOP.SET:Amsterdam"]
}
],
"claims": [
{
"claim_type": "full_name",
"claim_value": "Rijksmuseum Amsterdam",
"xpath": "/html/body/div[1]/h1",
"confidence": 0.95,
"source_entities": ["GRP.HER:Rijksmuseum"]
}
],
"relationships": [
{
"relationship_type": "REL.SPA.LOC",
"subject": {"entity_type": "GRP.HER", "text": "Rijksmuseum"},
"object": {"entity_type": "TOP.SET", "text": "Amsterdam"},
"predicate_uri": "schema:location",
"confidence": 0.90
}
]
}
```

## RULES

1. Every claim MUST have an XPath location in the source document
2. Use HYPONYM codes (e.g., GRP.HER, AGT.PER) not just hypernyms (e.g., GRP, AGT)
3. Include class_uri ontology mapping for each entity
4. Confidence scores: 0.9-1.0 (explicit), 0.7-0.9 (clear), 0.5-0.7 (inferred)
5. Entities within layout regions should be cross-referenced
6. Claims without XPath provenance are FABRICATED and must not be included
7. Extract relationships between entities (especially REL.SPA.LOC, REL.ORG.*, REL.CRE.*)

## ⚠️ CRITICAL: COMPREHENSIVE SEMANTIC EXTRACTION ⚠️

8. **DECOMPOSE EVERY NARRATIVE SENTENCE INTO TRIPLES** - A single sentence often contains 5-10 semantic relationships
9. **Extract ALL entities** - not just named entities, but also:
- Quantities (QTY.CNT: "more than 100,000")
- Ethnic groups (GRP.ETH: "Jews", "Sinti", "Roma")
- Abstract concepts (THG.CON: "stories", "memories", "heritage")
- Historical events (THG.EVT: "deportation", "persecution")
- Historical places (TOP.HIS: "concentration camps", "transit camps")
10. **Extract organizational activities** (REL.ORG.ACT): What does the institution DO? (preserve, tell, exhibit, research)
11. **Extract subject matter** (REL.SUB.ABT): What is the institution/collection ABOUT?
12. **Extract quantities** (REL.QTY.CNT): Numbers of visitors, items, people affected
13. **Extract spatial origins and destinations** (REL.SPA.ORG, REL.SPA.DST): Where did things/people come FROM and go TO?
14. **Extract event participation** (REL.EVT.PAR, REL.EVT.VIC): Who was involved in historical events?

### Example: Deep Semantic Parsing

Text: "Het museum bewaart meer dan 5000 voorwerpen uit de Tweede Wereldoorlog"

**INCORRECT** (shallow extraction):
- 1 entity: "Het museum" (GRP.HER)
- 0 relationships

**CORRECT** (deep semantic extraction):
- Entities:
- "Het museum" (GRP.HER)
- "meer dan 5000" (QTY.CNT)
- "voorwerpen" (THG.AFT - artifacts)
- "Tweede Wereldoorlog" (TMP.ERA)
- Relationships:
- REL.ORG.ACT: museum → performs → preservation (bewaart)
- REL.CUS.KEP: voorwerpen → kept by → museum
- REL.QTY.CNT: voorwerpen → quantity → meer dan 5000
- REL.TMP.DUR: voorwerpen → from period → Tweede Wereldoorlog

## CLAIM TYPES FOR HERITAGE INSTITUTIONS

- full_name: Official institution name
- short_name: Abbreviated name or acronym
- description: Institution description
- email: Contact email
- phone: Contact phone
- address: Physical address
- website: Official website URL
- social_media: Social media links (facebook, twitter, instagram, linkedin, youtube)
- opening_hours: Visitor hours
- admission_info: Ticket/entry information
- founding_date: When institution was established
- collection_count: Number of items in collection
- kvk_number: Dutch Chamber of Commerce number
- isil_code: International Standard Identifier for Libraries
- wikidata_id: Wikidata Q-number
- parent_organization: Parent/umbrella organization
"""
|
|
|
|
|
|
class LLMAnnotator:
|
|
"""
|
|
LLM-based document annotator.
|
|
|
|
Uses LLM inference for all entity recognition and claim extraction.
|
|
NO heuristic or pattern-based methods.
|
|
|
|
Example:
|
|
>>> config = LLMAnnotatorConfig(provider=LLMProvider.ZAI, model="glm-4")
|
|
>>> annotator = LLMAnnotator(config)
|
|
>>> session = await annotator.annotate(document)
|
|
>>> print(f"Found {len(session.entity_claims)} entities")
|
|
"""
|
|
|
|
def __init__(self, config: Optional[LLMAnnotatorConfig] = None):
|
|
"""
|
|
Initialize LLM annotator.
|
|
|
|
Args:
|
|
config: LLM configuration (defaults to Z.AI GLM-4)
|
|
"""
|
|
self.config = config or LLMAnnotatorConfig()
|
|
self._client = None
|
|
|
|
if not self.config.api_key:
|
|
raise ValueError(
|
|
f"API key not found for {self.config.provider.value}. "
|
|
f"Set environment variable or pass api_key in config."
|
|
)
|
|
|
|
async def annotate(
|
|
self,
|
|
document: Union[HTMLDocument, str, Path],
|
|
source_url: Optional[str] = None,
|
|
image_dir: Optional[Path] = None,
|
|
) -> AnnotationSession:
|
|
"""
|
|
Annotate a document using LLM inference.
|
|
|
|
Args:
|
|
document: HTMLDocument, HTML string, or path to HTML file
|
|
source_url: Optional source URL for provenance
|
|
image_dir: Optional directory containing downloaded images for vision analysis
|
|
|
|
Returns:
|
|
AnnotationSession with extracted claims
|
|
"""
|
|
# Load document if needed
|
|
html_content: str
|
|
source_file: Optional[str] = None
|
|
if isinstance(document, Path):
|
|
with open(document, 'r', encoding='utf-8') as f:
|
|
html_content = f.read()
|
|
source_url = source_url or str(document)
|
|
source_file = str(document)
|
|
# Auto-detect image directory if not provided
|
|
if image_dir is None:
|
|
image_dir = document.parent
|
|
elif isinstance(document, str):
|
|
# Check if it's a file path (short string, no HTML tags)
|
|
is_file_path = len(document) < 500 and not document.strip().startswith('<')
|
|
if is_file_path:
|
|
try:
|
|
path = Path(document)
|
|
if path.exists():
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
html_content = f.read()
|
|
source_url = source_url or document
|
|
source_file = document
|
|
if image_dir is None:
|
|
image_dir = path.parent
|
|
else:
|
|
html_content = document
|
|
except OSError:
|
|
# Path too long or invalid
|
|
html_content = document
|
|
else:
|
|
html_content = document
|
|
elif isinstance(document, HTMLDocument):
|
|
html_content = document.raw_html
|
|
source_url = source_url or document.source_url
|
|
source_file = document.source_file
|
|
else:
|
|
raise TypeError(f"Unsupported document type: {type(document)}")
|
|
|
|
# Create session
|
|
session = AnnotationSession(
|
|
session_id=f"llm-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}",
|
|
source_url=source_url or "unknown",
|
|
source_file=source_file,
|
|
)
|
|
|
|
# Prepare prompt
|
|
user_prompt = self._prepare_prompt(html_content)
|
|
|
|
# Call LLM for text annotation
|
|
try:
|
|
response = await self._call_llm(user_prompt)
|
|
|
|
# Parse response
|
|
annotations = self._parse_response(response)
|
|
|
|
# Convert to claims
|
|
self._populate_session(session, annotations, source_url)
|
|
|
|
except Exception as e:
|
|
session.errors.append(f"LLM annotation failed: {e}")
|
|
|
|
# Image analysis (if enabled)
|
|
if self.config.extract_images:
|
|
try:
|
|
image_claims = await self.analyze_images_in_html(
|
|
html_content=html_content,
|
|
base_url=source_url,
|
|
image_dir=image_dir,
|
|
)
|
|
for claim in image_claims:
|
|
session.add_image_claim(claim)
|
|
|
|
if image_claims:
|
|
logger.info(f"Analyzed {len(image_claims)} images from document")
|
|
|
|
except Exception as e:
|
|
session.errors.append(f"Image analysis failed: {e}")
|
|
logger.warning(f"Image analysis failed: {e}")
|
|
|
|
session.completed_at = datetime.now(timezone.utc).isoformat()
|
|
return session
|
|
|
|
    def _prepare_prompt(self, html_content: str) -> str:
        """Build the user prompt for the text-annotation LLM call.

        Args:
            html_content: Raw HTML of the document to annotate.

        Returns:
            Prompt string embedding the (possibly truncated) HTML plus
            extraction requirements that reinforce the system prompt.
        """
        # Truncate if too long (LLM context limits)
        max_chars = 30000
        if len(html_content) > max_chars:
            html_content = html_content[:max_chars] + "\n... [truncated]"

        return f"""Analyze the following HTML document and extract all entities, layout regions, claims, and relationships.

Return a JSON object following the schema in the system prompt.

HTML DOCUMENT:
```html
{html_content}
```

## ⚠️ CRITICAL EXTRACTION REQUIREMENTS ⚠️

### 1. COMPREHENSIVE ENTITY EXTRACTION
Extract ALL entities, not just named entities:
- Heritage institutions (GRP.HER)
- Ethnic groups (GRP.ETH): Jews, Roma, Sinti, etc.
- Quantities (QTY.CNT): "more than 100,000", "5000 objects"
- Historical events (THG.EVT): deportation, persecution, liberation
- Abstract concepts (THG.CON): stories, memories, heritage, mission
- Historical places (TOP.HIS): concentration camps, transit camps
- Time periods (TMP.ERA): World War II, Holocaust

### 2. COMPREHENSIVE RELATIONSHIP EXTRACTION
Decompose EVERY narrative sentence into semantic triples:
- REL.ORG.ACT: What activities does the organization perform? (preserve, tell, exhibit, research, commemorate)
- REL.SUB.ABT: What is the collection/institution/story ABOUT?
- REL.QTY.CNT: Quantities of people, objects, visitors
- REL.SPA.ORG: Where did people/things come FROM?
- REL.SPA.DST: Where did people/things go TO?
- REL.EVT.PAR: Who participated in events (voluntary or forced)?
- REL.EVT.VIC: Who were victims of events?

### 3. EXAMPLE - WHAT WE EXPECT

For text: "In het Herinneringscentrum vertellen we de verhalen van meer dan honderdduizend Joden"

Extract:
- **Entities**:
- Herinneringscentrum (GRP.HER)
- de verhalen (THG.CON - stories/narratives)
- meer dan honderdduizend (QTY.CNT - >100,000)
- Joden (GRP.ETH - Jews as ethnic group)

- **Relationships**:
- REL.ORG.ACT: Herinneringscentrum → tells → verhalen
- REL.SUB.ABT: verhalen → about → Joden
- REL.QTY.CNT: Joden → quantity → meer dan honderdduizend

### 4. DO NOT:
- Skip abstract concepts or quantities
- Extract only named entities
- Ignore the semantic relationships within sentences
- Produce shallow extractions with few relationships

IMPORTANT: The richness of semantic extraction is critical. A single paragraph may contain 10-20 relationships.
"""
|
|
|
|
def _calculate_backoff_delay(self, attempt: int) -> float:
|
|
"""
|
|
Calculate delay for exponential backoff.
|
|
|
|
Args:
|
|
attempt: Current retry attempt number (0-indexed)
|
|
|
|
Returns:
|
|
Delay in seconds
|
|
"""
|
|
retry = self.config.retry
|
|
delay = retry.base_delay * (retry.exponential_base ** attempt)
|
|
delay = min(delay, retry.max_delay)
|
|
|
|
# Add jitter to prevent thundering herd
|
|
if retry.jitter:
|
|
delay = delay * (0.5 + random.random())
|
|
|
|
return delay
|
|
|
|
def _get_api_key_for_provider(self, provider: LLMProvider) -> Optional[str]:
|
|
"""Get API key for a specific provider from environment."""
|
|
env_vars = {
|
|
LLMProvider.ZAI: "ZAI_API_TOKEN",
|
|
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
}
|
|
return os.environ.get(env_vars.get(provider, ""))
|
|
|
|
async def _call_provider(
|
|
self,
|
|
provider: LLMProvider,
|
|
user_prompt: str,
|
|
api_key: Optional[str] = None,
|
|
) -> str:
|
|
"""
|
|
Call a specific LLM provider.
|
|
|
|
Args:
|
|
provider: Which provider to call
|
|
user_prompt: The user prompt to send
|
|
api_key: Optional API key override
|
|
|
|
Returns:
|
|
LLM response string
|
|
"""
|
|
# Use provided key or get from environment
|
|
key = api_key or self._get_api_key_for_provider(provider)
|
|
|
|
if not key:
|
|
raise ValueError(f"No API key available for {provider.value}")
|
|
|
|
if provider == LLMProvider.ZAI:
|
|
return await self._call_zai(user_prompt, key)
|
|
elif provider == LLMProvider.ANTHROPIC:
|
|
return await self._call_anthropic(user_prompt, key)
|
|
elif provider == LLMProvider.OPENAI:
|
|
return await self._call_openai(user_prompt, key)
|
|
else:
|
|
raise ValueError(f"Unsupported provider: {provider}")
|
|
|
|
    async def _call_llm(self, user_prompt: str) -> str:
        """
        Call the LLM API with retry logic and provider fallback.

        Implements:
        1. Exponential backoff with jitter for rate limits
        2. Automatic fallback to alternative providers on failure

        The provider chain is the configured primary provider followed by any
        configured fallback providers; each provider gets up to
        ``config.retry.max_retries`` attempts before the next one is tried.

        Returns:
            LLM response string

        Raises:
            RuntimeError: If all retries and fallbacks are exhausted
        """
        import httpx

        # Build provider chain: primary + fallbacks
        providers_to_try = [self.config.provider]
        if self.config.fallback_providers:
            providers_to_try.extend(self.config.fallback_providers)

        last_exception: Optional[Exception] = None

        for provider in providers_to_try:
            # The explicitly configured key only applies to the primary
            # provider; fallback providers look up their own key in the env.
            api_key = (
                self.config.api_key
                if provider == self.config.provider
                else self._get_api_key_for_provider(provider)
            )

            if not api_key:
                logger.info(f"Skipping {provider.value}: no API key available")
                continue

            logger.info(f"Trying provider: {provider.value}")

            for attempt in range(self.config.retry.max_retries):
                try:
                    return await self._call_provider(provider, user_prompt, api_key)

                except httpx.HTTPStatusError as e:
                    status_code = e.response.status_code

                    if status_code in self.config.retry.retry_on_status:
                        # Retryable status (e.g. 429): back off and retry.
                        delay = self._calculate_backoff_delay(attempt)
                        logger.warning(
                            f"Provider {provider.value} returned {status_code} "
                            f"(attempt {attempt + 1}/{self.config.retry.max_retries}). "
                            f"Retrying in {delay:.2f}s..."
                        )
                        await asyncio.sleep(delay)
                        last_exception = e
                    else:
                        # Non-retryable error, try next provider
                        logger.error(
                            f"Provider {provider.value} returned non-retryable "
                            f"status {status_code}: {e}"
                        )
                        last_exception = e
                        break

                except httpx.TimeoutException as e:
                    # Timeouts are always treated as retryable.
                    delay = self._calculate_backoff_delay(attempt)
                    logger.warning(
                        f"Provider {provider.value} timed out "
                        f"(attempt {attempt + 1}/{self.config.retry.max_retries}). "
                        f"Retrying in {delay:.2f}s..."
                    )
                    await asyncio.sleep(delay)
                    last_exception = e

                except Exception as e:
                    # Any other failure: abandon this provider immediately and
                    # fall through to the next provider in the chain.
                    logger.error(f"Provider {provider.value} failed: {e}")
                    last_exception = e
                    break

            # All retries exhausted for this provider, try next
            # NOTE(review): this line also runs after a non-retryable `break`,
            # where "exhausted all retries" is slightly misleading — confirm
            # whether the two exit paths should log differently.
            logger.warning(f"Provider {provider.value} exhausted all retries")

        # All providers failed
        raise RuntimeError(
            f"All LLM providers failed. Last error: {last_exception}"
        ) from last_exception
|
|
|
|
async def _call_zai(self, user_prompt: str, api_key: str) -> str:
|
|
"""
|
|
Call Z.AI API using Anthropic-compatible endpoint.
|
|
|
|
Z.AI GLM Coding Plan provides an Anthropic-compatible API at:
|
|
https://api.z.ai/api/anthropic/v1/messages
|
|
|
|
Uses same message format as Anthropic Claude API.
|
|
"""
|
|
import httpx
|
|
|
|
# Z.AI Anthropic-compatible endpoint
|
|
url = "https://api.z.ai/api/anthropic/v1/messages"
|
|
|
|
# Z.AI uses Anthropic-style headers
|
|
headers = {
|
|
"x-api-key": api_key,
|
|
"anthropic-version": "2023-06-01",
|
|
"Content-Type": "application/json",
|
|
}
|
|
|
|
# Map Z.AI model names - GLM models available via Anthropic API
|
|
# Default to claude-3-5-sonnet if model not explicitly set for Z.AI
|
|
model = self.config.model
|
|
if model.startswith("glm-"):
|
|
# Z.AI's Anthropic endpoint uses Claude model names
|
|
model = "claude-sonnet-4-20250514"
|
|
|
|
payload = {
|
|
"model": model,
|
|
"max_tokens": self.config.max_tokens,
|
|
"system": GLAM_NER_SYSTEM_PROMPT,
|
|
"messages": [
|
|
{"role": "user", "content": user_prompt},
|
|
],
|
|
}
|
|
|
|
async with httpx.AsyncClient(timeout=self.config.timeout) as client:
|
|
response = await client.post(url, headers=headers, json=payload)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
# Anthropic response format
|
|
return data["content"][0]["text"]
|
|
|
|
async def _call_anthropic(self, user_prompt: str, api_key: str) -> str:
|
|
"""Call Anthropic Claude API."""
|
|
import httpx
|
|
|
|
url = "https://api.anthropic.com/v1/messages"
|
|
|
|
headers = {
|
|
"x-api-key": api_key,
|
|
"anthropic-version": "2023-06-01",
|
|
"Content-Type": "application/json",
|
|
}
|
|
|
|
# Use Claude-specific model name for fallback
|
|
model = (
|
|
self.config.model
|
|
if self.config.provider == LLMProvider.ANTHROPIC
|
|
else "claude-3-5-sonnet-20241022"
|
|
)
|
|
|
|
payload = {
|
|
"model": model,
|
|
"max_tokens": self.config.max_tokens,
|
|
"system": GLAM_NER_SYSTEM_PROMPT,
|
|
"messages": [
|
|
{"role": "user", "content": user_prompt},
|
|
],
|
|
}
|
|
|
|
async with httpx.AsyncClient(timeout=self.config.timeout) as client:
|
|
response = await client.post(url, headers=headers, json=payload)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
return data["content"][0]["text"]
|
|
|
|
async def _call_openai(self, user_prompt: str, api_key: str) -> str:
|
|
"""Call OpenAI GPT-4 API."""
|
|
import httpx
|
|
|
|
url = "https://api.openai.com/v1/chat/completions"
|
|
|
|
headers = {
|
|
"Authorization": f"Bearer {api_key}",
|
|
"Content-Type": "application/json",
|
|
}
|
|
|
|
# Use OpenAI-specific model name for fallback
|
|
model = (
|
|
self.config.model
|
|
if self.config.provider == LLMProvider.OPENAI
|
|
else "gpt-4o"
|
|
)
|
|
|
|
payload = {
|
|
"model": model,
|
|
"messages": [
|
|
{"role": "system", "content": GLAM_NER_SYSTEM_PROMPT},
|
|
{"role": "user", "content": user_prompt},
|
|
],
|
|
"temperature": self.config.temperature,
|
|
"max_tokens": self.config.max_tokens,
|
|
}
|
|
|
|
async with httpx.AsyncClient(timeout=self.config.timeout) as client:
|
|
response = await client.post(url, headers=headers, json=payload)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
return data["choices"][0]["message"]["content"]
|
|
|
|
# =========================================================================
|
|
# IMAGE ANALYSIS METHODS (Z.AI GLM-4.5V Vision API)
|
|
# =========================================================================
|
|
|
|
    async def _analyze_image(
        self,
        image_url: Optional[str] = None,
        image_base64: Optional[str] = None,
        image_path: Optional[str] = None,
        alt_text: Optional[str] = None,
        context: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Analyze an image using Z.AI GLM-4.5V vision model.

        Extracts visual descriptions, entities, OCR text, and heritage relevance.
        Exactly one image source is used, checked in this precedence order:
        image_base64, then image_path, then image_url.

        Args:
            image_url: URL of the image (absolute or relative)
            image_base64: Base64-encoded image data
            image_path: Local file path to image
            alt_text: HTML alt text for context
            context: Surrounding text context from the page

        Returns:
            Dict with analysis results:
            {
                "description": "Natural language description",
                "detected_entities": [{"type": "AGT.PER", "text": "...", "confidence": 0.9}],
                "extracted_text": "OCR text if present",
                "heritage_relevance": "Why this matters for heritage",
                "image_type": "photograph|painting|document|map|artifact|other",
                "era_estimate": "Estimated time period",
                "style": "Photographic/artistic style",
                "analysis_confidence": 0.85
            }
            On failure a dict containing an "error" key is returned instead.
        """
        import httpx
        import base64

        # Prepare image content for API
        image_content = None

        if image_base64:
            # Already base64 encoded
            image_content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
        elif image_path:
            # Read and encode local file
            try:
                path = Path(image_path)
                if path.exists():
                    with open(path, 'rb') as f:
                        img_data = base64.b64encode(f.read()).decode('utf-8')
                    # Detect MIME type from extension
                    ext = path.suffix.lower()
                    mime_types = {
                        '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
                        '.png': 'image/png', '.gif': 'image/gif',
                        '.webp': 'image/webp', '.bmp': 'image/bmp'
                    }
                    mime_type = mime_types.get(ext, 'image/jpeg')
                    image_content = {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_data}"}}
                # NOTE(review): if the path does not exist, image_content stays
                # None and a provided image_url is NOT tried (elif chain below
                # is skipped) — confirm whether falling back to the URL is
                # the desired behavior.
            except Exception as e:
                logger.warning(f"Failed to read image file {image_path}: {e}")
                return {"error": f"Failed to read image: {e}"}
        elif image_url:
            # Use URL directly (works for absolute URLs)
            if image_url.startswith('http://') or image_url.startswith('https://'):
                image_content = {"type": "image_url", "image_url": {"url": image_url}}
            else:
                # Relative URL - can't analyze without base URL
                logger.debug(f"Skipping relative URL image: {image_url}")
                return {"error": "Relative URL - cannot analyze without base URL"}

        if not image_content:
            return {"error": "No valid image source provided"}

        # Build prompt for heritage image analysis
        prompt_parts = [
            "Analyze this image from a heritage institution website.",
            "",
            "Provide a JSON response with the following fields:",
            "- description: Detailed description of what the image shows",
            "- detected_entities: Array of entities visible in the image, each with {type, text, confidence}",
            "  - Use GLAM-NER types: AGT.PER (person), WRK.VIS (artwork), THG.ART (artifact), TOP.BLD (building), GRP.ETH (ethnic group), etc.",
            "- extracted_text: Any text visible in the image (OCR)",
            "- heritage_relevance: Why this image is significant for heritage/cultural preservation",
            "- image_type: One of: photograph, painting, document, map, artifact, museum_object, historical_photo, memorial, building, portrait, group_photo, exhibition, other",
            "- era_estimate: Estimated time period of the content (e.g., '1940s', 'World War II', 'medieval', 'contemporary')",
            "- style: Artistic or photographic style",
            "- analysis_confidence: Your confidence in this analysis (0.0-1.0)",
        ]

        if alt_text:
            prompt_parts.extend(["", f"HTML alt text: {alt_text}"])

        if context:
            # Only the first 500 chars of page context are forwarded.
            prompt_parts.extend(["", f"Page context: {context[:500]}..."])

        prompt_parts.extend([
            "",
            "Return ONLY valid JSON, no markdown code blocks."
        ])

        prompt = "\n".join(prompt_parts)

        # Call Z.AI GLM-4.5V Vision API
        # Z.AI uses OpenAI-compatible format for vision at a different endpoint
        url = "https://api.z.ai/api/paas/v4/chat/completions"

        headers = {
            "Authorization": f"Bearer {self.config.api_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "model": self.config.vision_model,  # "glm-4.5v"
            "messages": [
                {
                    "role": "user",
                    "content": [
                        image_content,
                        {"type": "text", "text": prompt}
                    ]
                }
            ],
            "max_tokens": self.config.vision_max_tokens,
            "temperature": 0.1,
        }

        # Retry logic with exponential backoff for vision API
        max_retries = self.config.retry.max_retries
        base_delay = self.config.retry.base_delay

        for attempt in range(max_retries + 1):
            try:
                async with httpx.AsyncClient(timeout=60) as client:
                    response = await client.post(url, headers=headers, json=payload)

                    # Check for rate limit
                    if response.status_code == 429:
                        if attempt < max_retries:
                            # Exponential backoff with optional small jitter,
                            # capped at the configured max delay.
                            delay = base_delay * (2 ** attempt)
                            if self.config.retry.jitter:
                                delay += random.uniform(0, delay * 0.1)
                            delay = min(delay, self.config.retry.max_delay)
                            logger.info(f"Vision API rate limited, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
                            await asyncio.sleep(delay)
                            continue
                        else:
                            return {"error": "Vision API rate limited after max retries"}

                    response.raise_for_status()
                    data = response.json()

                    # Parse response content
                    content = data.get("choices", [{}])[0].get("message", {}).get("content", "")

                    # Try to parse as JSON
                    try:
                        result = json.loads(content)
                        return result
                    except json.JSONDecodeError:
                        # If not valid JSON, return the text as description
                        return {
                            "description": content,
                            "detected_entities": [],
                            "analysis_confidence": 0.5,
                            "error": "Response was not valid JSON"
                        }

            except httpx.HTTPStatusError as e:
                if e.response.status_code in self.config.retry.retry_on_status and attempt < max_retries:
                    delay = base_delay * (2 ** attempt)
                    logger.info(f"Vision API error {e.response.status_code}, retrying in {delay:.1f}s")
                    await asyncio.sleep(delay)
                    continue
                logger.warning(f"Vision API HTTP error: {e.response.status_code}")
                return {"error": f"Vision API error: {e.response.status_code}"}
            except Exception as e:
                # NOTE(review): timeouts/connection errors land here and are
                # returned without retry — only HTTP status errors above are
                # retried. Confirm whether timeouts should also be retried.
                logger.warning(f"Vision API call failed: {e}")
                return {"error": str(e)}

        return {"error": "Vision API failed after retries"}
|
|
|
|
    async def analyze_images_in_html(
        self,
        html_content: str,
        base_url: Optional[str] = None,
        image_dir: Optional[Path] = None,
    ) -> List[ImageClaim]:
        """
        Extract and analyze all images from an HTML document.

        Images are filtered (data: URIs skipped, images below
        ``config.min_image_size`` skipped, at most
        ``config.max_images_per_page`` analyzed) and each remaining image is
        passed to :meth:`_analyze_image`; failed analyses are dropped.

        Args:
            html_content: Raw HTML content
            base_url: Base URL for resolving relative image paths
            image_dir: Local directory containing downloaded images

        Returns:
            List of ImageClaim objects with analysis results
        """
        from lxml import html as lxml_html
        from urllib.parse import urljoin

        image_claims: List[ImageClaim] = []

        # Parse HTML
        try:
            tree = lxml_html.fromstring(html_content)
        except Exception as e:
            logger.warning(f"Failed to parse HTML for image extraction: {e}")
            return []

        # Find all img elements
        img_elements = tree.xpath('//img[@src]')

        # Limit number of images per page
        if len(img_elements) > self.config.max_images_per_page:
            logger.info(f"Limiting image analysis to {self.config.max_images_per_page} of {len(img_elements)} images")
            img_elements = img_elements[:self.config.max_images_per_page]

        for img in img_elements:
            src = img.get('src', '')
            alt = img.get('alt', '')
            title = img.get('title', '')
            width_str = img.get('width', '')
            height_str = img.get('height', '')

            # Skip data URIs and tiny images
            if src.startswith('data:'):
                continue

            # Parse dimensions
            try:
                width = int(width_str) if width_str.isdigit() else None
                height = int(height_str) if height_str.isdigit() else None

                # Skip tiny images (likely icons/decorations)
                if width and height:
                    if width < self.config.min_image_size or height < self.config.min_image_size:
                        continue
            except (ValueError, TypeError):
                width = height = None

            # Get XPath for provenance
            xpath = tree.getroottree().getpath(img)

            # Get surrounding context text
            parent = img.getparent()
            context = ""
            if parent is not None:
                # Get text from parent and siblings (tail text of each child);
                # truncated to 300 chars.
                context_parts = []
                if parent.text:
                    context_parts.append(parent.text.strip())
                for sibling in parent:
                    if sibling.tail:
                        context_parts.append(sibling.tail.strip())
                context = " ".join(context_parts)[:300]

            # Resolve image URL
            image_url = None
            image_path = None
            image_base64 = None

            if src.startswith('http://') or src.startswith('https://'):
                image_url = src
            elif base_url:
                image_url = urljoin(base_url, src)

            # Check for local copy
            if image_dir:
                # Try to find the image in the local directory under a few
                # plausible layouts (flat filename, path minus leading /, path).
                src_filename = Path(src).name
                local_candidates = [
                    image_dir / src_filename,
                    image_dir / src.lstrip('/'),
                    image_dir / src,
                ]
                for candidate in local_candidates:
                    if candidate.exists():
                        image_path = str(candidate)
                        break

            # Analyze the image
            analysis = await self._analyze_image(
                image_url=image_url,
                image_path=image_path,
                image_base64=image_base64,
                alt_text=alt or title,
                context=context,
            )

            # Skip if analysis failed
            if analysis.get("error"):
                logger.debug(f"Skipping image {src}: {analysis.get('error')}")
                continue

            # Create ImageClaim
            claim = ImageClaim(
                image_url=image_url or src,
                image_path=image_path,
                alt_text=alt if alt else None,
                title=title if title else None,
                width=width,
                height=height,
                xpath=xpath,
                description=analysis.get("description"),
                detected_entities=analysis.get("detected_entities", []),
                extracted_text=analysis.get("extracted_text"),
                heritage_relevance=analysis.get("heritage_relevance"),
                image_type=analysis.get("image_type"),
                era_estimate=analysis.get("era_estimate"),
                style=analysis.get("style"),
                analysis_model=self.config.vision_model,
                analysis_confidence=analysis.get("analysis_confidence"),
                provenance=Provenance(
                    namespace="glam-ner",
                    path=xpath,
                    timestamp=datetime.now(timezone.utc).isoformat(),
                    agent=f"LLMAnnotator/{self.config.vision_model}",
                    context_convention="GLAM-NER v1.7.0-unified/vision",
                    confidence=analysis.get("analysis_confidence", 0.5),
                ),
            )

            image_claims.append(claim)

        return image_claims
|
|
|
|
def _parse_response(self, response: str) -> Dict[str, Any]:
|
|
"""Parse LLM response JSON."""
|
|
# Find JSON in response (may be wrapped in markdown code blocks)
|
|
import re
|
|
|
|
# Try to find JSON block
|
|
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', response)
|
|
if json_match:
|
|
json_str = json_match.group(1)
|
|
else:
|
|
# Try to parse whole response as JSON
|
|
json_str = response
|
|
|
|
try:
|
|
return json.loads(json_str)
|
|
except json.JSONDecodeError:
|
|
# Return empty structure if parsing fails
|
|
return {"entities": [], "layout_regions": [], "claims": []}
|
|
|
|
    def _populate_session(
        self,
        session: AnnotationSession,
        annotations: Dict[str, Any],
        source_url: Optional[str],
    ) -> None:
        """Populate session with parsed annotations.

        Converts the raw LLM JSON sections (``entities``, ``layout_regions``,
        ``relationships``, ``claims``) into typed claim objects appended to
        *session*. Unknown hypernym/region/role codes fall back to generic
        defaults rather than raising.

        Note: *source_url* is currently unused in this method — provenance is
        built from each item's xpath instead.
        """
        timestamp = datetime.now(timezone.utc).isoformat()

        # Build entity ID lookup for relationship linking
        entity_id_lookup: Dict[str, str] = {}

        # Process entities
        for entity in annotations.get("entities", []):
            # Parse hypernym - can come from explicit field or be inferred from hyponym
            hypernym_str = entity.get("hypernym", "THG")
            hyponym_str = entity.get("hyponym", "")

            # If hyponym has dot notation (e.g., "GRP.HER"), extract hypernym from it
            if hyponym_str and "." in hyponym_str:
                hypernym_str = hyponym_str.split(".")[0]

            try:
                hypernym = EntityHypernym(hypernym_str)
            except ValueError:
                # Unknown code: fall back to the generic "thing" hypernym.
                hypernym = EntityHypernym.THG

            # Generate unique claim ID
            claim_id = f"entity-{len(session.entity_claims)+1}"

            # Store lookup for relationship linking (by text span)
            entity_text = entity.get("text", "")
            if entity_text:
                entity_id_lookup[entity_text] = claim_id

            # Get class_uri from hyponym (or hypernym as fallback)
            # hyponym takes precedence for more specific ontology mapping
            class_uri = None
            if hyponym_str:
                class_uri = get_ontology_class(hyponym_str)
            if not class_uri:
                class_uri = get_ontology_class(hypernym_str)

            # Map LLM response fields to EntityClaim fields
            # EntityClaim inherits text_content from Claim base class
            claim = EntityClaim(
                claim_id=claim_id,
                hypernym=hypernym,
                hyponym=hyponym_str if hyponym_str else "unknown",
                text_content=entity_text,  # LLM returns "text", we use text_content
                class_uri=class_uri,  # Auto-populated from hyponym/hypernym
                isil_id=entity.get("isil_id"),  # If present from LLM
                cidoc_class=entity.get("cidoc_class"),  # Backwards compatibility
                recognition_confidence=entity.get("confidence", 0.5),
                provenance=Provenance(
                    namespace="glam-ner",
                    path=entity.get("xpath", ""),
                    timestamp=timestamp,
                    agent=f"{self.config.provider.value}/{self.config.model}",
                    context_convention=self.config.context_convention,
                ),
            )
            session.entity_claims.append(claim)

        # Process layout regions
        for region in annotations.get("layout_regions", []):
            try:
                region_type = LayoutRegion(region.get("region", "PAR"))
            except ValueError:
                # Unknown region code: default to paragraph.
                region_type = LayoutRegion.PAR

            try:
                semantic_role = SemanticRole(region.get("semantic_role", "PRIM"))
            except ValueError:
                # Unknown role code: default to primary content.
                semantic_role = SemanticRole.PRIM

            # LayoutClaim uses text_content from base Claim class
            claim = LayoutClaim(
                claim_id=f"layout-{len(session.layout_claims)+1}",
                region=region_type,
                semantic_role=semantic_role,
                xpath=region.get("xpath", ""),
                text_content=region.get("text_preview", "")[:200],  # Use text_content
                provenance=Provenance(
                    namespace="glam-ner",
                    path=region.get("xpath", ""),
                    timestamp=timestamp,
                    agent=f"{self.config.provider.value}/{self.config.model}",
                    context_convention=self.config.context_convention,
                ),
            )
            session.layout_claims.append(claim)

        # Process relationships
        for rel in annotations.get("relationships", []):
            self._process_relationship(
                session=session,
                rel_data=rel,
                entity_id_lookup=entity_id_lookup,
                timestamp=timestamp,
            )

        # Process claims (aggregate)
        for claim_data in annotations.get("claims", []):
            claim = AggregateClaim(
                claim_id=f"claim-{len(session.aggregate_claims)+1}",
                claim_type=claim_data.get("claim_type", "unknown"),
                claim_value=claim_data.get("claim_value", ""),
                text_content=claim_data.get("claim_value", ""),  # Store value in text_content too
                provenance=Provenance(
                    namespace="glam-ner",
                    path=claim_data.get("xpath", ""),
                    timestamp=timestamp,
                    agent=f"{self.config.provider.value}/{self.config.model}",
                    context_convention=self.config.context_convention,
                    confidence=claim_data.get("confidence", 0.5),
                ),
            )
            session.aggregate_claims.append(claim)
|
|
|
|
    def _process_relationship(
        self,
        session: AnnotationSession,
        rel_data: Dict[str, Any],
        entity_id_lookup: Dict[str, str],
        timestamp: str,
    ) -> None:
        """Process a relationship from LLM response and add to session.

        Args:
            session: Target session; the claim is added via
                ``session.add_relationship_claim`` and any constraint
                violations are appended to ``session.errors``.
            rel_data: One relationship object from the parsed LLM JSON.
            entity_id_lookup: Maps entity text spans to previously created
                entity claim IDs so subject/object can be linked.
            timestamp: ISO timestamp shared by all claims of this session.
        """
        # Parse relationship type
        rel_type = rel_data.get("relationship_type", "REL.CRE")
        rel_hypernym = None
        rel_hyponym = None

        # Extract hypernym and hyponym from relationship type
        if rel_type and "." in rel_type:
            parts = rel_type.split(".")
            if len(parts) >= 2:
                hypernym_str = f"{parts[0]}.{parts[1]}"  # e.g., "REL.CRE"
                try:
                    rel_hypernym = RelationshipHypernym(hypernym_str)
                except ValueError:
                    # Unknown hypernym code: leave as None rather than guess.
                    pass
                rel_hyponym = rel_type  # Full code, e.g., "REL.CRE.AUT"

        # Parse subject
        subject_data = rel_data.get("subject", {})
        subject_text = subject_data.get("text", "")
        subject = RelationshipSubject(
            entity_id=entity_id_lookup.get(subject_text),
            entity_type=subject_data.get("entity_type") or subject_data.get("type"),  # Support both keys
            span_text=subject_text,
            uri=subject_data.get("uri"),
        )

        # Parse object
        object_data = rel_data.get("object", {})
        object_text = object_data.get("text", "")
        obj = RelationshipObject(
            entity_id=entity_id_lookup.get(object_text),
            entity_type=object_data.get("entity_type") or object_data.get("type"),  # Support both keys
            span_text=object_text,
            uri=object_data.get("uri"),
        )

        # Parse predicate
        predicate_data = rel_data.get("predicate", {})
        predicate = RelationshipPredicate(
            uri=predicate_data.get("uri"),
            label=predicate_data.get("label", rel_type),
            direction=predicate_data.get("direction", "FORWARD"),
        )

        # Parse temporal scope (if present)
        temporal_scope = None
        temporal_data = rel_data.get("temporal", {})
        if temporal_data:
            temporal_scope = TemporalScope(
                start_date=temporal_data.get("start_date"),
                end_date=temporal_data.get("end_date"),
                temporal_modifier=temporal_data.get("modifier"),
            )

        # Parse spatial scope (if present)
        spatial_scope = None
        spatial_data = rel_data.get("spatial", {})
        if spatial_data:
            spatial_scope = SpatialScope(
                place_id=spatial_data.get("place_id"),
                place_name=spatial_data.get("place_name"),
                geo_uri=spatial_data.get("geo_uri"),
            )

        # Parse qualifiers (if present)
        qualifiers = []
        for qual_data in rel_data.get("qualifiers", []):
            qualifiers.append(RelationshipQualifier(
                qualifier_type=qual_data.get("type", ""),
                qualifier_value=qual_data.get("value", ""),
                qualifier_uri=qual_data.get("uri"),
            ))

        # Create relationship claim
        claim = RelationshipClaim(
            claim_id=f"rel-{len(session.relationship_claims)+1}",
            relationship_hypernym=rel_hypernym,
            relationship_hyponym=rel_hyponym,
            subject=subject,
            predicate=predicate,
            object=obj,
            temporal_scope=temporal_scope,
            spatial_scope=spatial_scope,
            qualifiers=qualifiers,
            negation=rel_data.get("negation", False),
            hypothetical=rel_data.get("hypothetical", False),
            source_claim=rel_data.get("source_claim", False),
            attributed_to=rel_data.get("attributed_to"),
            extraction_confidence=rel_data.get("confidence", 0.5),
            text_content=rel_data.get("text", ""),  # Original text span
            provenance=Provenance(
                namespace="glam-ner",
                path=rel_data.get("xpath", ""),
                timestamp=timestamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
                confidence=rel_data.get("confidence", 0.5),
            ),
        )

        # Validate domain/range constraints
        if rel_hyponym:
            validation_result = validate_relationship_constraints(
                relationship_type=rel_hyponym,
                subject_type=subject.entity_type,
                object_type=obj.entity_type,
                strict=False,  # Treat violations as warnings, not errors
            )

            # Add any validation warnings to session errors
            if validation_result.warnings:
                for warning in validation_result.warnings:
                    session.errors.append(f"[VALIDATION WARNING] {claim.claim_id}: {warning}")
                    logger.warning(f"Relationship validation: {warning}")

            if validation_result.errors:
                for error in validation_result.errors:
                    session.errors.append(f"[VALIDATION ERROR] {claim.claim_id}: {error}")
                    logger.error(f"Relationship validation: {error}")

        # The claim is added even when validation produced errors (non-strict).
        session.add_relationship_claim(claim)
|
|
|
|
async def annotate_batch(
|
|
self,
|
|
documents: List[Union[HTMLDocument, str, Path]],
|
|
source_urls: Optional[List[str]] = None,
|
|
concurrency: int = 3,
|
|
) -> List[AnnotationSession]:
|
|
"""
|
|
Annotate multiple documents concurrently.
|
|
|
|
Args:
|
|
documents: List of documents to annotate
|
|
source_urls: Optional list of source URLs
|
|
concurrency: Maximum concurrent requests
|
|
|
|
Returns:
|
|
List of AnnotationSessions
|
|
"""
|
|
urls: List[Optional[str]] = list(source_urls) if source_urls else [None] * len(documents)
|
|
|
|
semaphore = asyncio.Semaphore(concurrency)
|
|
|
|
async def annotate_with_semaphore(doc: Union[HTMLDocument, str, Path], url: Optional[str]) -> AnnotationSession:
|
|
async with semaphore:
|
|
return await self.annotate(doc, url)
|
|
|
|
tasks = [
|
|
annotate_with_semaphore(doc, url)
|
|
for doc, url in zip(documents, urls)
|
|
]
|
|
|
|
return await asyncio.gather(*tasks)
|
|
|
|
    async def annotate_with_schema(
        self,
        document: Union[HTMLDocument, str, Path],
        schema: Optional[GLAMSchema] = None,
        source_url: Optional[str] = None,
        validate_output: bool = True,
    ) -> Tuple[AnnotationSession, Dict[str, Any]]:
        """
        Annotate a document using schema-driven extraction.

        This method uses GLAMSchema to:
        1. Generate targeted extraction prompts
        2. Extract structured fields defined in the schema
        3. Optionally validate output against JSON Schema

        Args:
            document: HTMLDocument, HTML string, or path to HTML file
            schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
            source_url: Optional source URL for provenance
            validate_output: Whether to validate extracted data against schema

        Returns:
            Tuple of (AnnotationSession, structured_data dict). On LLM failure
            the error is recorded in ``session.errors`` and the structured
            data dict is empty.

        Example:
            >>> schema = (
            ...     GLAMSchema("custom")
            ...     .entities("GRP", "TOP")
            ...     .structure()
            ...     .field("name::str::Institution name")  # GLiNER2 syntax
            ...     .field("type::[MUSEUM|ARCHIVE]::str::Type")
            ...     .build()
            ... )
            >>> session, data = await annotator.annotate_with_schema(doc, schema)
            >>> print(data["structured"]["name"])
        """
        # Use default schema if not provided
        if schema is None:
            schema = heritage_custodian_schema()

        # Load document: a Path is read from disk, a string is treated as a
        # file path when it is short and tag-free, otherwise as raw HTML.
        html_content: str
        if isinstance(document, Path):
            with open(document, 'r', encoding='utf-8') as f:
                html_content = f.read()
            source_url = source_url or str(document)
        elif isinstance(document, str):
            # Check if it's a file path (short string, no HTML tags)
            is_file_path = len(document) < 500 and not document.strip().startswith('<')
            if is_file_path:
                try:
                    path = Path(document)
                    if path.exists():
                        with open(path, 'r', encoding='utf-8') as f:
                            html_content = f.read()
                        source_url = source_url or document
                    else:
                        html_content = document
                except OSError:
                    # Path too long or invalid
                    html_content = document
            else:
                html_content = document
        elif isinstance(document, HTMLDocument):
            html_content = document.raw_html
            source_url = source_url or document.source_url
        else:
            raise TypeError(f"Unsupported document type: {type(document)}")

        # Create session
        session = AnnotationSession(
            session_id=f"schema-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}",
            source_url=source_url or "unknown",
        )

        # Generate schema-aware prompt
        schema_prompt = schema.to_llm_prompt(include_examples=True, output_format="json")
        user_prompt = self._prepare_schema_prompt(html_content, schema_prompt)

        structured_data: Dict[str, Any] = {}

        try:
            # Call LLM with schema-aware prompt
            response = await self._call_llm(user_prompt)

            # Parse response
            annotations = self._parse_response(response)

            # Extract structured data from response
            structured_data = annotations.get("structured", {})

            # Also extract classifications if present (stored under a
            # leading-underscore key to keep them out of the schema fields)
            if "classifications" in annotations:
                structured_data["_classifications"] = annotations["classifications"]

            # Extract relations if present
            if "relations" in annotations:
                structured_data["_relations"] = annotations["relations"]

            # Validate against JSON Schema if requested
            if validate_output and structured_data:
                validation_errors = self._validate_structured_output(structured_data, schema)
                if validation_errors:
                    session.errors.extend(validation_errors)

            # Populate session with entity and claim data
            self._populate_session(session, annotations, source_url)

            # Add structured data to session config
            session.config["structured_data"] = structured_data
            session.config["schema_name"] = schema.name

        except Exception as e:
            # Best-effort: record the failure and still return the session.
            session.errors.append(f"Schema-driven annotation failed: {e}")

        session.completed_at = datetime.now(timezone.utc).isoformat()
        return session, structured_data
|
|
|
|
def _prepare_schema_prompt(self, html_content: str, schema_prompt: str) -> str:
|
|
"""Prepare prompt with schema instructions and document content."""
|
|
# Truncate if too long
|
|
max_chars = 25000 # Leave room for schema prompt
|
|
if len(html_content) > max_chars:
|
|
html_content = html_content[:max_chars] + "\n... [truncated]"
|
|
|
|
return f"""{schema_prompt}
|
|
|
|
---
|
|
|
|
## Document to Analyze
|
|
|
|
Extract all information following the schema above from this HTML document:
|
|
|
|
```html
|
|
{html_content}
|
|
```
|
|
|
|
## Instructions
|
|
|
|
1. Extract ALL entities matching the specified hypernyms
|
|
2. Fill in ALL structured fields from the schema
|
|
3. Include XPath locations for provenance
|
|
4. Use confidence scores appropriately
|
|
5. Return ONLY a valid JSON object matching the output format
|
|
|
|
IMPORTANT: The "structured" field in your response must contain the extracted field values.
|
|
"""
|
|
|
|
def _validate_structured_output(
|
|
self,
|
|
data: Dict[str, Any],
|
|
schema: GLAMSchema,
|
|
) -> List[str]:
|
|
"""
|
|
Validate structured output against schema.
|
|
|
|
Args:
|
|
data: Extracted structured data
|
|
schema: GLAMSchema used for extraction
|
|
|
|
Returns:
|
|
List of validation error messages (empty if valid)
|
|
"""
|
|
errors = []
|
|
|
|
# Check required fields
|
|
for field in schema.fields:
|
|
if field.required and field.name not in data:
|
|
errors.append(f"Missing required field: {field.name}")
|
|
|
|
# Validate field types and choices
|
|
for field in schema.fields:
|
|
if field.name not in data:
|
|
continue
|
|
|
|
value = data[field.name]
|
|
|
|
# Check choices
|
|
if field.choices and value:
|
|
if field.dtype == "list":
|
|
invalid_values = [v for v in value if v not in field.choices]
|
|
if invalid_values:
|
|
errors.append(
|
|
f"Invalid values for {field.name}: {invalid_values}. "
|
|
f"Valid: {field.choices}"
|
|
)
|
|
elif value not in field.choices:
|
|
errors.append(
|
|
f"Invalid value for {field.name}: {value}. "
|
|
f"Valid: {field.choices}"
|
|
)
|
|
|
|
# Check patterns
|
|
if field.pattern and value and isinstance(value, str):
|
|
import re
|
|
if not re.match(field.pattern, value):
|
|
errors.append(
|
|
f"Field {field.name} does not match pattern {field.pattern}: {value}"
|
|
)
|
|
|
|
return errors
|
|
|
|
async def extract_structured(
|
|
self,
|
|
document: Union[HTMLDocument, str, Path],
|
|
fields: List[str],
|
|
source_url: Optional[str] = None,
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Quick structured extraction using GLiNER2-style field specs.
|
|
|
|
This is a convenience method for simple extractions without
|
|
full annotation session overhead.
|
|
|
|
Args:
|
|
document: Document to extract from
|
|
fields: List of GLiNER2-style field specs
|
|
e.g., ["name::str::Institution name",
|
|
"type::[MUSEUM|ARCHIVE]::str::Type"]
|
|
source_url: Optional source URL
|
|
|
|
Returns:
|
|
Dict of extracted field values
|
|
|
|
Example:
|
|
>>> data = await annotator.extract_structured(
|
|
... html_doc,
|
|
... ["name::str::Full name",
|
|
... "email::str::Contact email",
|
|
... "type::[MUSEUM|ARCHIVE|LIBRARY]::str::Institution type"]
|
|
... )
|
|
>>> print(data["name"])
|
|
"""
|
|
# Build schema from field specs
|
|
schema = GLAMSchema("quick_extraction").structure()
|
|
|
|
for field_spec in fields:
|
|
parsed = FieldSpec.from_gliner2_syntax(field_spec)
|
|
schema.fields.append(parsed)
|
|
|
|
schema = schema.build()
|
|
|
|
# Run extraction
|
|
_, structured_data = await self.annotate_with_schema(
|
|
document,
|
|
schema=schema,
|
|
source_url=source_url,
|
|
validate_output=False, # Skip validation for quick extraction
|
|
)
|
|
|
|
return structured_data
|
|
|
|
|
|
# =============================================================================
|
|
# CONVENIENCE FUNCTIONS
|
|
# =============================================================================
|
|
|
|
def create_llm_annotator(
    provider: str = "zai",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    enable_fallback: bool = True,
    max_retries: int = 5,
) -> LLMAnnotator:
    """
    Build an LLMAnnotator configured for one primary provider.

    Args:
        provider: "zai", "anthropic", or "openai"
        model: Optional model name (uses provider default if not specified)
        api_key: Optional API key (uses environment variable if not specified)
        enable_fallback: Enable automatic fallback to other providers on failure
        max_retries: Maximum retry attempts per provider

    Returns:
        Configured LLMAnnotator instance
    """
    primary = LLMProvider(provider)  # raises ValueError for unknown providers

    default_models = {
        LLMProvider.ZAI: "glm-4.6",
        LLMProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
        LLMProvider.OPENAI: "gpt-4o",
    }

    # Fallback chain keeps the fixed Z.AI -> Anthropic -> OpenAI order,
    # minus whichever provider is primary.
    fallbacks: List[LLMProvider] = []
    if enable_fallback:
        for candidate in (LLMProvider.ZAI, LLMProvider.ANTHROPIC, LLMProvider.OPENAI):
            if candidate is not primary:
                fallbacks.append(candidate)

    return LLMAnnotator(
        LLMAnnotatorConfig(
            provider=primary,
            model=model or default_models[primary],
            api_key=api_key,
            retry=RetryConfig(max_retries=max_retries),
            fallback_providers=fallbacks,
        )
    )
|
|
|
|
|
|
async def annotate_html_file(
    file_path: Union[str, Path],
    provider: str = "zai",
    model: Optional[str] = None,
) -> AnnotationSession:
    """
    Annotate an HTML file using LLM.

    Convenience wrapper that constructs a one-off annotator and runs a
    single annotation pass over the file.

    Args:
        file_path: Path to HTML file
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        AnnotationSession with extracted claims
    """
    return await create_llm_annotator(provider=provider, model=model).annotate(file_path)
|
|
|
|
|
|
async def annotate_with_schema(
    file_path: Union[str, Path],
    schema: Optional[GLAMSchema] = None,
    provider: str = "zai",
    model: Optional[str] = None,
) -> Tuple[AnnotationSession, Dict[str, Any]]:
    """
    Annotate an HTML file using schema-driven extraction.

    Convenience wrapper: builds a one-off annotator and delegates to its
    schema-driven annotation method.

    Args:
        file_path: Path to HTML file
        schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        Tuple of (AnnotationSession, structured_data dict)

    Example:
        >>> schema = (
        ...     GLAMSchema("museum")
        ...     .entities("GRP", "TOP")
        ...     .structure()
        ...     .field("name::str::Museum name")
        ...     .field("city::str::City location")
        ...     .build()
        ... )
        >>> session, data = await annotate_with_schema("museum.html", schema)
        >>> print(data["name"])
    """
    worker = create_llm_annotator(provider=provider, model=model)
    return await worker.annotate_with_schema(file_path, schema=schema)
|
|
|
|
|
|
# Public API of this module; names outside this list are internal.
__all__ = [
    "LLMProvider",
    "LLMAnnotatorConfig",
    "RetryConfig",
    "LLMAnnotator",
    "GLAM_NER_SYSTEM_PROMPT",
    "create_llm_annotator",
    "annotate_html_file",
    "annotate_with_schema",
]
|