""" LLM-Based Agentic Annotator for GLAM Documents. This module provides LLM-only entity annotation following GLAM-NER v1.7.0. NO HEURISTIC/PATTERN-BASED METHODS - all recognition is done via LLM inference. Supported LLM Providers: - Z.AI (Zhipu AI) GLM-4 (default) - Anthropic Claude - OpenAI GPT-4 Based on GLAM-NER v1.7.0-unified Entity Annotation Convention. Features: - Exponential backoff retry for rate limits (429) - Automatic provider fallback (Z.AI → Claude → OpenAI) - Configurable retry attempts and delays """ import asyncio import json import logging import os import random from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union from enum import Enum # Configure logging logger = logging.getLogger(__name__) # Import base classes from .base import ( AnnotationSession, EntityClaim, LayoutClaim, AggregateClaim, ImageClaim, Provenance, EntityHypernym, LayoutRegion, SemanticRole, RelationshipClaim, RelationshipSubject, RelationshipObject, RelationshipPredicate, RelationshipHypernym, TemporalScope, SpatialScope, RelationshipQualifier, get_ontology_class, validate_relationship_constraints, ) from .html_parser import HTMLDocument from .schema_builder import GLAMSchema, FieldSpec, heritage_custodian_schema class LLMProvider(str, Enum): """Supported LLM providers.""" ZAI = "zai" # Zhipu AI GLM-4 ANTHROPIC = "anthropic" # Claude OPENAI = "openai" # GPT-4 @dataclass class RetryConfig: """Configuration for retry logic with exponential backoff.""" max_retries: int = 5 # Maximum retry attempts base_delay: float = 1.0 # Initial delay in seconds max_delay: float = 60.0 # Maximum delay cap exponential_base: float = 2.0 # Exponential backoff base jitter: bool = True # Add random jitter to prevent thundering herd retry_on_status: tuple = (429, 500, 502, 503, 504) # HTTP status codes to retry @dataclass class LLMAnnotatorConfig: """Configuration for LLM-based 
annotation.""" provider: LLMProvider = LLMProvider.ZAI model: str = "glm-4.6" # Z.AI's latest model api_key: Optional[str] = None temperature: float = 0.1 # Low temp for consistent extraction max_tokens: int = 4096 timeout: int = 120 # Longer timeout for LLM calls # Annotation settings extract_entities: bool = True extract_layout: bool = True extract_claims: bool = True extract_images: bool = True # NEW: Enable image analysis via vision model # Vision model settings (for image analysis) vision_model: str = "glm-4.5v" # Z.AI's vision model vision_max_tokens: int = 2048 # Max tokens for vision response max_images_per_page: int = 10 # Limit images analyzed per page min_image_size: int = 50 # Minimum dimension (width or height) to analyze # Provenance settings context_convention: str = "GLAM-NER v1.7.0-unified" # Retry settings retry: RetryConfig = field(default_factory=RetryConfig) # Fallback providers (tried in order when primary fails) fallback_providers: Optional[List[LLMProvider]] = None def __post_init__(self): """Load API key from environment if not provided.""" if self.api_key is None: if self.provider == LLMProvider.ZAI: self.api_key = os.environ.get("ZAI_API_TOKEN") elif self.provider == LLMProvider.ANTHROPIC: self.api_key = os.environ.get("ANTHROPIC_API_KEY") elif self.provider == LLMProvider.OPENAI: self.api_key = os.environ.get("OPENAI_API_KEY") # Default fallback chain if not specified if self.fallback_providers is None: self.fallback_providers = [ p for p in [LLMProvider.ZAI, LLMProvider.ANTHROPIC, LLMProvider.OPENAI] if p != self.provider ] # ============================================================================= # GLAM-NER v1.7.0 SYSTEM PROMPT # ============================================================================= GLAM_NER_SYSTEM_PROMPT = """You are an expert entity annotator following the GLAM-NER v1.7.0-unified Entity Annotation Convention. 
Your task is to extract structured claims from heritage institution documents with full provenance. ## HYPERNYMS AND HYPONYMS (10 types with subcategories) ### 1. AGT (Agent): Humans, animals, AI, fictional characters **Subcategories:** - AGT.PER: Person - INDIVIDUAL human beings with SPECIFIC NAMES (maps to crm:E21_Person) ⚠️ STRICT EXCLUSIONS - Do NOT tag as AGT.PER: • Groups/collectives: "staff", "members", "curators", "colleagues", "board", "team", "committee", "participants", "community" • Plural person references: "archivists", "researchers", "visitors", "filmmakers", "historians" • Role descriptions without names: "the curator", "a researcher", "museum director" • Organizations/events with role words: "FIAF Commission members", "conference colleagues", "board members" • Networks: "VPRO/Tegenlicht network", "ACE member institutions" • Topic references: "Verhalen van Bolsward" (stories about something) • Conference/event participants: "Women and Silent Screen Conference participants" • Fund/foundation board: "Prince Claus Fund board members" • Festival communities: "Le Giornate del Cinema Muto community" • Generic collectives: "community", "network", "consortium", "association" ✓ ONLY tag as AGT.PER: Named individuals like "Giovanna Fossati", "Dr. Jan van der Berg", "Martin Scorsese" - AGT.STF: Staff - personnel in professional roles (maps to pico:PersonObservation) - AGT.COL: Collective - named collectives without formal structure - AGT.FIC: Fictional - characters from fiction/mythology - AGT.MYT: Mythological - gods, deities, legendary figures - AGT.ANI: Animal - named individual animals with agency - AGT.ART: Artificial - AI systems, robots, software agents (maps to prov:SoftwareAgent) Examples: "Dr. Jan van der Berg" → AGT.PER, "Giovanna Fossati" → AGT.PER, "the museum director" → AGT.STF ❌ NOT AGT.PER: "AMIA conference colleagues", "Prince Claus Fund board members", "festival community" ### 2. 
GRP (Group): Organizations, collectives, formal and informal **Subcategories:** - GRP.HER: Heritage institutions - museums, archives, libraries (maps to glam:HeritageCustodian) - GRP.PAR: Parent/governing bodies (maps to rico:CorporateBody) - GRP.UNT: Organizational units/departments (maps to org:OrganizationalUnit) - GRP.COR: Corporations and businesses (maps to schema:Corporation) - GRP.GOV: Government agencies (maps to schema:GovernmentOrganization) - GRP.EDU: Educational institutions (maps to schema:EducationalOrganization) - GRP.REL: Religious organizations (maps to schema:ReligiousOrganization) - GRP.ASS: Associations and societies (maps to org:FormalOrganization) - GRP.INF: Informal groups - movements, families, dynasties - GRP.HIS: Historical organizations - defunct entities - GRP.ETH: Ethnic groups - Jews, Roma, Sinti, indigenous peoples (maps to crm:E74_Group) Examples: "Rijksmuseum" → GRP.HER, "Ministry of Culture" → GRP.GOV, "Joden" → GRP.ETH ### 3. TOP (Toponym): Place names, nominal geographic references **Subcategories:** - TOP.SET: Settlement - cities, towns, villages (maps to schema:City) - TOP.REG: Region - provinces, states, counties (maps to schema:AdministrativeArea) - TOP.CTY: Country - nations, sovereign states (maps to schema:Country) - TOP.ADR: Address - street addresses (maps to schema:PostalAddress) - TOP.IAD: Institutional address - TOP.BLD: Building - named buildings, monuments (maps to crm:E18_Physical_Thing) - TOP.NAT: Natural features - mountains, rivers - TOP.HIS: Historical places - concentration camps, transit camps, former territories (maps to crm:E53_Place) - TOP.LEG: Legendary/fictional places Examples: "Amsterdam" → TOP.SET, "the Netherlands" → TOP.CTY, "Auschwitz" → TOP.HIS ### 4. 
GEO (Geometry): Coordinates, shapes, spatial data **Subcategories:** - GEO.PNT: Point coordinates (maps to geo:Point) - GEO.LIN: Line/path (maps to geo:LineString) - GEO.POL: Polygon/area (maps to geo:Polygon) - GEO.BOX: Bounding box (maps to geo:Envelope) Examples: "52.3676° N, 4.9041° E" → GEO.PNT ### 5. TMP (Temporal): Dates, times, durations, periods **Subcategories:** - TMP.DAT: Absolute date - specific point (maps to time:Instant) [alias: TMP.DAB] - TMP.DAB: Date Absolute - specific date "1885-03-22" (maps to time:Instant) - TMP.DRL: Date Relative - "last year", "recently", "two weeks ago" (maps to time:Instant) - TMP.TIM: Time of day (maps to time:Instant) [alias: TMP.TAB] - TMP.TAB: Time Absolute - specific time "14:30:00" (maps to time:Instant) - TMP.TRL: Time Relative - "later that evening", "soon after" (maps to time:Instant) - TMP.DUR: Duration/period - "three hours", "from 1885 to 1890" (maps to time:Duration) - TMP.RNG: Date range - "1885-1890", "March 1-15" (maps to time:Interval) - TMP.SET: Recurring time - "every Monday", "annually" - TMP.OPH: Opening hours - "Tuesday-Sunday 10:00-17:00" (maps to schema:OpeningHoursSpecification) - TMP.REL: Relative time - "before", "after" [deprecated, use TMP.DRL/TMP.TRL] - TMP.CEN: Century - "17th century", "the 1800s" (maps to crm:E4_Period) - TMP.ERA: Historical era/period name - "Renaissance", "Bronze Age" (maps to crm:E4_Period) - TMP.EXP: Exhibition period - "10 February - 4 June 2023" (maps to time:Interval) Examples: "1885" → TMP.DAB, "18th century" → TMP.CEN, "every Tuesday" → TMP.SET, "10:00-17:00" → TMP.OPH ### 6. 
APP (Appellation): Identifiers, codes, reference numbers **Subcategories:** - APP.ISL: ISIL code (maps to crm:E42_Identifier) - APP.WKD: Wikidata ID (maps to crm:E42_Identifier) - APP.VIF: VIAF ID (maps to crm:E42_Identifier) - APP.DOI: DOI - APP.URL: URL/URI (maps to schema:URL) - APP.ISBN: ISBN - APP.ISSN: ISSN - APP.KVK: Dutch Chamber of Commerce number - APP.TTL: Title of work (maps to crm:E35_Title) [alias: APP.TIT] - APP.TIT: Title of work (maps to crm:E35_Title) - APP.NAM: Personal name - structured (maps to pnv:PersonName) [alias: APP.PNM] - APP.PNM: Personal name - structured (maps to pnv:PersonName) - APP.AWD: Award name - APP.COL: Collection name - APP.EXH: Exhibition name/title (maps to crm:E35_Title) Examples: "ISIL NL-AmRM" → APP.ISL, "Q190804" → APP.WKD, "Rembrandt and His Era" → APP.EXH ### 7. ROL (Role): Titles, positions, honorifics, occupations **Subcategories:** - ROL.OCC: Occupation - profession, trade, job title (maps to schema:Occupation) - ROL.TTL: Title/honorific - "Dr.", "Prof.", academic/professional titles (maps to schema:Role) - ROL.HON: Honorific - "Sir", "Dame", "The Honorable" (maps to schema:honorificPrefix) - ROL.NOB: Nobility title - "Duke", "Baron", "Count", hereditary titles (maps to schema:honorificSuffix) - ROL.POS: Position/office - "Director", "Chairman" (maps to org:Post) - ROL.REL: Relational role - father, mother, kinship (maps to bio:Relationship) - ROL.REL.REL: Religious role - "Bishop", "Rabbi", "Imam" (maps to schema:Role) Examples: "Director" → ROL.POS, "Prof. Dr." → ROL.TTL, "Duke of Wellington" → ROL.NOB, "Rabbi" → ROL.REL.REL ### 8. 
WRK (Work): Works following FRBR model **Subcategories:** - WRK.WRK: FRBR Work - abstract (maps to frbroo:F1_Work) [alias: WRK.ABS] - WRK.ABS: Abstract work (maps to frbroo:F1_Work) - WRK.EXP: FRBR Expression (maps to frbroo:F2_Expression) - WRK.MAN: FRBR Manifestation (maps to frbroo:F3_Manifestation) - WRK.ITM: FRBR Item (maps to frbroo:F5_Item) - WRK.MSS: Manuscript - handwritten/unpublished work (maps to rico:Record) - WRK.ARC: Archival record/document (maps to rico:Record) - WRK.TXT: Textual work (maps to schema:Book) - WRK.VIS: Visual work (maps to schema:VisualArtwork) - WRK.MUS: Musical work (maps to schema:MusicComposition) - WRK.PER: Performance (maps to schema:PerformingArtsEvent) - WRK.CIN: Cinematic work (maps to schema:Movie) - WRK.OBJ: Physical object/artifact (maps to crm:E22_Human-Made_Object) - WRK.COL: Collection (maps to crm:E78_Curated_Holding) - WRK.SER: Series (maps to schema:CreativeWorkSeries) - WRK.WEB: Web resource/page (maps to schema:WebPage) - WRK.URL: URL reference to work/link (maps to schema:URL) - WRK.EML: Email message (maps to schema:Message) - WRK.SOC: Social media post/content (maps to schema:SocialMediaPosting) - WRK.CIT: Citation/bibliographic reference (maps to schema:Citation) Examples: "The Night Watch" → WRK.VIS, "Annual Report 2023" → WRK.TXT, "15th-century codex" → WRK.MSS ### 9. QTY (Quantity): Measurements, counts, numeric values **Subcategories:** - QTY.CNT: Count (maps to crm:E54_Dimension) - QTY.MSR: Measurement (maps to crm:E54_Dimension) - QTY.PCT: Percentage - QTY.CUR: Currency/monetary (maps to schema:MonetaryAmount) - QTY.ORD: Ordinal (maps to crm:E60_Number) - QTY.RNG: Range Examples: "over 8,000 artworks" → QTY.CNT, "€2.5 million" → QTY.CUR ### 10. 
THG (Thing): Physical objects, artifacts, concepts, events **Subcategories:** - THG.ART: Artwork (maps to crm:E22_Human-Made_Object) - THG.AFT: Artifact - human-made object of historical significance (maps to crm:E22_Human-Made_Object) - THG.SPC: Specimen - natural history specimen, scientific sample (maps to crm:E20_Biological_Object) - THG.DOC: Document (maps to foaf:Document) - THG.PHO: Photograph (maps to schema:Photograph) - THG.OBJ: Physical object - generic (maps to crm:E19_Physical_Object) - THG.EVT: Historical event - deportation, persecution, liberation, war (maps to crm:E5_Event) - THG.CON: Concept/abstract thing - stories, memories, heritage, mission (maps to crm:E28_Conceptual_Object) - THG.TAX: Taxonomic term - species (maps to crm:E55_Type) - THG.LNG: Language (maps to crm:E56_Language) - THG.MAT: Material - bronze, marble, paper, etc. (maps to crm:E57_Material) Examples: "17th-century painting" → THG.ART, "deportation" → THG.EVT, "the stories" → THG.CON, "Dutch" → THG.LNG ## RELATIONSHIP TYPES AND CONSTRAINTS Relationships connect two entities. Each relationship has domain (subject) and range (object) constraints. **⚠️ CRITICAL: COMPREHENSIVE SEMANTIC TRIPLE EXTRACTION ⚠️** You MUST extract ALL semantic relationships from narrative text, not just named entity relationships. Decompose every sentence into its constituent semantic triples (subject-predicate-object). Example text: "In het Herinneringscentrum Kamp Westerbork vertellen we de verhalen van meer dan honderdduizend Joden en Sinti en Roma die vanuit Nederland naar vernietigings- en concentratiekampen werden gedeporteerd" This SINGLE sentence contains these triples: 1. REL.ORG.ACT: Herinneringscentrum Kamp Westerbork → performs activity → tell stories 2. REL.SUB.ABT: the stories → are about → Joden (Jews) 3. REL.SUB.ABT: the stories → are about → Sinti and Roma 4. REL.QTY.CNT: Jews/Sinti/Roma → quantity → more than 100,000 5. REL.SPA.ORG: deportees → originated from → Nederland 6. 
REL.SPA.DST: deportees → destination → concentration camps 7. REL.SPA.DST: deportees → destination → extermination camps 8. REL.EVT.PAR: Jews/Sinti/Roma → participated in → deportation (forced) ### REL.CRE (Creation) - Agent creates Work | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.CRE.AUT | AGT.PER, AGT.GRP | WRK.TXT | "Martin Luther authored 95 Theses" | | REL.CRE.ART | AGT.PER | WRK.VIS, THG.ART | "Rembrandt painted The Night Watch" | | REL.CRE.COM | AGT.PER | WRK.MUS | "Beethoven composed Symphony No. 9" | | REL.CRE.PHO | AGT.PER | THG.PHO | "Photographer captured portrait" | | REL.CRE.DES | AGT.PER, AGT.GRP | WRK.OBJ | "Architect designed building" | ### REL.SPA (Spatial) - Located in / Contains / Origin / Destination | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.SPA.LOC | AGT, EVT, GRP, WRK | TOP | "Museum located in Amsterdam" | | REL.SPA.WTH | TOP | TOP | "Amsterdam within North Holland" | | REL.SPA.CON | TOP | TOP | "Netherlands contains Amsterdam" | | REL.SPA.ORG | AGT.PER, WRK, GRP.ETH | TOP | "Jews came from Netherlands" | | REL.SPA.DST | AGT, EVT, GRP | TOP | "Deported to concentration camps" | ### REL.SOC (Social) - Person-to-person relations | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.SOC.FAM.SPO | AGT.PER | AGT.PER | "Martin Luther married Katharina von Bora" | | REL.SOC.FAM.PAR | AGT.PER | AGT.PER | "Parent of child" | | REL.SOC.PRO.STU | AGT.PER | AGT.PER | "Student studied under master" | | REL.SOC.MEM | AGT.PER, GRP | GRP | "Person/org member of organization" | | REL.SOC.EMP | AGT.PER | GRP | "Employee works for company" | ### REL.ORG (Organizational) - Group activities and relations | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.ORG.PAR | GRP | 
GRP | "Parent organization" | | REL.ORG.SUB | GRP | GRP | "Subsidiary organization" | | REL.ORG.SUC | GRP | GRP | "Successor organization" | | REL.ORG.FND | AGT.PER, GRP | GRP | "Founder established organization" | | REL.ORG.ACT | GRP.HER, GRP | THG.CON, WRK | "Museum tells stories" / "Archive preserves documents" | | REL.ORG.MIS | GRP.HER, GRP | THG.CON | "Organization's mission is..." | | REL.ORG.SRV | GRP.HER, GRP | GRP, AGT | "Museum serves researchers" | ### REL.CUS (Custodial) - Ownership/Keeping | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.CUS.KEP | WRK, THG | GRP.HER | "Artwork kept by Rijksmuseum" | | REL.CUS.OWN | WRK, THG | AGT.PER, GRP | "Collector owns painting" | | REL.CUS.COL | WRK, THG | WRK.COL | "Item in collection" | | REL.CUS.DNT | WRK, THG | AGT.PER | "Donated by benefactor" | ### REL.WRK (Work/FRBR) - Work relations | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.WRK.EXP | WRK.EXP | WRK.WRK | "Expression of work" | | REL.WRK.PRT | WRK | WRK | "Part of larger work" | | REL.WRK.SER | WRK | WRK.SER | "Volume in series" | | REL.WRK.TRN | WRK.EXP | WRK.WRK | "Translation of work" | ### REL.SUB (Subject/About) - Topics and content | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.SUB.ABT | WRK, GRP.HER, THG | AGT, GRP, EVT, TOP, THG | "Stories about Jews and Roma" | | REL.SUB.DEP | WRK.VIS, THG.PHO | AGT, TOP, EVT | "Photo depicts memorial" | | REL.SUB.THM | GRP.HER, WRK.COL | THG.CON | "Collection themes: WWII, Holocaust" | ### REL.EVT (Event) - Participation and historical events | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.EVT.PAR | AGT, GRP, GRP.ETH | EVT, THG.EVT | "Jews participated in deportation" | | REL.EVT.ORG | AGT, GRP | EVT | "Nazis organized 
deportations" | | REL.EVT.LOC | EVT | TOP | "Deportations from Netherlands" | | REL.EVT.VIC | AGT, GRP, GRP.ETH | EVT | "Jews were victims of persecution" | | REL.EVT.TIM | EVT | TMP | "Deportations in 1942-1944" | ### REL.QTY (Quantity) - Numeric relations | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.QTY.CNT | GRP, GRP.ETH, WRK.COL | QTY.CNT | "More than 100,000 people" | | REL.QTY.MSR | THG, TOP | QTY.MSR | "Building is 500 sqm" | | REL.QTY.YRS | GRP.HER, AGT | QTY.CNT, TMP | "Museum operating for 50 years" | ### REL.ROL (Role) - Occupation/Position | Hyponym | Domain (Subject) | Range (Object) | Example | |---------|------------------|----------------|---------| | REL.ROL.OCC | AGT.PER | ROL.OCC | "Person has occupation" | | REL.ROL.HLD | AGT.PER | ROL.POS | "Person holds position" | **IMPORTANT: Always include entity_type in relationship subject/object for validation!** ### ENTITY TYPES FOR RELATIONSHIP EXTRACTION When extracting relationships, use these additional entity types: - **GRP.ETH**: Ethnic groups (Joden, Sinti, Roma, etc.) 
- **THG.CON**: Abstract concepts (stories, memories, heritage, mission) - **THG.EVT**: Historical events (deportation, persecution, liberation) - **TOP.HIS**: Historical places (concentration camps, transit camps) ## LAYOUT REGIONS (DOC hypernym) Primary: HDR (heading), PAR (paragraph), SEN (sentence), LST (list), TBL (table) Media: GAL (gallery), MAP (map), AUD (audio), VID (video), EMB (embedded) Navigation: NAV (navigation), TOC (table of contents), IDX (index) Front/Back: TTP (title page), DED (dedication), COL (colophon), BIB (bibliography), APP (appendix), GLO (glossary) Commercial: ADV (advertisement), LOG (logo) ## OUTPUT FORMAT Return a JSON object with this structure: ```json { "entities": [ { "hypernym": "GRP", "hyponym": "GRP.HER", "text": "Rijksmuseum", "xpath": "/html/body/div[1]/h1", "confidence": 0.95, "class_uri": "glam:HeritageCustodian", "notes": "Main heritage institution name" } ], "layout_regions": [ { "region": "HDR", "level": 1, "semantic_role": "PRIM", "xpath": "/html/body/div[1]/h1", "text_preview": "Rijksmuseum Amsterdam", "contains_entities": ["GRP.HER:Rijksmuseum", "TOP.SET:Amsterdam"] } ], "claims": [ { "claim_type": "full_name", "claim_value": "Rijksmuseum Amsterdam", "xpath": "/html/body/div[1]/h1", "confidence": 0.95, "source_entities": ["GRP.HER:Rijksmuseum"] } ], "relationships": [ { "relationship_type": "REL.SPA.LOC", "subject": {"entity_type": "GRP.HER", "text": "Rijksmuseum"}, "object": {"entity_type": "TOP.SET", "text": "Amsterdam"}, "predicate_uri": "schema:location", "confidence": 0.90 } ] } ``` ## RULES 1. Every claim MUST have an XPath location in the source document 2. Use HYPONYM codes (e.g., GRP.HER, AGT.PER) not just hypernyms (e.g., GRP, AGT) 3. Include class_uri ontology mapping for each entity 4. Confidence scores: 0.9-1.0 (explicit), 0.7-0.9 (clear), 0.5-0.7 (inferred) 5. Entities within layout regions should be cross-referenced 6. Claims without XPath provenance are FABRICATED and must not be included 7. 
Extract relationships between entities (especially REL.SPA.LOC, REL.ORG.*, REL.CRE.*) ## ⚠️ CRITICAL: COMPREHENSIVE SEMANTIC EXTRACTION ⚠️ 8. **DECOMPOSE EVERY NARRATIVE SENTENCE INTO TRIPLES** - A single sentence often contains 5-10 semantic relationships 9. **Extract ALL entities** - not just named entities, but also: - Quantities (QTY.CNT: "more than 100,000") - Ethnic groups (GRP.ETH: "Jews", "Sinti", "Roma") - Abstract concepts (THG.CON: "stories", "memories", "heritage") - Historical events (THG.EVT: "deportation", "persecution") - Historical places (TOP.HIS: "concentration camps", "transit camps") 10. **Extract organizational activities** (REL.ORG.ACT): What does the institution DO? (preserve, tell, exhibit, research) 11. **Extract subject matter** (REL.SUB.ABT): What is the institution/collection ABOUT? 12. **Extract quantities** (REL.QTY.CNT): Numbers of visitors, items, people affected 13. **Extract spatial origins and destinations** (REL.SPA.ORG, REL.SPA.DST): Where did things/people come FROM and go TO? 14. **Extract event participation** (REL.EVT.PAR, REL.EVT.VIC): Who was involved in historical events? 
### Example: Deep Semantic Parsing Text: "Het museum bewaart meer dan 5000 voorwerpen uit de Tweede Wereldoorlog" **INCORRECT** (shallow extraction): - 1 entity: "Het museum" (GRP.HER) - 0 relationships **CORRECT** (deep semantic extraction): - Entities: - "Het museum" (GRP.HER) - "meer dan 5000" (QTY.CNT) - "voorwerpen" (THG.AFT - artifacts) - "Tweede Wereldoorlog" (TMP.ERA) - Relationships: - REL.ORG.ACT: museum → performs → preservation (bewaart) - REL.CUS.KEP: voorwerpen → kept by → museum - REL.QTY.CNT: voorwerpen → quantity → meer dan 5000 - REL.TMP.DUR: voorwerpen → from period → Tweede Wereldoorlog ## CLAIM TYPES FOR HERITAGE INSTITUTIONS - full_name: Official institution name - short_name: Abbreviated name or acronym - description: Institution description - email: Contact email - phone: Contact phone - address: Physical address - website: Official website URL - social_media: Social media links (facebook, twitter, instagram, linkedin, youtube) - opening_hours: Visitor hours - admission_info: Ticket/entry information - founding_date: When institution was established - collection_count: Number of items in collection - kvk_number: Dutch Chamber of Commerce number - isil_code: International Standard Identifier for Libraries - wikidata_id: Wikidata Q-number - parent_organization: Parent/umbrella organization """ class LLMAnnotator: """ LLM-based document annotator. Uses LLM inference for all entity recognition and claim extraction. NO heuristic or pattern-based methods. Example: >>> config = LLMAnnotatorConfig(provider=LLMProvider.ZAI, model="glm-4") >>> annotator = LLMAnnotator(config) >>> session = await annotator.annotate(document) >>> print(f"Found {len(session.entity_claims)} entities") """ def __init__(self, config: Optional[LLMAnnotatorConfig] = None): """ Initialize LLM annotator. 
Args: config: LLM configuration (defaults to Z.AI GLM-4) """ self.config = config or LLMAnnotatorConfig() self._client = None if not self.config.api_key: raise ValueError( f"API key not found for {self.config.provider.value}. " f"Set environment variable or pass api_key in config." ) async def annotate( self, document: Union[HTMLDocument, str, Path], source_url: Optional[str] = None, image_dir: Optional[Path] = None, ) -> AnnotationSession: """ Annotate a document using LLM inference. Args: document: HTMLDocument, HTML string, or path to HTML file source_url: Optional source URL for provenance image_dir: Optional directory containing downloaded images for vision analysis Returns: AnnotationSession with extracted claims """ # Load document if needed html_content: str source_file: Optional[str] = None if isinstance(document, Path): with open(document, 'r', encoding='utf-8') as f: html_content = f.read() source_url = source_url or str(document) source_file = str(document) # Auto-detect image directory if not provided if image_dir is None: image_dir = document.parent elif isinstance(document, str): # Check if it's a file path (short string, no HTML tags) is_file_path = len(document) < 500 and not document.strip().startswith('<') if is_file_path: try: path = Path(document) if path.exists(): with open(path, 'r', encoding='utf-8') as f: html_content = f.read() source_url = source_url or document source_file = document if image_dir is None: image_dir = path.parent else: html_content = document except OSError: # Path too long or invalid html_content = document else: html_content = document elif isinstance(document, HTMLDocument): html_content = document.raw_html source_url = source_url or document.source_url source_file = document.source_file else: raise TypeError(f"Unsupported document type: {type(document)}") # Create session session = AnnotationSession( session_id=f"llm-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", source_url=source_url or "unknown", 
source_file=source_file, ) # Prepare prompt user_prompt = self._prepare_prompt(html_content) # Call LLM for text annotation try: response = await self._call_llm(user_prompt) # Parse response annotations = self._parse_response(response) # Convert to claims self._populate_session(session, annotations, source_url) except Exception as e: session.errors.append(f"LLM annotation failed: {e}") # Image analysis (if enabled) if self.config.extract_images: try: image_claims = await self.analyze_images_in_html( html_content=html_content, base_url=source_url, image_dir=image_dir, ) for claim in image_claims: session.add_image_claim(claim) if image_claims: logger.info(f"Analyzed {len(image_claims)} images from document") except Exception as e: session.errors.append(f"Image analysis failed: {e}") logger.warning(f"Image analysis failed: {e}") session.completed_at = datetime.now(timezone.utc).isoformat() return session def _prepare_prompt(self, html_content: str) -> str: """Prepare the user prompt with document content.""" # Truncate if too long (LLM context limits) max_chars = 30000 if len(html_content) > max_chars: html_content = html_content[:max_chars] + "\n... [truncated]" return f"""Analyze the following HTML document and extract all entities, layout regions, claims, and relationships. Return a JSON object following the schema in the system prompt. HTML DOCUMENT: ```html {html_content} ``` ## ⚠️ CRITICAL EXTRACTION REQUIREMENTS ⚠️ ### 1. COMPREHENSIVE ENTITY EXTRACTION Extract ALL entities, not just named entities: - Heritage institutions (GRP.HER) - Ethnic groups (GRP.ETH): Jews, Roma, Sinti, etc. - Quantities (QTY.CNT): "more than 100,000", "5000 objects" - Historical events (THG.EVT): deportation, persecution, liberation - Abstract concepts (THG.CON): stories, memories, heritage, mission - Historical places (TOP.HIS): concentration camps, transit camps - Time periods (TMP.ERA): World War II, Holocaust ### 2. 
COMPREHENSIVE RELATIONSHIP EXTRACTION Decompose EVERY narrative sentence into semantic triples: - REL.ORG.ACT: What activities does the organization perform? (preserve, tell, exhibit, research, commemorate) - REL.SUB.ABT: What is the collection/institution/story ABOUT? - REL.QTY.CNT: Quantities of people, objects, visitors - REL.SPA.ORG: Where did people/things come FROM? - REL.SPA.DST: Where did people/things go TO? - REL.EVT.PAR: Who participated in events (voluntary or forced)? - REL.EVT.VIC: Who were victims of events? ### 3. EXAMPLE - WHAT WE EXPECT For text: "In het Herinneringscentrum vertellen we de verhalen van meer dan honderdduizend Joden" Extract: - **Entities**: - Herinneringscentrum (GRP.HER) - de verhalen (THG.CON - stories/narratives) - meer dan honderdduizend (QTY.CNT - >100,000) - Joden (GRP.ETH - Jews as ethnic group) - **Relationships**: - REL.ORG.ACT: Herinneringscentrum → tells → verhalen - REL.SUB.ABT: verhalen → about → Joden - REL.QTY.CNT: Joden → quantity → meer dan honderdduizend ### 4. DO NOT: - Skip abstract concepts or quantities - Extract only named entities - Ignore the semantic relationships within sentences - Produce shallow extractions with few relationships IMPORTANT: The richness of semantic extraction is critical. A single paragraph may contain 10-20 relationships. """ def _calculate_backoff_delay(self, attempt: int) -> float: """ Calculate delay for exponential backoff. 
Args: attempt: Current retry attempt number (0-indexed) Returns: Delay in seconds """ retry = self.config.retry delay = retry.base_delay * (retry.exponential_base ** attempt) delay = min(delay, retry.max_delay) # Add jitter to prevent thundering herd if retry.jitter: delay = delay * (0.5 + random.random()) return delay def _get_api_key_for_provider(self, provider: LLMProvider) -> Optional[str]: """Get API key for a specific provider from environment.""" env_vars = { LLMProvider.ZAI: "ZAI_API_TOKEN", LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY", LLMProvider.OPENAI: "OPENAI_API_KEY", } return os.environ.get(env_vars.get(provider, "")) async def _call_provider( self, provider: LLMProvider, user_prompt: str, api_key: Optional[str] = None, ) -> str: """ Call a specific LLM provider. Args: provider: Which provider to call user_prompt: The user prompt to send api_key: Optional API key override Returns: LLM response string """ # Use provided key or get from environment key = api_key or self._get_api_key_for_provider(provider) if not key: raise ValueError(f"No API key available for {provider.value}") if provider == LLMProvider.ZAI: return await self._call_zai(user_prompt, key) elif provider == LLMProvider.ANTHROPIC: return await self._call_anthropic(user_prompt, key) elif provider == LLMProvider.OPENAI: return await self._call_openai(user_prompt, key) else: raise ValueError(f"Unsupported provider: {provider}") async def _call_llm(self, user_prompt: str) -> str: """ Call the LLM API with retry logic and provider fallback. Implements: 1. Exponential backoff with jitter for rate limits 2. 
Automatic fallback to alternative providers on failure Returns: LLM response string Raises: Exception: If all retries and fallbacks are exhausted """ import httpx # Build provider chain: primary + fallbacks providers_to_try = [self.config.provider] if self.config.fallback_providers: providers_to_try.extend(self.config.fallback_providers) last_exception: Optional[Exception] = None for provider in providers_to_try: api_key = ( self.config.api_key if provider == self.config.provider else self._get_api_key_for_provider(provider) ) if not api_key: logger.info(f"Skipping {provider.value}: no API key available") continue logger.info(f"Trying provider: {provider.value}") for attempt in range(self.config.retry.max_retries): try: return await self._call_provider(provider, user_prompt, api_key) except httpx.HTTPStatusError as e: status_code = e.response.status_code if status_code in self.config.retry.retry_on_status: delay = self._calculate_backoff_delay(attempt) logger.warning( f"Provider {provider.value} returned {status_code} " f"(attempt {attempt + 1}/{self.config.retry.max_retries}). " f"Retrying in {delay:.2f}s..." ) await asyncio.sleep(delay) last_exception = e else: # Non-retryable error, try next provider logger.error( f"Provider {provider.value} returned non-retryable " f"status {status_code}: {e}" ) last_exception = e break except httpx.TimeoutException as e: delay = self._calculate_backoff_delay(attempt) logger.warning( f"Provider {provider.value} timed out " f"(attempt {attempt + 1}/{self.config.retry.max_retries}). " f"Retrying in {delay:.2f}s..." ) await asyncio.sleep(delay) last_exception = e except Exception as e: logger.error(f"Provider {provider.value} failed: {e}") last_exception = e break # All retries exhausted for this provider, try next logger.warning(f"Provider {provider.value} exhausted all retries") # All providers failed raise RuntimeError( f"All LLM providers failed. 
async def _call_zai(self, user_prompt: str, api_key: str) -> str:
    """
    Call Z.AI API using Anthropic-compatible endpoint.

    Z.AI GLM Coding Plan provides an Anthropic-compatible API at:
    https://api.z.ai/api/anthropic/v1/messages

    Uses same message format as Anthropic Claude API.
    """
    import httpx

    # Z.AI Anthropic-compatible endpoint
    url = "https://api.z.ai/api/anthropic/v1/messages"

    # Z.AI uses Anthropic-style headers
    headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }

    # Z.AI's Anthropic endpoint expects Claude model names, so any
    # GLM model id is mapped onto a Claude model name before sending.
    model = self.config.model
    if model.startswith("glm-"):
        model = "claude-sonnet-4-20250514"

    payload = {
        "model": model,
        "max_tokens": self.config.max_tokens,
        "system": GLAM_NER_SYSTEM_PROMPT,
        "messages": [
            {"role": "user", "content": user_prompt},
        ],
    }

    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        response = await client.post(url, headers=headers, json=payload)
        response.raise_for_status()
        body = response.json()
    # Anthropic response format
    return body["content"][0]["text"]

async def _call_anthropic(self, user_prompt: str, api_key: str) -> str:
    """Call Anthropic Claude API."""
    import httpx

    url = "https://api.anthropic.com/v1/messages"
    headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }

    # When Anthropic is only a fallback, the configured model name
    # belongs to another provider — substitute a Claude model.
    if self.config.provider == LLMProvider.ANTHROPIC:
        model = self.config.model
    else:
        model = "claude-3-5-sonnet-20241022"

    payload = {
        "model": model,
        "max_tokens": self.config.max_tokens,
        "system": GLAM_NER_SYSTEM_PROMPT,
        "messages": [
            {"role": "user", "content": user_prompt},
        ],
    }

    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        response = await client.post(url, headers=headers, json=payload)
        response.raise_for_status()
        body = response.json()
    return body["content"][0]["text"]

async def _call_openai(self, user_prompt: str, api_key: str) -> str:
    """Call OpenAI GPT-4 API."""
    import httpx

    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # When OpenAI is only a fallback, substitute an OpenAI model name.
    if self.config.provider == LLMProvider.OPENAI:
        model = self.config.model
    else:
        model = "gpt-4o"

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": GLAM_NER_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": self.config.temperature,
        "max_tokens": self.config.max_tokens,
    }

    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        response = await client.post(url, headers=headers, json=payload)
        response.raise_for_status()
        body = response.json()
    return body["choices"][0]["message"]["content"]
async def _analyze_image(
    self,
    image_url: Optional[str] = None,
    image_base64: Optional[str] = None,
    image_path: Optional[str] = None,
    alt_text: Optional[str] = None,
    context: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Analyze an image using Z.AI GLM-4.5V vision model.

    Extracts visual descriptions, entities, OCR text, and heritage
    relevance. Exactly one image source is used, checked in order:
    base64 data, local file path, then URL.

    Args:
        image_url: URL of the image (absolute or relative)
        image_base64: Base64-encoded image data
        image_path: Local file path to image
        alt_text: HTML alt text for context
        context: Surrounding text context from the page

    Returns:
        Dict with analysis results (description, detected_entities,
        extracted_text, heritage_relevance, image_type, era_estimate,
        style, analysis_confidence), or a dict with an "error" key
        when the image cannot be analyzed.
    """
    import httpx
    import base64

    # ---- Build the image part of the multimodal message ----
    image_content = None
    if image_base64:
        # Caller supplied base64 directly; assume JPEG.
        image_content = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
        }
    elif image_path:
        # Read and base64-encode a local file.
        try:
            local = Path(image_path)
            if local.exists():
                with open(local, 'rb') as fh:
                    encoded = base64.b64encode(fh.read()).decode('utf-8')
                # Detect MIME type from the file extension.
                mime_by_ext = {
                    '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
                    '.png': 'image/png', '.gif': 'image/gif',
                    '.webp': 'image/webp', '.bmp': 'image/bmp'
                }
                mime = mime_by_ext.get(local.suffix.lower(), 'image/jpeg')
                image_content = {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime};base64,{encoded}"},
                }
        except Exception as e:
            logger.warning(f"Failed to read image file {image_path}: {e}")
            return {"error": f"Failed to read image: {e}"}
    elif image_url:
        # Only absolute URLs can be fetched by the vision API.
        if image_url.startswith(('http://', 'https://')):
            image_content = {"type": "image_url", "image_url": {"url": image_url}}
        else:
            logger.debug(f"Skipping relative URL image: {image_url}")
            return {"error": "Relative URL - cannot analyze without base URL"}

    if not image_content:
        return {"error": "No valid image source provided"}

    # ---- Build the analysis prompt ----
    lines = [
        "Analyze this image from a heritage institution website.",
        "",
        "Provide a JSON response with the following fields:",
        "- description: Detailed description of what the image shows",
        "- detected_entities: Array of entities visible in the image, each with {type, text, confidence}",
        "  - Use GLAM-NER types: AGT.PER (person), WRK.VIS (artwork), THG.ART (artifact), TOP.BLD (building), GRP.ETH (ethnic group), etc.",
        "- extracted_text: Any text visible in the image (OCR)",
        "- heritage_relevance: Why this image is significant for heritage/cultural preservation",
        "- image_type: One of: photograph, painting, document, map, artifact, museum_object, historical_photo, memorial, building, portrait, group_photo, exhibition, other",
        "- era_estimate: Estimated time period of the content (e.g., '1940s', 'World War II', 'medieval', 'contemporary')",
        "- style: Artistic or photographic style",
        "- analysis_confidence: Your confidence in this analysis (0.0-1.0)",
    ]
    if alt_text:
        lines.extend(["", f"HTML alt text: {alt_text}"])
    if context:
        lines.extend(["", f"Page context: {context[:500]}..."])
    lines.extend([
        "",
        "Return ONLY valid JSON, no markdown code blocks."
    ])
    prompt = "\n".join(lines)

    # ---- Call Z.AI GLM-4.5V Vision API ----
    # Z.AI uses OpenAI-compatible format for vision at a different endpoint.
    url = "https://api.z.ai/api/paas/v4/chat/completions"
    headers = {
        "Authorization": f"Bearer {self.config.api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": self.config.vision_model,  # "glm-4.5v"
        "messages": [
            {
                "role": "user",
                "content": [
                    image_content,
                    {"type": "text", "text": prompt}
                ]
            }
        ],
        "max_tokens": self.config.vision_max_tokens,
        "temperature": 0.1,
    }

    # Retry loop with exponential backoff for the vision endpoint.
    max_retries = self.config.retry.max_retries
    base_delay = self.config.retry.base_delay

    for attempt in range(max_retries + 1):
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                response = await client.post(url, headers=headers, json=payload)

                # Explicit rate-limit handling before raise_for_status.
                if response.status_code == 429:
                    if attempt < max_retries:
                        wait = base_delay * (2 ** attempt)
                        if self.config.retry.jitter:
                            wait += random.uniform(0, wait * 0.1)
                        wait = min(wait, self.config.retry.max_delay)
                        logger.info(f"Vision API rate limited, retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})")
                        await asyncio.sleep(wait)
                        continue
                    else:
                        return {"error": "Vision API rate limited after max retries"}

                response.raise_for_status()
                data = response.json()

                content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
                try:
                    return json.loads(content)
                except json.JSONDecodeError:
                    # Model ignored the JSON instruction — keep its text.
                    return {
                        "description": content,
                        "detected_entities": [],
                        "analysis_confidence": 0.5,
                        "error": "Response was not valid JSON"
                    }
        except httpx.HTTPStatusError as e:
            if e.response.status_code in self.config.retry.retry_on_status and attempt < max_retries:
                wait = base_delay * (2 ** attempt)
                logger.info(f"Vision API error {e.response.status_code}, retrying in {wait:.1f}s")
                await asyncio.sleep(wait)
                continue
            logger.warning(f"Vision API HTTP error: {e.response.status_code}")
            return {"error": f"Vision API error: {e.response.status_code}"}
        except Exception as e:
            logger.warning(f"Vision API call failed: {e}")
            return {"error": str(e)}

    return {"error": "Vision API failed after retries"}
async def analyze_images_in_html(
    self,
    html_content: str,
    base_url: Optional[str] = None,
    image_dir: Optional[Path] = None,
) -> List[ImageClaim]:
    """
    Extract and analyze all images from an HTML document.

    Args:
        html_content: Raw HTML content
        base_url: Base URL for resolving relative image paths
        image_dir: Local directory containing downloaded images

    Returns:
        List of ImageClaim objects with analysis results
    """
    from lxml import html as lxml_html
    from urllib.parse import urljoin

    image_claims: List[ImageClaim] = []

    try:
        tree = lxml_html.fromstring(html_content)
    except Exception as exc:
        logger.warning(f"Failed to parse HTML for image extraction: {exc}")
        return []

    img_nodes = tree.xpath('//img[@src]')

    # Cap the per-page workload.
    if len(img_nodes) > self.config.max_images_per_page:
        logger.info(f"Limiting image analysis to {self.config.max_images_per_page} of {len(img_nodes)} images")
        img_nodes = img_nodes[:self.config.max_images_per_page]

    for node in img_nodes:
        src = node.get('src', '')
        alt = node.get('alt', '')
        title = node.get('title', '')
        width_attr = node.get('width', '')
        height_attr = node.get('height', '')

        # Inline data URIs are never analyzed.
        if src.startswith('data:'):
            continue

        # Parse declared dimensions; skip obvious icons/decorations.
        try:
            width = int(width_attr) if width_attr.isdigit() else None
            height = int(height_attr) if height_attr.isdigit() else None
            if width and height:
                if width < self.config.min_image_size or height < self.config.min_image_size:
                    continue
        except (ValueError, TypeError):
            width = height = None

        # XPath of the <img> node, kept for provenance.
        xpath = tree.getroottree().getpath(node)

        # Gather nearby text (parent text + sibling tails) as context.
        context = ""
        parent = node.getparent()
        if parent is not None:
            pieces = []
            if parent.text:
                pieces.append(parent.text.strip())
            for sibling in parent:
                if sibling.tail:
                    pieces.append(sibling.tail.strip())
            context = " ".join(pieces)[:300]

        # Resolve where the image lives: remote URL and/or local copy.
        image_url = None
        image_path = None
        image_base64 = None
        if src.startswith('http://') or src.startswith('https://'):
            image_url = src
        elif base_url:
            image_url = urljoin(base_url, src)
        if image_dir:
            # Try a few plausible locations for a downloaded copy.
            candidates = [
                image_dir / Path(src).name,
                image_dir / src.lstrip('/'),
                image_dir / src,
            ]
            for candidate in candidates:
                if candidate.exists():
                    image_path = str(candidate)
                    break

        analysis = await self._analyze_image(
            image_url=image_url,
            image_path=image_path,
            image_base64=image_base64,
            alt_text=alt or title,
            context=context,
        )

        if analysis.get("error"):
            logger.debug(f"Skipping image {src}: {analysis.get('error')}")
            continue

        image_claims.append(ImageClaim(
            image_url=image_url or src,
            image_path=image_path,
            alt_text=alt if alt else None,
            title=title if title else None,
            width=width,
            height=height,
            xpath=xpath,
            description=analysis.get("description"),
            detected_entities=analysis.get("detected_entities", []),
            extracted_text=analysis.get("extracted_text"),
            heritage_relevance=analysis.get("heritage_relevance"),
            image_type=analysis.get("image_type"),
            era_estimate=analysis.get("era_estimate"),
            style=analysis.get("style"),
            analysis_model=self.config.vision_model,
            analysis_confidence=analysis.get("analysis_confidence"),
            provenance=Provenance(
                namespace="glam-ner",
                path=xpath,
                timestamp=datetime.now(timezone.utc).isoformat(),
                agent=f"LLMAnnotator/{self.config.vision_model}",
                context_convention="GLAM-NER v1.7.0-unified/vision",
                confidence=analysis.get("analysis_confidence", 0.5),
            ),
        ))

    return image_claims

def _parse_response(self, response: str) -> Dict[str, Any]:
    """Parse LLM response JSON, unwrapping markdown code fences.

    Falls back to an empty annotation structure when nothing in the
    response decodes as JSON.
    """
    import re

    # The model may wrap its JSON in ```json ... ``` fences.
    fenced = re.search(r'```(?:json)?\s*([\s\S]*?)```', response)
    candidate = fenced.group(1) if fenced else response
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return {"entities": [], "layout_regions": [], "claims": []}
def _populate_session(
    self,
    session: AnnotationSession,
    annotations: Dict[str, Any],
    source_url: Optional[str],
) -> None:
    """Populate *session* with the parsed LLM annotations.

    Creates entity, layout, relationship, and aggregate claims; each
    claim carries a Provenance record stamped with the provider/model
    used. Entity text spans are indexed so relationships can link
    back to the entity claims they mention.
    """
    stamp = datetime.now(timezone.utc).isoformat()

    # Text-span -> claim-id index used when linking relationships.
    entity_id_lookup: Dict[str, str] = {}

    # ---- Entities ----
    for entity in annotations.get("entities", []):
        # Hypernym may be explicit, or implied by a dotted hyponym
        # such as "GRP.HER" (first segment is the hypernym).
        hypernym_code = entity.get("hypernym", "THG")
        hyponym_code = entity.get("hyponym", "")
        if hyponym_code and "." in hyponym_code:
            hypernym_code = hyponym_code.split(".")[0]
        try:
            hypernym = EntityHypernym(hypernym_code)
        except ValueError:
            hypernym = EntityHypernym.THG

        claim_id = f"entity-{len(session.entity_claims)+1}"

        span = entity.get("text", "")
        if span:
            entity_id_lookup[span] = claim_id

        # Ontology class: hyponym takes precedence (more specific),
        # falling back to the hypernym mapping.
        class_uri = get_ontology_class(hyponym_code) if hyponym_code else None
        if not class_uri:
            class_uri = get_ontology_class(hypernym_code)

        session.entity_claims.append(EntityClaim(
            claim_id=claim_id,
            hypernym=hypernym,
            hyponym=hyponym_code if hyponym_code else "unknown",
            text_content=span,  # LLM returns "text"; stored as text_content
            class_uri=class_uri,
            isil_id=entity.get("isil_id"),
            cidoc_class=entity.get("cidoc_class"),  # Backwards compatibility
            recognition_confidence=entity.get("confidence", 0.5),
            provenance=Provenance(
                namespace="glam-ner",
                path=entity.get("xpath", ""),
                timestamp=stamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
            ),
        ))

    # ---- Layout regions ----
    for region in annotations.get("layout_regions", []):
        try:
            region_type = LayoutRegion(region.get("region", "PAR"))
        except ValueError:
            region_type = LayoutRegion.PAR
        try:
            semantic_role = SemanticRole(region.get("semantic_role", "PRIM"))
        except ValueError:
            semantic_role = SemanticRole.PRIM

        session.layout_claims.append(LayoutClaim(
            claim_id=f"layout-{len(session.layout_claims)+1}",
            region=region_type,
            semantic_role=semantic_role,
            xpath=region.get("xpath", ""),
            text_content=region.get("text_preview", "")[:200],
            provenance=Provenance(
                namespace="glam-ner",
                path=region.get("xpath", ""),
                timestamp=stamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
            ),
        ))

    # ---- Relationships ----
    for rel in annotations.get("relationships", []):
        self._process_relationship(
            session=session,
            rel_data=rel,
            entity_id_lookup=entity_id_lookup,
            timestamp=stamp,
        )

    # ---- Aggregate claims ----
    for claim_data in annotations.get("claims", []):
        session.aggregate_claims.append(AggregateClaim(
            claim_id=f"claim-{len(session.aggregate_claims)+1}",
            claim_type=claim_data.get("claim_type", "unknown"),
            claim_value=claim_data.get("claim_value", ""),
            text_content=claim_data.get("claim_value", ""),  # mirrored in text_content
            provenance=Provenance(
                namespace="glam-ner",
                path=claim_data.get("xpath", ""),
                timestamp=stamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
                confidence=claim_data.get("confidence", 0.5),
            ),
        ))
def _process_relationship(
    self,
    session: AnnotationSession,
    rel_data: Dict[str, Any],
    entity_id_lookup: Dict[str, str],
    timestamp: str,
) -> None:
    """Convert one LLM relationship record into a RelationshipClaim.

    Subject/object spans are linked back to entity claim ids via
    *entity_id_lookup*. Domain/range constraints are validated in
    non-strict mode: violations are recorded on session.errors but
    never block the claim.
    """
    # Relationship type, e.g. "REL.CRE.AUT": the first two dotted
    # segments form the hypernym, the full code is the hyponym.
    rel_type = rel_data.get("relationship_type", "REL.CRE")
    rel_hypernym = None
    rel_hyponym = None
    if rel_type and "." in rel_type:
        parts = rel_type.split(".")
        if len(parts) >= 2:
            try:
                rel_hypernym = RelationshipHypernym(f"{parts[0]}.{parts[1]}")
            except ValueError:
                pass
            rel_hyponym = rel_type

    # Subject endpoint (accepts "entity_type" or legacy "type" key).
    subject_data = rel_data.get("subject", {})
    subject_text = subject_data.get("text", "")
    subject = RelationshipSubject(
        entity_id=entity_id_lookup.get(subject_text),
        entity_type=subject_data.get("entity_type") or subject_data.get("type"),
        span_text=subject_text,
        uri=subject_data.get("uri"),
    )

    # Object endpoint.
    object_data = rel_data.get("object", {})
    object_text = object_data.get("text", "")
    obj = RelationshipObject(
        entity_id=entity_id_lookup.get(object_text),
        entity_type=object_data.get("entity_type") or object_data.get("type"),
        span_text=object_text,
        uri=object_data.get("uri"),
    )

    # Predicate defaults its label to the relationship type code.
    predicate_data = rel_data.get("predicate", {})
    predicate = RelationshipPredicate(
        uri=predicate_data.get("uri"),
        label=predicate_data.get("label", rel_type),
        direction=predicate_data.get("direction", "FORWARD"),
    )

    # Optional temporal scope.
    temporal_data = rel_data.get("temporal", {})
    temporal_scope = TemporalScope(
        start_date=temporal_data.get("start_date"),
        end_date=temporal_data.get("end_date"),
        temporal_modifier=temporal_data.get("modifier"),
    ) if temporal_data else None

    # Optional spatial scope.
    spatial_data = rel_data.get("spatial", {})
    spatial_scope = SpatialScope(
        place_id=spatial_data.get("place_id"),
        place_name=spatial_data.get("place_name"),
        geo_uri=spatial_data.get("geo_uri"),
    ) if spatial_data else None

    # Optional qualifiers.
    qualifiers = [
        RelationshipQualifier(
            qualifier_type=qual.get("type", ""),
            qualifier_value=qual.get("value", ""),
            qualifier_uri=qual.get("uri"),
        )
        for qual in rel_data.get("qualifiers", [])
    ]

    claim = RelationshipClaim(
        claim_id=f"rel-{len(session.relationship_claims)+1}",
        relationship_hypernym=rel_hypernym,
        relationship_hyponym=rel_hyponym,
        subject=subject,
        predicate=predicate,
        object=obj,
        temporal_scope=temporal_scope,
        spatial_scope=spatial_scope,
        qualifiers=qualifiers,
        negation=rel_data.get("negation", False),
        hypothetical=rel_data.get("hypothetical", False),
        source_claim=rel_data.get("source_claim", False),
        attributed_to=rel_data.get("attributed_to"),
        extraction_confidence=rel_data.get("confidence", 0.5),
        text_content=rel_data.get("text", ""),  # original text span
        provenance=Provenance(
            namespace="glam-ner",
            path=rel_data.get("xpath", ""),
            timestamp=timestamp,
            agent=f"{self.config.provider.value}/{self.config.model}",
            context_convention=self.config.context_convention,
            confidence=rel_data.get("confidence", 0.5),
        ),
    )

    # Advisory domain/range validation (non-strict).
    if rel_hyponym:
        result = validate_relationship_constraints(
            relationship_type=rel_hyponym,
            subject_type=subject.entity_type,
            object_type=obj.entity_type,
            strict=False,  # Treat violations as warnings, not errors
        )
        if result.warnings:
            for warning in result.warnings:
                session.errors.append(f"[VALIDATION WARNING] {claim.claim_id}: {warning}")
                logger.warning(f"Relationship validation: {warning}")
        if result.errors:
            for error in result.errors:
                session.errors.append(f"[VALIDATION ERROR] {claim.claim_id}: {error}")
                logger.error(f"Relationship validation: {error}")

    session.add_relationship_claim(claim)
async def annotate_batch(
    self,
    documents: List[Union[HTMLDocument, str, Path]],
    source_urls: Optional[List[str]] = None,
    concurrency: int = 3,
) -> List[AnnotationSession]:
    """
    Annotate multiple documents concurrently.

    Args:
        documents: List of documents to annotate
        source_urls: Optional list of source URLs; when provided and
            non-empty it must contain one entry per document
        concurrency: Maximum concurrent requests

    Returns:
        List of AnnotationSessions, in the same order as *documents*

    Raises:
        ValueError: If a non-empty *source_urls* does not match
            *documents* in length. (Previously zip() silently dropped
            the extra documents.)
    """
    # FIX: a mismatched source_urls list used to silently truncate the
    # batch via zip(); fail loudly instead. None or [] still means
    # "no URLs" for every document.
    if source_urls and len(source_urls) != len(documents):
        raise ValueError(
            f"source_urls has {len(source_urls)} entries "
            f"but documents has {len(documents)}"
        )

    urls: List[Optional[str]] = list(source_urls) if source_urls else [None] * len(documents)
    semaphore = asyncio.Semaphore(concurrency)

    async def annotate_with_semaphore(doc: Union[HTMLDocument, str, Path], url: Optional[str]) -> AnnotationSession:
        # Bound the number of in-flight LLM requests.
        async with semaphore:
            return await self.annotate(doc, url)

    tasks = [
        annotate_with_semaphore(doc, url)
        for doc, url in zip(documents, urls)
    ]
    return await asyncio.gather(*tasks)

async def annotate_with_schema(
    self,
    document: Union[HTMLDocument, str, Path],
    schema: Optional[GLAMSchema] = None,
    source_url: Optional[str] = None,
    validate_output: bool = True,
) -> Tuple[AnnotationSession, Dict[str, Any]]:
    """
    Annotate a document using schema-driven extraction.

    This method uses GLAMSchema to:
    1. Generate targeted extraction prompts
    2. Extract structured fields defined in the schema
    3. Optionally validate output against JSON Schema

    Args:
        document: HTMLDocument, HTML string, or path to HTML file
        schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
        source_url: Optional source URL for provenance
        validate_output: Whether to validate extracted data against schema

    Returns:
        Tuple of (AnnotationSession, structured_data dict)

    Raises:
        TypeError: If *document* is not a supported type.

    Example:
        >>> schema = (
        ...     GLAMSchema("custom")
        ...     .entities("GRP", "TOP")
        ...     .structure()
        ...     .field("name::str::Institution name")  # GLiNER2 syntax
        ...     .field("type::[MUSEUM|ARCHIVE]::str::Type")
        ...     .build()
        ... )
        >>> session, data = await annotator.annotate_with_schema(doc, schema)
        >>> print(data["structured"]["name"])
    """
    # Use default schema if not provided
    if schema is None:
        schema = heritage_custodian_schema()

    # Load document content from whichever form was supplied.
    html_content: str
    if isinstance(document, Path):
        with open(document, 'r', encoding='utf-8') as f:
            html_content = f.read()
        source_url = source_url or str(document)
    elif isinstance(document, str):
        # Heuristic: short strings without HTML tags may be file paths.
        is_file_path = len(document) < 500 and not document.strip().startswith('<')
        if is_file_path:
            try:
                path = Path(document)
                if path.exists():
                    with open(path, 'r', encoding='utf-8') as f:
                        html_content = f.read()
                    source_url = source_url or document
                else:
                    html_content = document
            except OSError:
                # Path too long or invalid — treat as raw HTML.
                html_content = document
        else:
            html_content = document
    elif isinstance(document, HTMLDocument):
        html_content = document.raw_html
        source_url = source_url or document.source_url
    else:
        raise TypeError(f"Unsupported document type: {type(document)}")

    # Create session
    session = AnnotationSession(
        session_id=f"schema-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}",
        source_url=source_url or "unknown",
    )

    # Generate schema-aware prompt
    schema_prompt = schema.to_llm_prompt(include_examples=True, output_format="json")
    user_prompt = self._prepare_schema_prompt(html_content, schema_prompt)

    structured_data: Dict[str, Any] = {}
    try:
        # Call LLM with schema-aware prompt
        response = await self._call_llm(user_prompt)
        annotations = self._parse_response(response)

        # Extract structured data (plus optional classification and
        # relation sections) from the response.
        structured_data = annotations.get("structured", {})
        if "classifications" in annotations:
            structured_data["_classifications"] = annotations["classifications"]
        if "relations" in annotations:
            structured_data["_relations"] = annotations["relations"]

        # Validate against JSON Schema if requested
        if validate_output and structured_data:
            validation_errors = self._validate_structured_output(structured_data, schema)
            if validation_errors:
                session.errors.extend(validation_errors)

        # Populate session with entity and claim data
        self._populate_session(session, annotations, source_url)

        # Add structured data to session config
        session.config["structured_data"] = structured_data
        session.config["schema_name"] = schema.name
    except Exception as e:
        session.errors.append(f"Schema-driven annotation failed: {e}")

    session.completed_at = datetime.now(timezone.utc).isoformat()
    return session, structured_data
def _prepare_schema_prompt(self, html_content: str, schema_prompt: str) -> str:
    """Combine schema instructions with the (possibly truncated) document.

    Args:
        html_content: Raw HTML to analyze; truncated to leave room
            for the schema prompt in the context window.
        schema_prompt: Prompt text generated by GLAMSchema.

    Returns:
        The full user prompt sent to the LLM.
    """
    # Truncate if too long
    max_chars = 25000  # Leave room for schema prompt
    if len(html_content) > max_chars:
        html_content = html_content[:max_chars] + "\n... [truncated]"

    return f"""{schema_prompt}

---

## Document to Analyze

Extract all information following the schema above from this HTML document:

```html
{html_content}
```

## Instructions
1. Extract ALL entities matching the specified hypernyms
2. Fill in ALL structured fields from the schema
3. Include XPath locations for provenance
4. Use confidence scores appropriately
5. Return ONLY a valid JSON object matching the output format

IMPORTANT: The "structured" field in your response must contain the extracted field values.
"""

def _validate_structured_output(
    self,
    data: Dict[str, Any],
    schema: GLAMSchema,
) -> List[str]:
    """
    Validate structured output against schema.

    Args:
        data: Extracted structured data
        schema: GLAMSchema used for extraction

    Returns:
        List of validation error messages (empty if valid)
    """
    import re  # hoisted out of the per-field loop

    errors: List[str] = []

    # Pass 1: every required field must be present.
    for field in schema.fields:
        if field.required and field.name not in data:
            errors.append(f"Missing required field: {field.name}")

    # Pass 2: value checks for the fields that are present.
    for field in schema.fields:
        if field.name not in data:
            continue
        value = data[field.name]

        # Closed-choice fields may only hold allowed values.
        if field.choices and value:
            if field.dtype == "list":
                # FIX: a non-list value (e.g. the LLM returned a plain
                # string for a list field) used to be iterated
                # character-by-character, emitting one bogus error per
                # character. Report the type mismatch instead.
                if not isinstance(value, list):
                    errors.append(
                        f"Invalid value for {field.name}: {value}. "
                        f"Expected a list of choices from: {field.choices}"
                    )
                else:
                    invalid_values = [v for v in value if v not in field.choices]
                    if invalid_values:
                        errors.append(
                            f"Invalid values for {field.name}: {invalid_values}. "
                            f"Valid: {field.choices}"
                        )
            elif value not in field.choices:
                errors.append(
                    f"Invalid value for {field.name}: {value}. "
                    f"Valid: {field.choices}"
                )

        # Regex-constrained string fields (match is anchored at start).
        if field.pattern and value and isinstance(value, str):
            if not re.match(field.pattern, value):
                errors.append(
                    f"Field {field.name} does not match pattern {field.pattern}: {value}"
                )

    return errors

async def extract_structured(
    self,
    document: Union[HTMLDocument, str, Path],
    fields: List[str],
    source_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Quick structured extraction using GLiNER2-style field specs.

    This is a convenience method for simple extractions without full
    annotation session overhead.

    Args:
        document: Document to extract from
        fields: List of GLiNER2-style field specs
            e.g., ["name::str::Institution name",
                   "type::[MUSEUM|ARCHIVE]::str::Type"]
        source_url: Optional source URL

    Returns:
        Dict of extracted field values

    Example:
        >>> data = await annotator.extract_structured(
        ...     html_doc,
        ...     ["name::str::Full name",
        ...      "email::str::Contact email",
        ...      "type::[MUSEUM|ARCHIVE|LIBRARY]::str::Institution type"]
        ... )
        >>> print(data["name"])
    """
    # Build a throwaway schema from the field specs.
    schema = GLAMSchema("quick_extraction").structure()
    for field_spec in fields:
        schema.fields.append(FieldSpec.from_gliner2_syntax(field_spec))
    schema = schema.build()

    # Run extraction
    _, structured_data = await self.annotate_with_schema(
        document,
        schema=schema,
        source_url=source_url,
        validate_output=False,  # Skip validation for quick extraction
    )
    return structured_data
def create_llm_annotator(
    provider: str = "zai",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    enable_fallback: bool = True,
    max_retries: int = 5,
) -> LLMAnnotator:
    """
    Create an LLM annotator with the specified provider.

    Args:
        provider: "zai", "anthropic", or "openai"
        model: Optional model name (uses provider default if not specified)
        api_key: Optional API key (uses environment variable if not specified)
        enable_fallback: Enable automatic fallback to other providers on failure
        max_retries: Maximum retry attempts per provider

    Returns:
        Configured LLMAnnotator instance
    """
    provider_enum = LLMProvider(provider)

    default_models = {
        LLMProvider.ZAI: "glm-4.6",
        LLMProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
        LLMProvider.OPENAI: "gpt-4o",
    }

    # Fallback chain: every other provider, in canonical order.
    if enable_fallback:
        fallback_providers = [
            p
            for p in [LLMProvider.ZAI, LLMProvider.ANTHROPIC, LLMProvider.OPENAI]
            if p != provider_enum
        ]
    else:
        fallback_providers = []

    config = LLMAnnotatorConfig(
        provider=provider_enum,
        model=model or default_models[provider_enum],
        api_key=api_key,
        retry=RetryConfig(max_retries=max_retries),
        fallback_providers=fallback_providers,
    )
    return LLMAnnotator(config)

async def annotate_html_file(
    file_path: Union[str, Path],
    provider: str = "zai",
    model: Optional[str] = None,
) -> AnnotationSession:
    """
    Annotate an HTML file using LLM.

    Args:
        file_path: Path to HTML file
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        AnnotationSession with extracted claims
    """
    annotator = create_llm_annotator(provider=provider, model=model)
    return await annotator.annotate(file_path)

async def annotate_with_schema(
    file_path: Union[str, Path],
    schema: Optional[GLAMSchema] = None,
    provider: str = "zai",
    model: Optional[str] = None,
) -> Tuple[AnnotationSession, Dict[str, Any]]:
    """
    Annotate an HTML file using schema-driven extraction.

    Args:
        file_path: Path to HTML file
        schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        Tuple of (AnnotationSession, structured_data dict)

    Example:
        >>> schema = (
        ...     GLAMSchema("museum")
        ...     .entities("GRP", "TOP")
        ...     .structure()
        ...     .field("name::str::Museum name")
        ...     .field("city::str::City location")
        ...     .build()
        ... )
        >>> session, data = await annotate_with_schema("museum.html", schema)
        >>> print(data["name"])
    """
    annotator = create_llm_annotator(provider=provider, model=model)
    return await annotator.annotate_with_schema(file_path, schema=schema)

__all__ = [
    "LLMProvider",
    "LLMAnnotatorConfig",
    "RetryConfig",
    "LLMAnnotator",
    "GLAM_NER_SYSTEM_PROMPT",
    "create_llm_annotator",
    "annotate_html_file",
    "annotate_with_schema",
]