# glam/src/glam_extractor/annotators/llm_annotator.py
# Last modified: 2025-12-14 17:09:55 +01:00
# 1992 lines, 80 KiB, Python
"""
LLM-Based Agentic Annotator for GLAM Documents.
This module provides LLM-only entity annotation following GLAM-NER v1.7.0.
NO HEURISTIC/PATTERN-BASED METHODS - all recognition is done via LLM inference.
Supported LLM Providers:
- Z.AI (Zhipu AI) GLM-4 (default)
- Anthropic Claude
- OpenAI GPT-4
Based on GLAM-NER v1.7.0-unified Entity Annotation Convention.
Features:
- Exponential backoff retry for rate limits (429)
- Automatic provider fallback (Z.AI → Claude → OpenAI)
- Configurable retry attempts and delays
"""
import asyncio
import json
import logging
import os
import random
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from enum import Enum
# Configure logging
logger = logging.getLogger(__name__)
# Import base classes
from .base import (
AnnotationSession,
EntityClaim,
LayoutClaim,
AggregateClaim,
ImageClaim,
Provenance,
EntityHypernym,
LayoutRegion,
SemanticRole,
RelationshipClaim,
RelationshipSubject,
RelationshipObject,
RelationshipPredicate,
RelationshipHypernym,
TemporalScope,
SpatialScope,
RelationshipQualifier,
get_ontology_class,
validate_relationship_constraints,
)
from .html_parser import HTMLDocument
from .schema_builder import GLAMSchema, FieldSpec, heritage_custodian_schema
class LLMProvider(str, Enum):
    """Supported LLM providers.

    Mixes in ``str`` so members compare equal to their string values and
    serialize directly (e.g. ``provider == "zai"`` and JSON dumps work).
    """
    ZAI = "zai"              # Zhipu AI GLM-4
    ANTHROPIC = "anthropic"  # Claude
    OPENAI = "openai"        # GPT-4
@dataclass
class RetryConfig:
    """Configuration for retry logic with exponential backoff.

    The delay for attempt *n* (0-indexed) is
    ``base_delay * exponential_base ** n``, capped at ``max_delay``,
    optionally multiplied by a random jitter factor.
    """
    max_retries: int = 5            # Maximum retry attempts
    base_delay: float = 1.0         # Initial delay in seconds
    max_delay: float = 60.0         # Maximum delay cap
    exponential_base: float = 2.0   # Exponential backoff base
    jitter: bool = True             # Add random jitter to prevent thundering herd
    retry_on_status: tuple = (429, 500, 502, 503, 504)  # HTTP status codes to retry
@dataclass
class LLMAnnotatorConfig:
    """Configuration for LLM-based annotation."""
    provider: LLMProvider = LLMProvider.ZAI
    model: str = "glm-4.6"           # Z.AI's latest model
    api_key: Optional[str] = None
    temperature: float = 0.1         # Low temp for consistent extraction
    max_tokens: int = 4096
    timeout: int = 120               # Longer timeout for LLM calls
    # Annotation settings
    extract_entities: bool = True
    extract_layout: bool = True
    extract_claims: bool = True
    extract_images: bool = True      # Enable image analysis via vision model
    # Vision model settings (for image analysis)
    vision_model: str = "glm-4.5v"   # Z.AI's vision model
    vision_max_tokens: int = 2048    # Max tokens for vision response
    max_images_per_page: int = 10    # Limit images analyzed per page
    min_image_size: int = 50         # Minimum dimension (width or height) to analyze
    # Provenance settings
    context_convention: str = "GLAM-NER v1.7.0-unified"
    # Retry settings
    retry: RetryConfig = field(default_factory=RetryConfig)
    # Fallback providers (tried in order when primary fails)
    fallback_providers: Optional[List[LLMProvider]] = None

    def __post_init__(self):
        """Resolve the API key from the environment and build the fallback chain."""
        # One table drives both the env-var lookup and the fallback ordering.
        provider_env_vars = {
            LLMProvider.ZAI: "ZAI_API_TOKEN",
            LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
            LLMProvider.OPENAI: "OPENAI_API_KEY",
        }
        if self.api_key is None:
            env_name = provider_env_vars.get(self.provider)
            if env_name is not None:
                self.api_key = os.environ.get(env_name)
        # Default fallback chain: every other known provider, in fixed order
        # (dict insertion order: ZAI, ANTHROPIC, OPENAI).
        if self.fallback_providers is None:
            self.fallback_providers = [
                p for p in provider_env_vars if p != self.provider
            ]
# =============================================================================
# GLAM-NER v1.7.0 SYSTEM PROMPT
# =============================================================================
# Sent verbatim as the system message for every text-annotation call.
# NOTE: the parser of the model's reply depends on the JSON schema described
# below — edit this string and _parse_response together.
GLAM_NER_SYSTEM_PROMPT: str = """You are an expert entity annotator following the GLAM-NER v1.7.0-unified Entity Annotation Convention.
Your task is to extract structured claims from heritage institution documents with full provenance.
## HYPERNYMS AND HYPONYMS (10 types with subcategories)
### 1. AGT (Agent): Humans, animals, AI, fictional characters
**Subcategories:**
- AGT.PER: Person - INDIVIDUAL human beings with SPECIFIC NAMES (maps to crm:E21_Person)
⚠️ STRICT EXCLUSIONS - Do NOT tag as AGT.PER:
• Groups/collectives: "staff", "members", "curators", "colleagues", "board", "team", "committee", "participants", "community"
• Plural person references: "archivists", "researchers", "visitors", "filmmakers", "historians"
• Role descriptions without names: "the curator", "a researcher", "museum director"
• Organizations/events with role words: "FIAF Commission members", "conference colleagues", "board members"
• Networks: "VPRO/Tegenlicht network", "ACE member institutions"
• Topic references: "Verhalen van Bolsward" (stories about something)
• Conference/event participants: "Women and Silent Screen Conference participants"
• Fund/foundation board: "Prince Claus Fund board members"
• Festival communities: "Le Giornate del Cinema Muto community"
• Generic collectives: "community", "network", "consortium", "association"
✓ ONLY tag as AGT.PER: Named individuals like "Giovanna Fossati", "Dr. Jan van der Berg", "Martin Scorsese"
- AGT.STF: Staff - personnel in professional roles (maps to pico:PersonObservation)
- AGT.COL: Collective - named collectives without formal structure
- AGT.FIC: Fictional - characters from fiction/mythology
- AGT.MYT: Mythological - gods, deities, legendary figures
- AGT.ANI: Animal - named individual animals with agency
- AGT.ART: Artificial - AI systems, robots, software agents (maps to prov:SoftwareAgent)
Examples: "Dr. Jan van der Berg" → AGT.PER, "Giovanna Fossati" → AGT.PER, "the museum director" → AGT.STF
❌ NOT AGT.PER: "AMIA conference colleagues", "Prince Claus Fund board members", "festival community"
### 2. GRP (Group): Organizations, collectives, formal and informal
**Subcategories:**
- GRP.HER: Heritage institutions - museums, archives, libraries (maps to glam:HeritageCustodian)
- GRP.PAR: Parent/governing bodies (maps to rico:CorporateBody)
- GRP.UNT: Organizational units/departments (maps to org:OrganizationalUnit)
- GRP.COR: Corporations and businesses (maps to schema:Corporation)
- GRP.GOV: Government agencies (maps to schema:GovernmentOrganization)
- GRP.EDU: Educational institutions (maps to schema:EducationalOrganization)
- GRP.REL: Religious organizations (maps to schema:ReligiousOrganization)
- GRP.ASS: Associations and societies (maps to org:FormalOrganization)
- GRP.INF: Informal groups - movements, families, dynasties
- GRP.HIS: Historical organizations - defunct entities
- GRP.ETH: Ethnic groups - Jews, Roma, Sinti, indigenous peoples (maps to crm:E74_Group)
Examples: "Rijksmuseum" → GRP.HER, "Ministry of Culture" → GRP.GOV, "Joden" → GRP.ETH
### 3. TOP (Toponym): Place names, nominal geographic references
**Subcategories:**
- TOP.SET: Settlement - cities, towns, villages (maps to schema:City)
- TOP.REG: Region - provinces, states, counties (maps to schema:AdministrativeArea)
- TOP.CTY: Country - nations, sovereign states (maps to schema:Country)
- TOP.ADR: Address - street addresses (maps to schema:PostalAddress)
- TOP.IAD: Institutional address
- TOP.BLD: Building - named buildings, monuments (maps to crm:E18_Physical_Thing)
- TOP.NAT: Natural features - mountains, rivers
- TOP.HIS: Historical places - concentration camps, transit camps, former territories (maps to crm:E53_Place)
- TOP.LEG: Legendary/fictional places
Examples: "Amsterdam" → TOP.SET, "the Netherlands" → TOP.CTY, "Auschwitz" → TOP.HIS
### 4. GEO (Geometry): Coordinates, shapes, spatial data
**Subcategories:**
- GEO.PNT: Point coordinates (maps to geo:Point)
- GEO.LIN: Line/path (maps to geo:LineString)
- GEO.POL: Polygon/area (maps to geo:Polygon)
- GEO.BOX: Bounding box (maps to geo:Envelope)
Examples: "52.3676° N, 4.9041° E" → GEO.PNT
### 5. TMP (Temporal): Dates, times, durations, periods
**Subcategories:**
- TMP.DAT: Absolute date - specific point (maps to time:Instant) [alias: TMP.DAB]
- TMP.DAB: Date Absolute - specific date "1885-03-22" (maps to time:Instant)
- TMP.DRL: Date Relative - "last year", "recently", "two weeks ago" (maps to time:Instant)
- TMP.TIM: Time of day (maps to time:Instant) [alias: TMP.TAB]
- TMP.TAB: Time Absolute - specific time "14:30:00" (maps to time:Instant)
- TMP.TRL: Time Relative - "later that evening", "soon after" (maps to time:Instant)
- TMP.DUR: Duration/period - "three hours", "from 1885 to 1890" (maps to time:Duration)
- TMP.RNG: Date range - "1885-1890", "March 1-15" (maps to time:Interval)
- TMP.SET: Recurring time - "every Monday", "annually"
- TMP.OPH: Opening hours - "Tuesday-Sunday 10:00-17:00" (maps to schema:OpeningHoursSpecification)
- TMP.REL: Relative time - "before", "after" [deprecated, use TMP.DRL/TMP.TRL]
- TMP.CEN: Century - "17th century", "the 1800s" (maps to crm:E4_Period)
- TMP.ERA: Historical era/period name - "Renaissance", "Bronze Age" (maps to crm:E4_Period)
- TMP.EXP: Exhibition period - "10 February - 4 June 2023" (maps to time:Interval)
Examples: "1885" → TMP.DAB, "18th century" → TMP.CEN, "every Tuesday" → TMP.SET, "10:00-17:00" → TMP.OPH
### 6. APP (Appellation): Identifiers, codes, reference numbers
**Subcategories:**
- APP.ISL: ISIL code (maps to crm:E42_Identifier)
- APP.WKD: Wikidata ID (maps to crm:E42_Identifier)
- APP.VIF: VIAF ID (maps to crm:E42_Identifier)
- APP.DOI: DOI
- APP.URL: URL/URI (maps to schema:URL)
- APP.ISBN: ISBN
- APP.ISSN: ISSN
- APP.KVK: Dutch Chamber of Commerce number
- APP.TTL: Title of work (maps to crm:E35_Title) [alias: APP.TIT]
- APP.TIT: Title of work (maps to crm:E35_Title)
- APP.NAM: Personal name - structured (maps to pnv:PersonName) [alias: APP.PNM]
- APP.PNM: Personal name - structured (maps to pnv:PersonName)
- APP.AWD: Award name
- APP.COL: Collection name
- APP.EXH: Exhibition name/title (maps to crm:E35_Title)
Examples: "ISIL NL-AmRM" → APP.ISL, "Q190804" → APP.WKD, "Rembrandt and His Era" → APP.EXH
### 7. ROL (Role): Titles, positions, honorifics, occupations
**Subcategories:**
- ROL.OCC: Occupation - profession, trade, job title (maps to schema:Occupation)
- ROL.TTL: Title/honorific - "Dr.", "Prof.", academic/professional titles (maps to schema:Role)
- ROL.HON: Honorific - "Sir", "Dame", "The Honorable" (maps to schema:honorificPrefix)
- ROL.NOB: Nobility title - "Duke", "Baron", "Count", hereditary titles (maps to schema:honorificSuffix)
- ROL.POS: Position/office - "Director", "Chairman" (maps to org:Post)
- ROL.REL: Relational role - father, mother, kinship (maps to bio:Relationship)
- ROL.REL.REL: Religious role - "Bishop", "Rabbi", "Imam" (maps to schema:Role)
Examples: "Director" → ROL.POS, "Prof. Dr." → ROL.TTL, "Duke of Wellington" → ROL.NOB, "Rabbi" → ROL.REL.REL
### 8. WRK (Work): Works following FRBR model
**Subcategories:**
- WRK.WRK: FRBR Work - abstract (maps to frbroo:F1_Work) [alias: WRK.ABS]
- WRK.ABS: Abstract work (maps to frbroo:F1_Work)
- WRK.EXP: FRBR Expression (maps to frbroo:F2_Expression)
- WRK.MAN: FRBR Manifestation (maps to frbroo:F3_Manifestation)
- WRK.ITM: FRBR Item (maps to frbroo:F5_Item)
- WRK.MSS: Manuscript - handwritten/unpublished work (maps to rico:Record)
- WRK.ARC: Archival record/document (maps to rico:Record)
- WRK.TXT: Textual work (maps to schema:Book)
- WRK.VIS: Visual work (maps to schema:VisualArtwork)
- WRK.MUS: Musical work (maps to schema:MusicComposition)
- WRK.PER: Performance (maps to schema:PerformingArtsEvent)
- WRK.CIN: Cinematic work (maps to schema:Movie)
- WRK.OBJ: Physical object/artifact (maps to crm:E22_Human-Made_Object)
- WRK.COL: Collection (maps to crm:E78_Curated_Holding)
- WRK.SER: Series (maps to schema:CreativeWorkSeries)
- WRK.WEB: Web resource/page (maps to schema:WebPage)
- WRK.URL: URL reference to work/link (maps to schema:URL)
- WRK.EML: Email message (maps to schema:Message)
- WRK.SOC: Social media post/content (maps to schema:SocialMediaPosting)
- WRK.CIT: Citation/bibliographic reference (maps to schema:Citation)
Examples: "The Night Watch" → WRK.VIS, "Annual Report 2023" → WRK.TXT, "15th-century codex" → WRK.MSS
### 9. QTY (Quantity): Measurements, counts, numeric values
**Subcategories:**
- QTY.CNT: Count (maps to crm:E54_Dimension)
- QTY.MSR: Measurement (maps to crm:E54_Dimension)
- QTY.PCT: Percentage
- QTY.CUR: Currency/monetary (maps to schema:MonetaryAmount)
- QTY.ORD: Ordinal (maps to crm:E60_Number)
- QTY.RNG: Range
Examples: "over 8,000 artworks" → QTY.CNT, "€2.5 million" → QTY.CUR
### 10. THG (Thing): Physical objects, artifacts, concepts, events
**Subcategories:**
- THG.ART: Artwork (maps to crm:E22_Human-Made_Object)
- THG.AFT: Artifact - human-made object of historical significance (maps to crm:E22_Human-Made_Object)
- THG.SPC: Specimen - natural history specimen, scientific sample (maps to crm:E20_Biological_Object)
- THG.DOC: Document (maps to foaf:Document)
- THG.PHO: Photograph (maps to schema:Photograph)
- THG.OBJ: Physical object - generic (maps to crm:E19_Physical_Object)
- THG.EVT: Historical event - deportation, persecution, liberation, war (maps to crm:E5_Event)
- THG.CON: Concept/abstract thing - stories, memories, heritage, mission (maps to crm:E28_Conceptual_Object)
- THG.TAX: Taxonomic term - species (maps to crm:E55_Type)
- THG.LNG: Language (maps to crm:E56_Language)
- THG.MAT: Material - bronze, marble, paper, etc. (maps to crm:E57_Material)
Examples: "17th-century painting" → THG.ART, "deportation" → THG.EVT, "the stories" → THG.CON, "Dutch" → THG.LNG
## RELATIONSHIP TYPES AND CONSTRAINTS
Relationships connect two entities. Each relationship has domain (subject) and range (object) constraints.
**⚠️ CRITICAL: COMPREHENSIVE SEMANTIC TRIPLE EXTRACTION ⚠️**
You MUST extract ALL semantic relationships from narrative text, not just named entity relationships.
Decompose every sentence into its constituent semantic triples (subject-predicate-object).
Example text: "In het Herinneringscentrum Kamp Westerbork vertellen we de verhalen van meer dan honderdduizend Joden en Sinti en Roma die vanuit Nederland naar vernietigings- en concentratiekampen werden gedeporteerd"
This SINGLE sentence contains these triples:
1. REL.ORG.ACT: Herinneringscentrum Kamp Westerbork → performs activity → tell stories
2. REL.SUB.ABT: the stories → are about → Joden (Jews)
3. REL.SUB.ABT: the stories → are about → Sinti and Roma
4. REL.QTY.CNT: Jews/Sinti/Roma → quantity → more than 100,000
5. REL.SPA.ORG: deportees → originated from → Nederland
6. REL.SPA.DST: deportees → destination → concentration camps
7. REL.SPA.DST: deportees → destination → extermination camps
8. REL.EVT.PAR: Jews/Sinti/Roma → participated in → deportation (forced)
### REL.CRE (Creation) - Agent creates Work
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.CRE.AUT | AGT.PER, AGT.GRP | WRK.TXT | "Martin Luther authored 95 Theses" |
| REL.CRE.ART | AGT.PER | WRK.VIS, THG.ART | "Rembrandt painted The Night Watch" |
| REL.CRE.COM | AGT.PER | WRK.MUS | "Beethoven composed Symphony No. 9" |
| REL.CRE.PHO | AGT.PER | THG.PHO | "Photographer captured portrait" |
| REL.CRE.DES | AGT.PER, AGT.GRP | WRK.OBJ | "Architect designed building" |
### REL.SPA (Spatial) - Located in / Contains / Origin / Destination
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SPA.LOC | AGT, EVT, GRP, WRK | TOP | "Museum located in Amsterdam" |
| REL.SPA.WTH | TOP | TOP | "Amsterdam within North Holland" |
| REL.SPA.CON | TOP | TOP | "Netherlands contains Amsterdam" |
| REL.SPA.ORG | AGT.PER, WRK, GRP.ETH | TOP | "Jews came from Netherlands" |
| REL.SPA.DST | AGT, EVT, GRP | TOP | "Deported to concentration camps" |
### REL.SOC (Social) - Person-to-person relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SOC.FAM.SPO | AGT.PER | AGT.PER | "Martin Luther married Katharina von Bora" |
| REL.SOC.FAM.PAR | AGT.PER | AGT.PER | "Parent of child" |
| REL.SOC.PRO.STU | AGT.PER | AGT.PER | "Student studied under master" |
| REL.SOC.MEM | AGT.PER, GRP | GRP | "Person/org member of organization" |
| REL.SOC.EMP | AGT.PER | GRP | "Employee works for company" |
### REL.ORG (Organizational) - Group activities and relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.ORG.PAR | GRP | GRP | "Parent organization" |
| REL.ORG.SUB | GRP | GRP | "Subsidiary organization" |
| REL.ORG.SUC | GRP | GRP | "Successor organization" |
| REL.ORG.FND | AGT.PER, GRP | GRP | "Founder established organization" |
| REL.ORG.ACT | GRP.HER, GRP | THG.CON, WRK | "Museum tells stories" / "Archive preserves documents" |
| REL.ORG.MIS | GRP.HER, GRP | THG.CON | "Organization's mission is..." |
| REL.ORG.SRV | GRP.HER, GRP | GRP, AGT | "Museum serves researchers" |
### REL.CUS (Custodial) - Ownership/Keeping
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.CUS.KEP | WRK, THG | GRP.HER | "Artwork kept by Rijksmuseum" |
| REL.CUS.OWN | WRK, THG | AGT.PER, GRP | "Collector owns painting" |
| REL.CUS.COL | WRK, THG | WRK.COL | "Item in collection" |
| REL.CUS.DNT | WRK, THG | AGT.PER | "Donated by benefactor" |
### REL.WRK (Work/FRBR) - Work relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.WRK.EXP | WRK.EXP | WRK.WRK | "Expression of work" |
| REL.WRK.PRT | WRK | WRK | "Part of larger work" |
| REL.WRK.SER | WRK | WRK.SER | "Volume in series" |
| REL.WRK.TRN | WRK.EXP | WRK.WRK | "Translation of work" |
### REL.SUB (Subject/About) - Topics and content
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.SUB.ABT | WRK, GRP.HER, THG | AGT, GRP, EVT, TOP, THG | "Stories about Jews and Roma" |
| REL.SUB.DEP | WRK.VIS, THG.PHO | AGT, TOP, EVT | "Photo depicts memorial" |
| REL.SUB.THM | GRP.HER, WRK.COL | THG.CON | "Collection themes: WWII, Holocaust" |
### REL.EVT (Event) - Participation and historical events
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.EVT.PAR | AGT, GRP, GRP.ETH | EVT, THG.EVT | "Jews participated in deportation" |
| REL.EVT.ORG | AGT, GRP | EVT | "Nazis organized deportations" |
| REL.EVT.LOC | EVT | TOP | "Deportations from Netherlands" |
| REL.EVT.VIC | AGT, GRP, GRP.ETH | EVT | "Jews were victims of persecution" |
| REL.EVT.TIM | EVT | TMP | "Deportations in 1942-1944" |
### REL.QTY (Quantity) - Numeric relations
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.QTY.CNT | GRP, GRP.ETH, WRK.COL | QTY.CNT | "More than 100,000 people" |
| REL.QTY.MSR | THG, TOP | QTY.MSR | "Building is 500 sqm" |
| REL.QTY.YRS | GRP.HER, AGT | QTY.CNT, TMP | "Museum operating for 50 years" |
### REL.ROL (Role) - Occupation/Position
| Hyponym | Domain (Subject) | Range (Object) | Example |
|---------|------------------|----------------|---------|
| REL.ROL.OCC | AGT.PER | ROL.OCC | "Person has occupation" |
| REL.ROL.HLD | AGT.PER | ROL.POS | "Person holds position" |
**IMPORTANT: Always include entity_type in relationship subject/object for validation!**
### ENTITY TYPES FOR RELATIONSHIP EXTRACTION
When extracting relationships, use these additional entity types:
- **GRP.ETH**: Ethnic groups (Joden, Sinti, Roma, etc.)
- **THG.CON**: Abstract concepts (stories, memories, heritage, mission)
- **THG.EVT**: Historical events (deportation, persecution, liberation)
- **TOP.HIS**: Historical places (concentration camps, transit camps)
## LAYOUT REGIONS (DOC hypernym)
Primary: HDR (heading), PAR (paragraph), SEN (sentence), LST (list), TBL (table)
Media: GAL (gallery), MAP (map), AUD (audio), VID (video), EMB (embedded)
Navigation: NAV (navigation), TOC (table of contents), IDX (index)
Front/Back: TTP (title page), DED (dedication), COL (colophon), BIB (bibliography), APP (appendix), GLO (glossary)
Commercial: ADV (advertisement), LOG (logo)
## OUTPUT FORMAT
Return a JSON object with this structure:
```json
{
"entities": [
{
"hypernym": "GRP",
"hyponym": "GRP.HER",
"text": "Rijksmuseum",
"xpath": "/html/body/div[1]/h1",
"confidence": 0.95,
"class_uri": "glam:HeritageCustodian",
"notes": "Main heritage institution name"
}
],
"layout_regions": [
{
"region": "HDR",
"level": 1,
"semantic_role": "PRIM",
"xpath": "/html/body/div[1]/h1",
"text_preview": "Rijksmuseum Amsterdam",
"contains_entities": ["GRP.HER:Rijksmuseum", "TOP.SET:Amsterdam"]
}
],
"claims": [
{
"claim_type": "full_name",
"claim_value": "Rijksmuseum Amsterdam",
"xpath": "/html/body/div[1]/h1",
"confidence": 0.95,
"source_entities": ["GRP.HER:Rijksmuseum"]
}
],
"relationships": [
{
"relationship_type": "REL.SPA.LOC",
"subject": {"entity_type": "GRP.HER", "text": "Rijksmuseum"},
"object": {"entity_type": "TOP.SET", "text": "Amsterdam"},
"predicate_uri": "schema:location",
"confidence": 0.90
}
]
}
```
## RULES
1. Every claim MUST have an XPath location in the source document
2. Use HYPONYM codes (e.g., GRP.HER, AGT.PER) not just hypernyms (e.g., GRP, AGT)
3. Include class_uri ontology mapping for each entity
4. Confidence scores: 0.9-1.0 (explicit), 0.7-0.9 (clear), 0.5-0.7 (inferred)
5. Entities within layout regions should be cross-referenced
6. Claims without XPath provenance are FABRICATED and must not be included
7. Extract relationships between entities (especially REL.SPA.LOC, REL.ORG.*, REL.CRE.*)
## ⚠️ CRITICAL: COMPREHENSIVE SEMANTIC EXTRACTION ⚠️
8. **DECOMPOSE EVERY NARRATIVE SENTENCE INTO TRIPLES** - A single sentence often contains 5-10 semantic relationships
9. **Extract ALL entities** - not just named entities, but also:
- Quantities (QTY.CNT: "more than 100,000")
- Ethnic groups (GRP.ETH: "Jews", "Sinti", "Roma")
- Abstract concepts (THG.CON: "stories", "memories", "heritage")
- Historical events (THG.EVT: "deportation", "persecution")
- Historical places (TOP.HIS: "concentration camps", "transit camps")
10. **Extract organizational activities** (REL.ORG.ACT): What does the institution DO? (preserve, tell, exhibit, research)
11. **Extract subject matter** (REL.SUB.ABT): What is the institution/collection ABOUT?
12. **Extract quantities** (REL.QTY.CNT): Numbers of visitors, items, people affected
13. **Extract spatial origins and destinations** (REL.SPA.ORG, REL.SPA.DST): Where did things/people come FROM and go TO?
14. **Extract event participation** (REL.EVT.PAR, REL.EVT.VIC): Who was involved in historical events?
### Example: Deep Semantic Parsing
Text: "Het museum bewaart meer dan 5000 voorwerpen uit de Tweede Wereldoorlog"
**INCORRECT** (shallow extraction):
- 1 entity: "Het museum" (GRP.HER)
- 0 relationships
**CORRECT** (deep semantic extraction):
- Entities:
- "Het museum" (GRP.HER)
- "meer dan 5000" (QTY.CNT)
- "voorwerpen" (THG.AFT - artifacts)
- "Tweede Wereldoorlog" (TMP.ERA)
- Relationships:
- REL.ORG.ACT: museum → performs → preservation (bewaart)
- REL.CUS.KEP: voorwerpen → kept by → museum
- REL.QTY.CNT: voorwerpen → quantity → meer dan 5000
- REL.TMP.DUR: voorwerpen → from period → Tweede Wereldoorlog
## CLAIM TYPES FOR HERITAGE INSTITUTIONS
- full_name: Official institution name
- short_name: Abbreviated name or acronym
- description: Institution description
- email: Contact email
- phone: Contact phone
- address: Physical address
- website: Official website URL
- social_media: Social media links (facebook, twitter, instagram, linkedin, youtube)
- opening_hours: Visitor hours
- admission_info: Ticket/entry information
- founding_date: When institution was established
- collection_count: Number of items in collection
- kvk_number: Dutch Chamber of Commerce number
- isil_code: International Standard Identifier for Libraries
- wikidata_id: Wikidata Q-number
- parent_organization: Parent/umbrella organization
"""
class LLMAnnotator:
"""
LLM-based document annotator.
Uses LLM inference for all entity recognition and claim extraction.
NO heuristic or pattern-based methods.
Example:
>>> config = LLMAnnotatorConfig(provider=LLMProvider.ZAI, model="glm-4")
>>> annotator = LLMAnnotator(config)
>>> session = await annotator.annotate(document)
>>> print(f"Found {len(session.entity_claims)} entities")
"""
def __init__(self, config: Optional[LLMAnnotatorConfig] = None):
"""
Initialize LLM annotator.
Args:
config: LLM configuration (defaults to Z.AI GLM-4)
"""
self.config = config or LLMAnnotatorConfig()
self._client = None
if not self.config.api_key:
raise ValueError(
f"API key not found for {self.config.provider.value}. "
f"Set environment variable or pass api_key in config."
)
    async def annotate(
        self,
        document: Union[HTMLDocument, str, Path],
        source_url: Optional[str] = None,
        image_dir: Optional[Path] = None,
    ) -> AnnotationSession:
        """
        Annotate a document using LLM inference.

        Accepts three input forms: a parsed ``HTMLDocument``, a raw HTML
        string, or a path (``Path`` or short path-like string) to an HTML
        file. Text annotation failures and image-analysis failures are
        recorded in ``session.errors`` rather than raised, so a session
        object is always returned.

        Args:
            document: HTMLDocument, HTML string, or path to HTML file
            source_url: Optional source URL for provenance
            image_dir: Optional directory containing downloaded images for
                vision analysis; auto-detected as the file's parent dir
                when the document is given as a path.

        Returns:
            AnnotationSession with extracted claims (and any errors).
        """
        # --- Resolve the input into raw HTML plus provenance fields ------
        html_content: str
        source_file: Optional[str] = None
        if isinstance(document, Path):
            with open(document, 'r', encoding='utf-8') as f:
                html_content = f.read()
            source_url = source_url or str(document)
            source_file = str(document)
            # Auto-detect image directory if not provided
            if image_dir is None:
                image_dir = document.parent
        elif isinstance(document, str):
            # Heuristic: a short string with no leading '<' is treated as a
            # candidate file path rather than inline HTML.
            is_file_path = len(document) < 500 and not document.strip().startswith('<')
            if is_file_path:
                try:
                    path = Path(document)
                    if path.exists():
                        with open(path, 'r', encoding='utf-8') as f:
                            html_content = f.read()
                        source_url = source_url or document
                        source_file = document
                        if image_dir is None:
                            image_dir = path.parent
                    else:
                        # Looks like a path but doesn't exist: treat as HTML.
                        html_content = document
                except OSError:
                    # Path too long or invalid — fall back to inline HTML.
                    html_content = document
            else:
                html_content = document
        elif isinstance(document, HTMLDocument):
            html_content = document.raw_html
            source_url = source_url or document.source_url
            source_file = document.source_file
        else:
            raise TypeError(f"Unsupported document type: {type(document)}")
        # --- Create session with a timestamp-based id --------------------
        session = AnnotationSession(
            session_id=f"llm-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}",
            source_url=source_url or "unknown",
            source_file=source_file,
        )
        # Prepare prompt (may truncate very long documents)
        user_prompt = self._prepare_prompt(html_content)
        # Call LLM for text annotation; errors are collected, not raised.
        try:
            response = await self._call_llm(user_prompt)
            # Parse response
            annotations = self._parse_response(response)
            # Convert to claims
            self._populate_session(session, annotations, source_url)
        except Exception as e:
            session.errors.append(f"LLM annotation failed: {e}")
        # Image analysis (if enabled) — best-effort, runs even when the
        # text annotation above failed.
        if self.config.extract_images:
            try:
                image_claims = await self.analyze_images_in_html(
                    html_content=html_content,
                    base_url=source_url,
                    image_dir=image_dir,
                )
                for claim in image_claims:
                    session.add_image_claim(claim)
                if image_claims:
                    logger.info(f"Analyzed {len(image_claims)} images from document")
            except Exception as e:
                session.errors.append(f"Image analysis failed: {e}")
                logger.warning(f"Image analysis failed: {e}")
        session.completed_at = datetime.now(timezone.utc).isoformat()
        return session
def _prepare_prompt(self, html_content: str) -> str:
"""Prepare the user prompt with document content."""
# Truncate if too long (LLM context limits)
max_chars = 30000
if len(html_content) > max_chars:
html_content = html_content[:max_chars] + "\n... [truncated]"
return f"""Analyze the following HTML document and extract all entities, layout regions, claims, and relationships.
Return a JSON object following the schema in the system prompt.
HTML DOCUMENT:
```html
{html_content}
```
## ⚠️ CRITICAL EXTRACTION REQUIREMENTS ⚠️
### 1. COMPREHENSIVE ENTITY EXTRACTION
Extract ALL entities, not just named entities:
- Heritage institutions (GRP.HER)
- Ethnic groups (GRP.ETH): Jews, Roma, Sinti, etc.
- Quantities (QTY.CNT): "more than 100,000", "5000 objects"
- Historical events (THG.EVT): deportation, persecution, liberation
- Abstract concepts (THG.CON): stories, memories, heritage, mission
- Historical places (TOP.HIS): concentration camps, transit camps
- Time periods (TMP.ERA): World War II, Holocaust
### 2. COMPREHENSIVE RELATIONSHIP EXTRACTION
Decompose EVERY narrative sentence into semantic triples:
- REL.ORG.ACT: What activities does the organization perform? (preserve, tell, exhibit, research, commemorate)
- REL.SUB.ABT: What is the collection/institution/story ABOUT?
- REL.QTY.CNT: Quantities of people, objects, visitors
- REL.SPA.ORG: Where did people/things come FROM?
- REL.SPA.DST: Where did people/things go TO?
- REL.EVT.PAR: Who participated in events (voluntary or forced)?
- REL.EVT.VIC: Who were victims of events?
### 3. EXAMPLE - WHAT WE EXPECT
For text: "In het Herinneringscentrum vertellen we de verhalen van meer dan honderdduizend Joden"
Extract:
- **Entities**:
- Herinneringscentrum (GRP.HER)
- de verhalen (THG.CON - stories/narratives)
- meer dan honderdduizend (QTY.CNT - >100,000)
- Joden (GRP.ETH - Jews as ethnic group)
- **Relationships**:
- REL.ORG.ACT: Herinneringscentrum → tells → verhalen
- REL.SUB.ABT: verhalen → about → Joden
- REL.QTY.CNT: Joden → quantity → meer dan honderdduizend
### 4. DO NOT:
- Skip abstract concepts or quantities
- Extract only named entities
- Ignore the semantic relationships within sentences
- Produce shallow extractions with few relationships
IMPORTANT: The richness of semantic extraction is critical. A single paragraph may contain 10-20 relationships.
"""
def _calculate_backoff_delay(self, attempt: int) -> float:
"""
Calculate delay for exponential backoff.
Args:
attempt: Current retry attempt number (0-indexed)
Returns:
Delay in seconds
"""
retry = self.config.retry
delay = retry.base_delay * (retry.exponential_base ** attempt)
delay = min(delay, retry.max_delay)
# Add jitter to prevent thundering herd
if retry.jitter:
delay = delay * (0.5 + random.random())
return delay
def _get_api_key_for_provider(self, provider: LLMProvider) -> Optional[str]:
"""Get API key for a specific provider from environment."""
env_vars = {
LLMProvider.ZAI: "ZAI_API_TOKEN",
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
LLMProvider.OPENAI: "OPENAI_API_KEY",
}
return os.environ.get(env_vars.get(provider, ""))
async def _call_provider(
self,
provider: LLMProvider,
user_prompt: str,
api_key: Optional[str] = None,
) -> str:
"""
Call a specific LLM provider.
Args:
provider: Which provider to call
user_prompt: The user prompt to send
api_key: Optional API key override
Returns:
LLM response string
"""
# Use provided key or get from environment
key = api_key or self._get_api_key_for_provider(provider)
if not key:
raise ValueError(f"No API key available for {provider.value}")
if provider == LLMProvider.ZAI:
return await self._call_zai(user_prompt, key)
elif provider == LLMProvider.ANTHROPIC:
return await self._call_anthropic(user_prompt, key)
elif provider == LLMProvider.OPENAI:
return await self._call_openai(user_prompt, key)
else:
raise ValueError(f"Unsupported provider: {provider}")
    async def _call_llm(self, user_prompt: str) -> str:
        """
        Call the LLM API with retry logic and provider fallback.

        Implements:
        1. Exponential backoff with jitter for rate limits
        2. Automatic fallback to alternative providers on failure

        Outer loop walks the provider chain (primary first, then each
        fallback); inner loop retries a single provider up to
        ``retry.max_retries`` times. Retryable HTTP statuses and timeouts
        sleep-then-retry; any other error breaks to the next provider.

        Returns:
            LLM response string

        Raises:
            Exception: If all retries and fallbacks are exhausted (raised
                as a RuntimeError chained to the last underlying error).
        """
        # Imported lazily so the module loads without httpx installed.
        import httpx
        # Build provider chain: primary + fallbacks
        providers_to_try = [self.config.provider]
        if self.config.fallback_providers:
            providers_to_try.extend(self.config.fallback_providers)
        last_exception: Optional[Exception] = None
        for provider in providers_to_try:
            # The configured key belongs to the primary provider only;
            # fallbacks read their own key from the environment.
            api_key = (
                self.config.api_key
                if provider == self.config.provider
                else self._get_api_key_for_provider(provider)
            )
            if not api_key:
                logger.info(f"Skipping {provider.value}: no API key available")
                continue
            logger.info(f"Trying provider: {provider.value}")
            for attempt in range(self.config.retry.max_retries):
                try:
                    return await self._call_provider(provider, user_prompt, api_key)
                except httpx.HTTPStatusError as e:
                    status_code = e.response.status_code
                    if status_code in self.config.retry.retry_on_status:
                        delay = self._calculate_backoff_delay(attempt)
                        logger.warning(
                            f"Provider {provider.value} returned {status_code} "
                            f"(attempt {attempt + 1}/{self.config.retry.max_retries}). "
                            f"Retrying in {delay:.2f}s..."
                        )
                        await asyncio.sleep(delay)
                        last_exception = e
                    else:
                        # Non-retryable error, try next provider
                        logger.error(
                            f"Provider {provider.value} returned non-retryable "
                            f"status {status_code}: {e}"
                        )
                        last_exception = e
                        break
                except httpx.TimeoutException as e:
                    # Timeouts are always retried with backoff.
                    delay = self._calculate_backoff_delay(attempt)
                    logger.warning(
                        f"Provider {provider.value} timed out "
                        f"(attempt {attempt + 1}/{self.config.retry.max_retries}). "
                        f"Retrying in {delay:.2f}s..."
                    )
                    await asyncio.sleep(delay)
                    last_exception = e
                except Exception as e:
                    # Unexpected failure (network, parsing, auth): give up
                    # on this provider immediately.
                    logger.error(f"Provider {provider.value} failed: {e}")
                    last_exception = e
                    break
            # All retries exhausted for this provider, try next
            # NOTE(review): this also logs after a non-retryable break, so the
            # "exhausted all retries" wording can be misleading in that case.
            logger.warning(f"Provider {provider.value} exhausted all retries")
        # All providers failed
        raise RuntimeError(
            f"All LLM providers failed. Last error: {last_exception}"
        ) from last_exception
async def _call_zai(self, user_prompt: str, api_key: str) -> str:
    """
    Send *user_prompt* to Z.AI's Anthropic-compatible messages endpoint.

    Z.AI's GLM Coding Plan exposes an Anthropic-compatible API at
    https://api.z.ai/api/anthropic/v1/messages, using the same message
    format as the Anthropic Claude API.
    """
    import httpx

    endpoint = "https://api.z.ai/api/anthropic/v1/messages"
    # Anthropic-style authentication headers.
    request_headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }
    # The Anthropic-compatible endpoint expects Claude model identifiers,
    # so translate any configured GLM model name.
    model_name = self.config.model
    if model_name.startswith("glm-"):
        model_name = "claude-sonnet-4-20250514"
    body = {
        "model": model_name,
        "max_tokens": self.config.max_tokens,
        "system": GLAM_NER_SYSTEM_PROMPT,
        "messages": [
            {"role": "user", "content": user_prompt},
        ],
    }
    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        resp = await client.post(endpoint, headers=request_headers, json=body)
        resp.raise_for_status()
        # Anthropic-style responses carry the text under content[0].text.
        return resp.json()["content"][0]["text"]
async def _call_anthropic(self, user_prompt: str, api_key: str) -> str:
    """Send *user_prompt* to the Anthropic Claude messages API."""
    import httpx

    # When Anthropic serves as a fallback, self.config.model names some other
    # provider's model, so substitute a known Claude model name.
    if self.config.provider == LLMProvider.ANTHROPIC:
        model_name = self.config.model
    else:
        model_name = "claude-3-5-sonnet-20241022"
    request_headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }
    body = {
        "model": model_name,
        "max_tokens": self.config.max_tokens,
        "system": GLAM_NER_SYSTEM_PROMPT,
        "messages": [
            {"role": "user", "content": user_prompt},
        ],
    }
    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        resp = await client.post(
            "https://api.anthropic.com/v1/messages",
            headers=request_headers,
            json=body,
        )
        resp.raise_for_status()
        return resp.json()["content"][0]["text"]
async def _call_openai(self, user_prompt: str, api_key: str) -> str:
    """Send *user_prompt* to the OpenAI chat-completions API."""
    import httpx

    # When OpenAI serves as a fallback, self.config.model names some other
    # provider's model, so substitute a known OpenAI model name.
    if self.config.provider == LLMProvider.OPENAI:
        model_name = self.config.model
    else:
        model_name = "gpt-4o"
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    # OpenAI has no dedicated "system" field; the system prompt rides along
    # as the first chat message.
    body = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": GLAM_NER_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": self.config.temperature,
        "max_tokens": self.config.max_tokens,
    }
    async with httpx.AsyncClient(timeout=self.config.timeout) as client:
        resp = await client.post(
            "https://api.openai.com/v1/chat/completions",
            headers=request_headers,
            json=body,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
# =========================================================================
# IMAGE ANALYSIS METHODS (Z.AI GLM-4.5V Vision API)
# =========================================================================
async def _analyze_image(
    self,
    image_url: Optional[str] = None,
    image_base64: Optional[str] = None,
    image_path: Optional[str] = None,
    alt_text: Optional[str] = None,
    context: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Analyze an image using Z.AI GLM-4.5V vision model.

    Extracts visual descriptions, entities, OCR text, and heritage relevance.
    Exactly one image source is used, preferred in this order: base64 data,
    local file path, absolute URL. Relative URLs are rejected.

    Args:
        image_url: URL of the image (absolute or relative)
        image_base64: Base64-encoded image data
        image_path: Local file path to image
        alt_text: HTML alt text for context
        context: Surrounding text context from the page

    Returns:
        Dict with analysis results on success; on any failure (bad input,
        rate limiting, HTTP/transport errors) a dict containing an "error"
        key instead of raising:
        {
            "description": "Natural language description",
            "detected_entities": [{"type": "AGT.PER", "text": "...", "confidence": 0.9}],
            "extracted_text": "OCR text if present",
            "heritage_relevance": "Why this matters for heritage",
            "image_type": "photograph|painting|document|map|artifact|other",
            "era_estimate": "Estimated time period",
            "style": "Photographic/artistic style",
            "analysis_confidence": 0.85
        }
    """
    import httpx
    import base64
    # Prepare image content for API (OpenAI-style "image_url" content part).
    image_content = None
    if image_base64:
        # Already base64 encoded; MIME is unknown here, so JPEG is assumed.
        image_content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
    elif image_path:
        # Read and encode local file
        try:
            path = Path(image_path)
            if path.exists():
                with open(path, 'rb') as f:
                    img_data = base64.b64encode(f.read()).decode('utf-8')
                # Detect MIME type from extension
                ext = path.suffix.lower()
                mime_types = {
                    '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
                    '.png': 'image/png', '.gif': 'image/gif',
                    '.webp': 'image/webp', '.bmp': 'image/bmp'
                }
                mime_type = mime_types.get(ext, 'image/jpeg')
                image_content = {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_data}"}}
        except Exception as e:
            logger.warning(f"Failed to read image file {image_path}: {e}")
            return {"error": f"Failed to read image: {e}"}
    elif image_url:
        # Use URL directly (works for absolute URLs)
        if image_url.startswith('http://') or image_url.startswith('https://'):
            image_content = {"type": "image_url", "image_url": {"url": image_url}}
        else:
            # Relative URL - can't analyze without base URL
            logger.debug(f"Skipping relative URL image: {image_url}")
            return {"error": "Relative URL - cannot analyze without base URL"}
    if not image_content:
        # Covers: no source given, or a local path that did not exist.
        return {"error": "No valid image source provided"}
    # Build prompt for heritage image analysis
    prompt_parts = [
        "Analyze this image from a heritage institution website.",
        "",
        "Provide a JSON response with the following fields:",
        "- description: Detailed description of what the image shows",
        "- detected_entities: Array of entities visible in the image, each with {type, text, confidence}",
        "  - Use GLAM-NER types: AGT.PER (person), WRK.VIS (artwork), THG.ART (artifact), TOP.BLD (building), GRP.ETH (ethnic group), etc.",
        "- extracted_text: Any text visible in the image (OCR)",
        "- heritage_relevance: Why this image is significant for heritage/cultural preservation",
        "- image_type: One of: photograph, painting, document, map, artifact, museum_object, historical_photo, memorial, building, portrait, group_photo, exhibition, other",
        "- era_estimate: Estimated time period of the content (e.g., '1940s', 'World War II', 'medieval', 'contemporary')",
        "- style: Artistic or photographic style",
        "- analysis_confidence: Your confidence in this analysis (0.0-1.0)",
    ]
    if alt_text:
        prompt_parts.extend(["", f"HTML alt text: {alt_text}"])
    if context:
        # Truncate surrounding page text to keep the prompt compact.
        prompt_parts.extend(["", f"Page context: {context[:500]}..."])
    prompt_parts.extend([
        "",
        "Return ONLY valid JSON, no markdown code blocks."
    ])
    prompt = "\n".join(prompt_parts)
    # Call Z.AI GLM-4.5V Vision API
    # Z.AI uses OpenAI-compatible format for vision at a different endpoint
    url = "https://api.z.ai/api/paas/v4/chat/completions"
    headers = {
        "Authorization": f"Bearer {self.config.api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": self.config.vision_model,  # "glm-4.5v"
        "messages": [
            {
                "role": "user",
                "content": [
                    image_content,
                    {"type": "text", "text": prompt}
                ]
            }
        ],
        "max_tokens": self.config.vision_max_tokens,
        "temperature": 0.1,  # low temperature: favor consistent analysis output
    }
    # Retry logic with exponential backoff for vision API
    # NOTE(review): timeout is hardcoded to 60s here rather than using
    # self.config.timeout as the text-API calls do — confirm intentional.
    max_retries = self.config.retry.max_retries
    base_delay = self.config.retry.base_delay
    for attempt in range(max_retries + 1):
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                response = await client.post(url, headers=headers, json=payload)
                # Check for rate limit
                if response.status_code == 429:
                    if attempt < max_retries:
                        delay = base_delay * (2 ** attempt)
                        if self.config.retry.jitter:
                            delay += random.uniform(0, delay * 0.1)
                        delay = min(delay, self.config.retry.max_delay)
                        logger.info(f"Vision API rate limited, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
                        await asyncio.sleep(delay)
                        continue
                    else:
                        return {"error": "Vision API rate limited after max retries"}
                response.raise_for_status()
                data = response.json()
                # Parse response content
                content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
                # Try to parse as JSON
                try:
                    result = json.loads(content)
                    return result
                except json.JSONDecodeError:
                    # If not valid JSON, return the text as description
                    return {
                        "description": content,
                        "detected_entities": [],
                        "analysis_confidence": 0.5,
                        "error": "Response was not valid JSON"
                    }
        except httpx.HTTPStatusError as e:
            # NOTE(review): unlike the 429 path above, this retry applies no
            # jitter and no max_delay cap — confirm whether that asymmetry
            # is intended.
            if e.response.status_code in self.config.retry.retry_on_status and attempt < max_retries:
                delay = base_delay * (2 ** attempt)
                logger.info(f"Vision API error {e.response.status_code}, retrying in {delay:.1f}s")
                await asyncio.sleep(delay)
                continue
            logger.warning(f"Vision API HTTP error: {e.response.status_code}")
            return {"error": f"Vision API error: {e.response.status_code}"}
        except Exception as e:
            # Transport errors, JSON decode of the HTTP body, etc. — give up.
            logger.warning(f"Vision API call failed: {e}")
            return {"error": str(e)}
    return {"error": "Vision API failed after retries"}
async def analyze_images_in_html(
    self,
    html_content: str,
    base_url: Optional[str] = None,
    image_dir: Optional[Path] = None,
) -> List[ImageClaim]:
    """
    Extract and analyze all images from an HTML document.

    Filters out data URIs and images smaller than the configured minimum,
    caps the number of images per page, resolves each image to a usable
    source (local file or absolute URL), and runs vision analysis on it
    sequentially. Images whose analysis fails are skipped silently.

    Args:
        html_content: Raw HTML content
        base_url: Base URL for resolving relative image paths
        image_dir: Local directory containing downloaded images

    Returns:
        List of ImageClaim objects with analysis results (possibly empty,
        including when the HTML cannot be parsed).
    """
    from lxml import html as lxml_html
    from urllib.parse import urljoin
    image_claims: List[ImageClaim] = []
    # Parse HTML
    try:
        tree = lxml_html.fromstring(html_content)
    except Exception as e:
        logger.warning(f"Failed to parse HTML for image extraction: {e}")
        return []
    # Find all img elements (only those that actually declare a src)
    img_elements = tree.xpath('//img[@src]')
    # Limit number of images per page
    if len(img_elements) > self.config.max_images_per_page:
        logger.info(f"Limiting image analysis to {self.config.max_images_per_page} of {len(img_elements)} images")
        img_elements = img_elements[:self.config.max_images_per_page]
    for img in img_elements:
        src = img.get('src', '')
        alt = img.get('alt', '')
        title = img.get('title', '')
        width_str = img.get('width', '')
        height_str = img.get('height', '')
        # Skip data URIs and tiny images
        if src.startswith('data:'):
            continue
        # Parse dimensions (attributes may be missing or non-numeric)
        try:
            width = int(width_str) if width_str.isdigit() else None
            height = int(height_str) if height_str.isdigit() else None
            # Skip tiny images (likely icons/decorations); only applies when
            # BOTH dimensions are declared in the markup.
            if width and height:
                if width < self.config.min_image_size or height < self.config.min_image_size:
                    continue
        except (ValueError, TypeError):
            width = height = None
        # Get XPath for provenance
        xpath = tree.getroottree().getpath(img)
        # Get surrounding context text
        parent = img.getparent()
        context = ""
        if parent is not None:
            # Get text from parent and siblings (parent.text plus each
            # child's tail text), truncated to 300 chars.
            context_parts = []
            if parent.text:
                context_parts.append(parent.text.strip())
            for sibling in parent:
                if sibling.tail:
                    context_parts.append(sibling.tail.strip())
            context = " ".join(context_parts)[:300]
        # Resolve image URL
        image_url = None
        image_path = None
        image_base64 = None  # NOTE(review): never populated in this method
        if src.startswith('http://') or src.startswith('https://'):
            image_url = src
        elif base_url:
            image_url = urljoin(base_url, src)
        # Check for local copy (preferred by _analyze_image over the URL)
        if image_dir:
            # Try to find the image in the local directory
            src_filename = Path(src).name
            local_candidates = [
                image_dir / src_filename,
                image_dir / src.lstrip('/'),
                image_dir / src,
            ]
            for candidate in local_candidates:
                if candidate.exists():
                    image_path = str(candidate)
                    break
        # Analyze the image (sequential; one vision call per image)
        analysis = await self._analyze_image(
            image_url=image_url,
            image_path=image_path,
            image_base64=image_base64,
            alt_text=alt or title,
            context=context,
        )
        # Skip if analysis failed
        if analysis.get("error"):
            logger.debug(f"Skipping image {src}: {analysis.get('error')}")
            continue
        # Create ImageClaim
        claim = ImageClaim(
            image_url=image_url or src,
            image_path=image_path,
            alt_text=alt if alt else None,
            title=title if title else None,
            width=width,
            height=height,
            xpath=xpath,
            description=analysis.get("description"),
            detected_entities=analysis.get("detected_entities", []),
            extracted_text=analysis.get("extracted_text"),
            heritage_relevance=analysis.get("heritage_relevance"),
            image_type=analysis.get("image_type"),
            era_estimate=analysis.get("era_estimate"),
            style=analysis.get("style"),
            analysis_model=self.config.vision_model,
            analysis_confidence=analysis.get("analysis_confidence"),
            provenance=Provenance(
                namespace="glam-ner",
                path=xpath,
                timestamp=datetime.now(timezone.utc).isoformat(),
                agent=f"LLMAnnotator/{self.config.vision_model}",
                context_convention="GLAM-NER v1.7.0-unified/vision",
                confidence=analysis.get("analysis_confidence", 0.5),
            ),
        )
        image_claims.append(claim)
    return image_claims
def _parse_response(self, response: str) -> Dict[str, Any]:
"""Parse LLM response JSON."""
# Find JSON in response (may be wrapped in markdown code blocks)
import re
# Try to find JSON block
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', response)
if json_match:
json_str = json_match.group(1)
else:
# Try to parse whole response as JSON
json_str = response
try:
return json.loads(json_str)
except json.JSONDecodeError:
# Return empty structure if parsing fails
return {"entities": [], "layout_regions": [], "claims": []}
def _populate_session(
    self,
    session: AnnotationSession,
    annotations: Dict[str, Any],
    source_url: Optional[str],
) -> None:
    """
    Populate *session* in place with parsed LLM annotations.

    Processes, in order: entities, layout regions, relationships (which can
    link back to entities by their text span), and aggregate claims. All
    claims share one timestamp and carry provenance identifying the
    provider/model that produced them.

    Args:
        session: Session to append claims to (mutated).
        annotations: Parsed LLM response dict (see _parse_response).
        source_url: Accepted for interface symmetry; not used here —
            provenance paths come from per-item XPaths.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # Build entity ID lookup for relationship linking
    entity_id_lookup: Dict[str, str] = {}
    # Process entities
    for entity in annotations.get("entities", []):
        # Parse hypernym - can come from explicit field or be inferred from hyponym
        hypernym_str = entity.get("hypernym", "THG")
        hyponym_str = entity.get("hyponym", "")
        # If hyponym has dot notation (e.g., "GRP.HER"), extract hypernym from it
        if hyponym_str and "." in hyponym_str:
            hypernym_str = hyponym_str.split(".")[0]
        try:
            hypernym = EntityHypernym(hypernym_str)
        except ValueError:
            # Unknown hypernym codes default to THG ("thing")
            hypernym = EntityHypernym.THG
        # Generate unique claim ID
        claim_id = f"entity-{len(session.entity_claims)+1}"
        # Store lookup for relationship linking (by text span).
        # NOTE(review): duplicate spans overwrite earlier entries, so a
        # relationship links to the LAST entity with that text — confirm.
        entity_text = entity.get("text", "")
        if entity_text:
            entity_id_lookup[entity_text] = claim_id
        # Get class_uri from hyponym (or hypernym as fallback)
        # hyponym takes precedence for more specific ontology mapping
        class_uri = None
        if hyponym_str:
            class_uri = get_ontology_class(hyponym_str)
        if not class_uri:
            class_uri = get_ontology_class(hypernym_str)
        # Map LLM response fields to EntityClaim fields
        # EntityClaim inherits text_content from Claim base class
        claim = EntityClaim(
            claim_id=claim_id,
            hypernym=hypernym,
            hyponym=hyponym_str if hyponym_str else "unknown",
            text_content=entity_text,  # LLM returns "text", we use text_content
            class_uri=class_uri,  # Auto-populated from hyponym/hypernym
            isil_id=entity.get("isil_id"),  # If present from LLM
            cidoc_class=entity.get("cidoc_class"),  # Backwards compatibility
            recognition_confidence=entity.get("confidence", 0.5),
            provenance=Provenance(
                namespace="glam-ner",
                path=entity.get("xpath", ""),
                timestamp=timestamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
            ),
        )
        session.entity_claims.append(claim)
    # Process layout regions
    for region in annotations.get("layout_regions", []):
        try:
            region_type = LayoutRegion(region.get("region", "PAR"))
        except ValueError:
            # Unknown region codes default to paragraph
            region_type = LayoutRegion.PAR
        try:
            semantic_role = SemanticRole(region.get("semantic_role", "PRIM"))
        except ValueError:
            semantic_role = SemanticRole.PRIM
        # LayoutClaim uses text_content from base Claim class
        claim = LayoutClaim(
            claim_id=f"layout-{len(session.layout_claims)+1}",
            region=region_type,
            semantic_role=semantic_role,
            xpath=region.get("xpath", ""),
            text_content=region.get("text_preview", "")[:200],  # Use text_content
            provenance=Provenance(
                namespace="glam-ner",
                path=region.get("xpath", ""),
                timestamp=timestamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
            ),
        )
        session.layout_claims.append(claim)
    # Process relationships (after entities so the text-span lookup is complete)
    for rel in annotations.get("relationships", []):
        self._process_relationship(
            session=session,
            rel_data=rel,
            entity_id_lookup=entity_id_lookup,
            timestamp=timestamp,
        )
    # Process claims (aggregate)
    for claim_data in annotations.get("claims", []):
        claim = AggregateClaim(
            claim_id=f"claim-{len(session.aggregate_claims)+1}",
            claim_type=claim_data.get("claim_type", "unknown"),
            claim_value=claim_data.get("claim_value", ""),
            text_content=claim_data.get("claim_value", ""),  # Store value in text_content too
            provenance=Provenance(
                namespace="glam-ner",
                path=claim_data.get("xpath", ""),
                timestamp=timestamp,
                agent=f"{self.config.provider.value}/{self.config.model}",
                context_convention=self.config.context_convention,
                confidence=claim_data.get("confidence", 0.5),
            ),
        )
        session.aggregate_claims.append(claim)
def _process_relationship(
    self,
    session: AnnotationSession,
    rel_data: Dict[str, Any],
    entity_id_lookup: Dict[str, str],
    timestamp: str,
) -> None:
    """
    Convert one LLM relationship dict into a RelationshipClaim on *session*.

    Links subject/object back to earlier entity claims by exact text span,
    parses optional temporal/spatial scopes and qualifiers, and validates
    domain/range constraints non-strictly (violations are recorded as
    session errors/warnings, never raised).

    Args:
        session: Session to add the claim to (mutated).
        rel_data: One relationship entry from the parsed LLM response.
        entity_id_lookup: Map of entity text span -> entity claim_id.
        timestamp: Shared ISO timestamp for provenance.
    """
    # Parse relationship type
    rel_type = rel_data.get("relationship_type", "REL.CRE")
    rel_hypernym = None
    rel_hyponym = None
    # Extract hypernym and hyponym from relationship type
    if rel_type and "." in rel_type:
        parts = rel_type.split(".")
        if len(parts) >= 2:
            hypernym_str = f"{parts[0]}.{parts[1]}"  # e.g., "REL.CRE"
            try:
                rel_hypernym = RelationshipHypernym(hypernym_str)
            except ValueError:
                # Unknown hypernym: keep None, but still record the full code below
                pass
            rel_hyponym = rel_type  # Full code, e.g., "REL.CRE.AUT"
    # Parse subject
    subject_data = rel_data.get("subject", {})
    subject_text = subject_data.get("text", "")
    subject = RelationshipSubject(
        entity_id=entity_id_lookup.get(subject_text),
        entity_type=subject_data.get("entity_type") or subject_data.get("type"),  # Support both keys
        span_text=subject_text,
        uri=subject_data.get("uri"),
    )
    # Parse object
    object_data = rel_data.get("object", {})
    object_text = object_data.get("text", "")
    obj = RelationshipObject(
        entity_id=entity_id_lookup.get(object_text),
        entity_type=object_data.get("entity_type") or object_data.get("type"),  # Support both keys
        span_text=object_text,
        uri=object_data.get("uri"),
    )
    # Parse predicate (label falls back to the raw relationship type code)
    predicate_data = rel_data.get("predicate", {})
    predicate = RelationshipPredicate(
        uri=predicate_data.get("uri"),
        label=predicate_data.get("label", rel_type),
        direction=predicate_data.get("direction", "FORWARD"),
    )
    # Parse temporal scope (if present)
    temporal_scope = None
    temporal_data = rel_data.get("temporal", {})
    if temporal_data:
        temporal_scope = TemporalScope(
            start_date=temporal_data.get("start_date"),
            end_date=temporal_data.get("end_date"),
            temporal_modifier=temporal_data.get("modifier"),
        )
    # Parse spatial scope (if present)
    spatial_scope = None
    spatial_data = rel_data.get("spatial", {})
    if spatial_data:
        spatial_scope = SpatialScope(
            place_id=spatial_data.get("place_id"),
            place_name=spatial_data.get("place_name"),
            geo_uri=spatial_data.get("geo_uri"),
        )
    # Parse qualifiers (if present)
    qualifiers = []
    for qual_data in rel_data.get("qualifiers", []):
        qualifiers.append(RelationshipQualifier(
            qualifier_type=qual_data.get("type", ""),
            qualifier_value=qual_data.get("value", ""),
            qualifier_uri=qual_data.get("uri"),
        ))
    # Create relationship claim
    claim = RelationshipClaim(
        claim_id=f"rel-{len(session.relationship_claims)+1}",
        relationship_hypernym=rel_hypernym,
        relationship_hyponym=rel_hyponym,
        subject=subject,
        predicate=predicate,
        object=obj,
        temporal_scope=temporal_scope,
        spatial_scope=spatial_scope,
        qualifiers=qualifiers,
        negation=rel_data.get("negation", False),
        hypothetical=rel_data.get("hypothetical", False),
        source_claim=rel_data.get("source_claim", False),
        attributed_to=rel_data.get("attributed_to"),
        extraction_confidence=rel_data.get("confidence", 0.5),
        text_content=rel_data.get("text", ""),  # Original text span
        provenance=Provenance(
            namespace="glam-ner",
            path=rel_data.get("xpath", ""),
            timestamp=timestamp,
            agent=f"{self.config.provider.value}/{self.config.model}",
            context_convention=self.config.context_convention,
            confidence=rel_data.get("confidence", 0.5),
        ),
    )
    # Validate domain/range constraints
    if rel_hyponym:
        validation_result = validate_relationship_constraints(
            relationship_type=rel_hyponym,
            subject_type=subject.entity_type,
            object_type=obj.entity_type,
            strict=False,  # Treat violations as warnings, not errors
        )
        # Add any validation warnings to session errors
        if validation_result.warnings:
            for warning in validation_result.warnings:
                session.errors.append(f"[VALIDATION WARNING] {claim.claim_id}: {warning}")
                logger.warning(f"Relationship validation: {warning}")
        if validation_result.errors:
            for error in validation_result.errors:
                session.errors.append(f"[VALIDATION ERROR] {claim.claim_id}: {error}")
                logger.error(f"Relationship validation: {error}")
    # The claim is added even if validation reported problems.
    session.add_relationship_claim(claim)
async def annotate_batch(
    self,
    documents: List[Union[HTMLDocument, str, Path]],
    source_urls: Optional[List[str]] = None,
    concurrency: int = 3,
) -> List[AnnotationSession]:
    """
    Annotate multiple documents concurrently.

    Args:
        documents: List of documents to annotate
        source_urls: Optional list of source URLs. If shorter than
            ``documents``, missing entries are treated as None (previously,
            excess documents were silently dropped by zip truncation).
        concurrency: Maximum concurrent requests

    Returns:
        List of AnnotationSessions, one per document, in input order
    """
    urls: List[Optional[str]] = list(source_urls) if source_urls else []
    # Pad so every document gets a URL slot; a bare zip() would otherwise
    # silently truncate to the shorter list and skip documents.
    if len(urls) < len(documents):
        urls.extend([None] * (len(documents) - len(urls)))
    semaphore = asyncio.Semaphore(concurrency)

    async def annotate_with_semaphore(doc: Union[HTMLDocument, str, Path], url: Optional[str]) -> AnnotationSession:
        # Bound the number of in-flight LLM calls to `concurrency`.
        async with semaphore:
            return await self.annotate(doc, url)

    tasks = [
        annotate_with_semaphore(doc, url)
        for doc, url in zip(documents, urls)
    ]
    return await asyncio.gather(*tasks)
async def annotate_with_schema(
    self,
    document: Union[HTMLDocument, str, Path],
    schema: Optional[GLAMSchema] = None,
    source_url: Optional[str] = None,
    validate_output: bool = True,
) -> Tuple[AnnotationSession, Dict[str, Any]]:
    """
    Annotate a document using schema-driven extraction.

    This method uses GLAMSchema to:
    1. Generate targeted extraction prompts
    2. Extract structured fields defined in the schema
    3. Optionally validate output against JSON Schema

    Any failure during the LLM call or parsing is captured as a session
    error rather than raised; the returned structured data is then empty.

    Args:
        document: HTMLDocument, HTML string, or path to HTML file
        schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
        source_url: Optional source URL for provenance
        validate_output: Whether to validate extracted data against schema

    Returns:
        Tuple of (AnnotationSession, structured_data dict)

    Raises:
        TypeError: If *document* is not a supported type.

    Example:
        >>> schema = (
        ...     GLAMSchema("custom")
        ...     .entities("GRP", "TOP")
        ...     .structure()
        ...     .field("name::str::Institution name")  # GLiNER2 syntax
        ...     .field("type::[MUSEUM|ARCHIVE]::str::Type")
        ...     .build()
        ... )
        >>> session, data = await annotator.annotate_with_schema(doc, schema)
        >>> print(data["structured"]["name"])
    """
    # Use default schema if not provided
    if schema is None:
        schema = heritage_custodian_schema()
    # Load document: accept a Path, a raw-HTML/file-path string, or an
    # already-parsed HTMLDocument.
    html_content: str
    if isinstance(document, Path):
        with open(document, 'r', encoding='utf-8') as f:
            html_content = f.read()
        source_url = source_url or str(document)
    elif isinstance(document, str):
        # Check if it's a file path (short string, no HTML tags)
        is_file_path = len(document) < 500 and not document.strip().startswith('<')
        if is_file_path:
            try:
                path = Path(document)
                if path.exists():
                    with open(path, 'r', encoding='utf-8') as f:
                        html_content = f.read()
                    source_url = source_url or document
                else:
                    # Looks like a path but doesn't exist: treat as literal HTML
                    html_content = document
            except OSError:
                # Path too long or invalid
                html_content = document
        else:
            html_content = document
    elif isinstance(document, HTMLDocument):
        html_content = document.raw_html
        source_url = source_url or document.source_url
    else:
        raise TypeError(f"Unsupported document type: {type(document)}")
    # Create session
    session = AnnotationSession(
        session_id=f"schema-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}",
        source_url=source_url or "unknown",
    )
    # Generate schema-aware prompt
    schema_prompt = schema.to_llm_prompt(include_examples=True, output_format="json")
    user_prompt = self._prepare_schema_prompt(html_content, schema_prompt)
    structured_data: Dict[str, Any] = {}
    try:
        # Call LLM with schema-aware prompt
        response = await self._call_llm(user_prompt)
        # Parse response
        annotations = self._parse_response(response)
        # Extract structured data from response
        structured_data = annotations.get("structured", {})
        # Also extract classifications if present (stashed under an
        # underscore-prefixed key to avoid clashing with schema fields)
        if "classifications" in annotations:
            structured_data["_classifications"] = annotations["classifications"]
        # Extract relations if present
        if "relations" in annotations:
            structured_data["_relations"] = annotations["relations"]
        # Validate against JSON Schema if requested
        if validate_output and structured_data:
            validation_errors = self._validate_structured_output(structured_data, schema)
            if validation_errors:
                session.errors.extend(validation_errors)
        # Populate session with entity and claim data
        self._populate_session(session, annotations, source_url)
        # Add structured data to session config
        session.config["structured_data"] = structured_data
        session.config["schema_name"] = schema.name
    except Exception as e:
        # Best-effort: record the failure and return what we have
        session.errors.append(f"Schema-driven annotation failed: {e}")
    session.completed_at = datetime.now(timezone.utc).isoformat()
    return session, structured_data
def _prepare_schema_prompt(self, html_content: str, schema_prompt: str) -> str:
"""Prepare prompt with schema instructions and document content."""
# Truncate if too long
max_chars = 25000 # Leave room for schema prompt
if len(html_content) > max_chars:
html_content = html_content[:max_chars] + "\n... [truncated]"
return f"""{schema_prompt}
---
## Document to Analyze
Extract all information following the schema above from this HTML document:
```html
{html_content}
```
## Instructions
1. Extract ALL entities matching the specified hypernyms
2. Fill in ALL structured fields from the schema
3. Include XPath locations for provenance
4. Use confidence scores appropriately
5. Return ONLY a valid JSON object matching the output format
IMPORTANT: The "structured" field in your response must contain the extracted field values.
"""
def _validate_structured_output(
    self,
    data: Dict[str, Any],
    schema: GLAMSchema,
) -> List[str]:
    """
    Validate structured output against schema.

    Two passes (matching the reported error order): first all missing
    required fields, then choice/pattern checks for fields that are present.

    Args:
        data: Extracted structured data
        schema: GLAMSchema used for extraction

    Returns:
        List of validation error messages (empty if valid)
    """
    import re

    problems: List[str] = []
    # Pass 1: every required field must be present.
    problems.extend(
        f"Missing required field: {spec.name}"
        for spec in schema.fields
        if spec.required and spec.name not in data
    )
    # Pass 2: choice and pattern checks for present fields.
    for spec in schema.fields:
        if spec.name not in data:
            continue
        value = data[spec.name]
        if spec.choices and value:
            if spec.dtype == "list":
                invalid_values = [v for v in value if v not in spec.choices]
                if invalid_values:
                    problems.append(
                        f"Invalid values for {spec.name}: {invalid_values}. "
                        f"Valid: {spec.choices}"
                    )
            elif value not in spec.choices:
                problems.append(
                    f"Invalid value for {spec.name}: {value}. "
                    f"Valid: {spec.choices}"
                )
        # Pattern check only makes sense for string values.
        if spec.pattern and value and isinstance(value, str):
            if not re.match(spec.pattern, value):
                problems.append(
                    f"Field {spec.name} does not match pattern {spec.pattern}: {value}"
                )
    return problems
async def extract_structured(
    self,
    document: Union[HTMLDocument, str, Path],
    fields: List[str],
    source_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Quick structured extraction using GLiNER2-style field specs.

    This is a convenience method for simple extractions without
    full annotation session overhead.

    Args:
        document: Document to extract from
        fields: List of GLiNER2-style field specs
                e.g., ["name::str::Institution name",
                       "type::[MUSEUM|ARCHIVE]::str::Type"]
        source_url: Optional source URL

    Returns:
        Dict of extracted field values

    Example:
        >>> data = await annotator.extract_structured(
        ...     html_doc,
        ...     ["name::str::Full name",
        ...      "email::str::Contact email",
        ...      "type::[MUSEUM|ARCHIVE|LIBRARY]::str::Institution type"]
        ... )
        >>> print(data["name"])
    """
    # Assemble an ad-hoc schema from the GLiNER2-style specs.
    builder = GLAMSchema("quick_extraction").structure()
    for raw_spec in fields:
        builder.fields.append(FieldSpec.from_gliner2_syntax(raw_spec))
    quick_schema = builder.build()
    # Delegate to the full schema-driven path, keeping only the data.
    _session, extracted = await self.annotate_with_schema(
        document,
        schema=quick_schema,
        source_url=source_url,
        validate_output=False,  # Skip validation for quick extraction
    )
    return extracted
# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================
def create_llm_annotator(
    provider: str = "zai",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    enable_fallback: bool = True,
    max_retries: int = 5,
) -> LLMAnnotator:
    """
    Create an LLM annotator with the specified provider.

    Args:
        provider: "zai", "anthropic", or "openai"
        model: Optional model name (uses provider default if not specified)
        api_key: Optional API key (uses environment variable if not specified)
        enable_fallback: Enable automatic fallback to other providers on failure
        max_retries: Maximum retry attempts per provider

    Returns:
        Configured LLMAnnotator instance
    """
    selected = LLMProvider(provider)
    default_models = {
        LLMProvider.ZAI: "glm-4.6",
        LLMProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
        LLMProvider.OPENAI: "gpt-4o",
    }
    # Every other provider becomes a fallback candidate (in fixed order).
    if enable_fallback:
        fallbacks = [
            p for p in [LLMProvider.ZAI, LLMProvider.ANTHROPIC, LLMProvider.OPENAI]
            if p != selected
        ]
    else:
        fallbacks = []
    annotator_config = LLMAnnotatorConfig(
        provider=selected,
        model=model or default_models[selected],
        api_key=api_key,
        retry=RetryConfig(max_retries=max_retries),
        fallback_providers=fallbacks,
    )
    return LLMAnnotator(annotator_config)
async def annotate_html_file(
    file_path: Union[str, Path],
    provider: str = "zai",
    model: Optional[str] = None,
) -> AnnotationSession:
    """
    Annotate an HTML file using LLM.

    Convenience wrapper: builds a default annotator and runs it on the file.

    Args:
        file_path: Path to HTML file
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        AnnotationSession with extracted claims
    """
    worker = create_llm_annotator(provider=provider, model=model)
    return await worker.annotate(file_path)
async def annotate_with_schema(
    file_path: Union[str, Path],
    schema: Optional[GLAMSchema] = None,
    provider: str = "zai",
    model: Optional[str] = None,
) -> Tuple[AnnotationSession, Dict[str, Any]]:
    """
    Annotate an HTML file using schema-driven extraction.

    Convenience wrapper around LLMAnnotator.annotate_with_schema.

    Args:
        file_path: Path to HTML file
        schema: GLAMSchema for extraction (defaults to heritage_custodian_schema)
        provider: LLM provider ("zai", "anthropic", "openai")
        model: Optional model name

    Returns:
        Tuple of (AnnotationSession, structured_data dict)

    Example:
        >>> schema = (
        ...     GLAMSchema("museum")
        ...     .entities("GRP", "TOP")
        ...     .structure()
        ...     .field("name::str::Museum name")
        ...     .field("city::str::City location")
        ...     .build()
        ... )
        >>> session, data = await annotate_with_schema("museum.html", schema)
        >>> print(data["name"])
    """
    worker = create_llm_annotator(provider=provider, model=model)
    return await worker.annotate_with_schema(file_path, schema=schema)
# Public API of this module (consumed by `from ... import *` and docs tooling).
__all__ = [
    "LLMProvider",
    "LLMAnnotatorConfig",
    "RetryConfig",
    "LLMAnnotator",
    "GLAM_NER_SYSTEM_PROMPT",
    "create_llm_annotator",
    "annotate_html_file",
    "annotate_with_schema",
]