# glam/archive/deprecated_heuristic_annotators/entity_annotator.py

"""
Entity Annotator for GLAM documents.

Recognizes entities using the 10 hypernyms from GLAM-NER v1.7.0:
- AGT (Agent): Humans, animals, AI, fictional characters
- GRP (Group): Organizations, collectives
- TOP (Toponym): Place names, nominal references
- GEO (Geometry): Coordinates, shapes
- TMP (Temporal): Dates, durations, periods
- APP (Appellation): Identifiers, codes
- ROL (Role): Titles, positions, honorifics
- WRK (Work): Textual references (FRBR)
- QTY (Quantity): Measurements, counts
- THG (Thing): Physical objects, artifacts
"""

import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from .base import (
    BaseAnnotator,
    AnnotationSession,
    EntityClaim,
    EntityHypernym,
    Provenance,
)
from .html_parser import HTMLDocument, HTMLElement


# =============================================================================
# ENTITY PATTERNS
# =============================================================================

@dataclass
class EntityPattern:
    """
    Regex-based recognition rule for a single entity type.

    Pairs a regular expression with the taxonomy labels and ontology
    mappings that are copied onto every claim the expression produces.
    """
    # Top-level category (an EntityHypernym member such as AGT or TMP).
    hypernym: EntityHypernym
    # Dotted subtype code under the hypernym, e.g. "AGT.PER" or "APP.ISIL".
    hyponym: str
    # Regular-expression source text; EntityAnnotator compiles it with re.compile.
    pattern: str
    # Optional CIDOC-CRM class label, e.g. "crm:E21_Person".
    cidoc_class: Optional[str] = None
    # Optional TEI element label, e.g. "tei:persName".
    tei_element: Optional[str] = None
    # Higher values are applied first by EntityAnnotator. Patterns with
    # priority < 10 are additionally compiled with re.IGNORECASE there.
    priority: int = 0


# Default pattern table for the heritage domain.
#
# EntityAnnotator sorts these by descending priority before matching and
# compiles every pattern whose priority is below 10 with re.IGNORECASE, so
# only the priority-10 entries below are case-sensitive.
ENTITY_PATTERNS: List[EntityPattern] = [
    # AGT - Agents (persons): an honorific (Dr, Prof, Mr, Mrs, Ms, optional
    # trailing dot) followed by at least two capitalised words.
    EntityPattern(
        hypernym=EntityHypernym.AGT,
        hyponym="AGT.PER",
        pattern=r"\b(?:Dr\.?|Prof\.?|Mr\.?|Mrs\.?|Ms\.?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b",
        cidoc_class="crm:E21_Person",
        tei_element="tei:persName",
        priority=10,
    ),

    # GRP - Organizations: an institution keyword (English or Dutch) followed
    # by a capitalised letter and then a greedy run of letters/whitespace.
    # NOTE(review): the greedy tail can extend across following words,
    # including lowercase ones, up to the last word boundary in the run.
    EntityPattern(
        hypernym=EntityHypernym.GRP,
        hyponym="GRP.ORG",
        pattern=r"\b(?:Stichting|Vereniging|Foundation|Association|Society|Institute|Museum|Library|Archive|Archief|Bibliotheek|Galerie|Gallery)\s+[A-Z][a-zA-Z\s]+\b",
        cidoc_class="crm:E74_Group",
        tei_element="tei:orgName",
        priority=10,
    ),

    # TOP - Toponyms (place names): a fixed list of twenty Dutch city names.
    EntityPattern(
        hypernym=EntityHypernym.TOP,
        hyponym="TOP.PPL",
        pattern=r"\b(?:Amsterdam|Rotterdam|Den Haag|Utrecht|Eindhoven|Groningen|Tilburg|Almere|Breda|Nijmegen|Haarlem|Arnhem|Zaanstad|Amersfoort|Apeldoorn|Enschede|Maastricht|Leiden|Dordrecht|Zoetermeer)\b",
        cidoc_class="crm:E53_Place",
        tei_element="tei:placeName",
        priority=5,
    ),

    # TMP - Temporal expressions
    # Bare four-digit years; priority 5, below the full-date pattern.
    EntityPattern(
        hypernym=EntityHypernym.TMP,
        hyponym="TMP.DAB",  # Datable (absolute)
        pattern=r"\b(?:19|20)\d{2}\b",  # Years 1900-2099
        cidoc_class="crm:E52_Time-Span",
        tei_element="tei:date",
        priority=5,
    ),
    # Numeric dates with "-" or "/" separators and a 19xx/20xx year. The
    # pattern does not fix day/month order or validate their ranges.
    EntityPattern(
        hypernym=EntityHypernym.TMP,
        hyponym="TMP.DAB",
        pattern=r"\b\d{1,2}[-/]\d{1,2}[-/](?:19|20)\d{2}\b",  # Full dates
        cidoc_class="crm:E52_Time-Span",
        tei_element="tei:date",
        priority=8,
    ),

    # APP - Appellations (identifiers)
    # Two uppercase letters, a hyphen, then alphanumerics.
    # NOTE(review): this shape also matches non-ISIL tokens of the same form.
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.ISIL",
        pattern=r"\b[A-Z]{2}-[A-Za-z0-9]+\b",  # ISIL codes
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:idno",
        priority=10,
    ),
    # Capital "Q" followed by digits (case-sensitive because priority is 10).
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.WIKIDATA",
        pattern=r"\bQ\d+\b",  # Wikidata IDs
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:idno",
        priority=10,
    ),
    # http/https URLs, terminated by whitespace, angle brackets, or quotes.
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.URL",
        pattern=r"https?://[^\s<>\"']+",  # URLs
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:ref",
        priority=5,
    ),
    # local-part@domain.tld with a TLD of at least two letters.
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.EMAIL",
        pattern=r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b",  # Emails
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:email",
        priority=8,
    ),
    # "+31" or a leading "0", then three digit groups optionally separated by
    # a space or hyphen.
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.PHONE",
        pattern=r"\b(?:\+31|0)[\s-]?\d{1,3}[\s-]?\d{3,4}[\s-]?\d{3,4}\b",  # Dutch phones
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:phone",
        priority=7,
    ),

    # ROL - Roles (titles, positions): fixed list of Dutch and English
    # occupational titles, matched case-insensitively (priority < 10).
    EntityPattern(
        hypernym=EntityHypernym.ROL,
        hyponym="ROL.OCU",
        pattern=r"\b(?:directeur|director|curator|conservator|archivist|archivaris|bibliothecaris|librarian|manager|voorzitter|chairman|secretaris|secretary|penningmeester|treasurer)\b",
        cidoc_class="crm:E55_Type",
        tei_element="tei:roleName",
        priority=6,
    ),

    # QTY - Quantities: a number (optional decimal part) followed by an
    # English or Dutch unit word such as "items", "photos" or "stuks".
    EntityPattern(
        hypernym=EntityHypernym.QTY,
        hyponym="QTY.CNT",
        pattern=r"\b\d+(?:\.\d+)?\s*(?:items?|objects?|documents?|photos?|pieces?|stuks?|objecten|documenten)\b",
        cidoc_class="crm:E54_Dimension",
        tei_element="tei:measure",
        priority=6,
    ),
]


class EntityAnnotator(BaseAnnotator):
    """
    Entity annotator for heritage documents.

    Applies a prioritised list of regular-expression patterns to plain text
    or to the elements of an HTMLDocument and records every match as an
    EntityClaim on an AnnotationSession, following the GLAM-NER v1.7.0
    taxonomy.

    The ``use_llm`` flag is stored on the instance but is not consulted by
    any method of this class; recognition here is purely pattern-based.
    """

    def __init__(
        self,
        patterns: Optional[List[EntityPattern]] = None,
        use_llm: bool = False,
        model_id: Optional[str] = None,
    ):
        """
        Initialize entity annotator.

        Args:
            patterns: Custom entity patterns. ``None`` or an empty list
                falls back to ENTITY_PATTERNS. The list is copied, so later
                calls to add_pattern() do not modify the caller's list.
            use_llm: Use LLM for enhanced recognition (stored only).
            model_id: LLM model identifier (e.g., "glm-4-flash")
        """
        super().__init__(
            agent_name="EntityAnnotator",
            agent_version="1.0.0",
            model_id=model_id,
        )
        # Take a private copy: add_pattern() appends to self.patterns, and
        # without the copy that would mutate the module-level
        # ENTITY_PATTERNS default (or the caller's list) and leak the added
        # pattern into every other annotator sharing it.
        self.patterns = list(patterns or ENTITY_PATTERNS)
        self.use_llm = use_llm

        # Compile once, highest priority first. sorted() is stable, so
        # patterns of equal priority keep their list order.
        self._compiled_patterns = [
            (p, self._compile(p))
            for p in sorted(self.patterns, key=lambda x: -x.priority)
        ]

    @staticmethod
    def _compile(pattern: EntityPattern) -> "re.Pattern[str]":
        """Compile a pattern; priorities below 10 match case-insensitively."""
        flags = re.IGNORECASE if pattern.priority < 10 else 0
        return re.compile(pattern.pattern, flags)

    def annotate(
        self,
        document: Any,
        session: Optional[AnnotationSession] = None,
    ) -> AnnotationSession:
        """
        Annotate entities in a document.

        Args:
            document: HTMLDocument or text string
            session: Existing session to add claims to. When omitted, a new
                session is created; for an HTMLDocument it carries the
                document's source URL and source file.

        Returns:
            AnnotationSession with entity claims

        Raises:
            ValueError: If ``document`` is neither an HTMLDocument nor a str.
        """
        if session is None:
            source_url = None
            source_file = None
            if isinstance(document, HTMLDocument):
                source_url = document.source_url
                source_file = document.source_file
            session = self.create_session(
                source_url=source_url,
                source_file=source_file,
            )

        # Dispatch on the document type.
        if isinstance(document, HTMLDocument):
            self._annotate_html_document(document, session)
        elif isinstance(document, str):
            self._annotate_text(document, session)
        else:
            raise ValueError(f"Unsupported document type: {type(document)}")

        return session

    def _annotate_html_document(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ) -> None:
        """
        Annotate entities in HTML document with XPath provenance.

        Each element with non-blank text is searched separately, and the
        element's XPath is recorded as the provenance path of its claims.
        """
        for element in document.elements:
            if not element.text_content.strip():
                continue

            # Find entities in this element
            entities = self._find_entities_in_text(
                element.text_content,
                element.xpath,
                document.source_url,
                document.source_file,
            )

            for claim in entities:
                # Match offsets are relative to the element's text. Shift
                # them by the element's own start offset (presumably its
                # position within the document) when that offset is known.
                if element.start_offset is not None and claim.start_offset is not None:
                    claim.start_offset += element.start_offset
                    claim.end_offset += element.start_offset

                session.add_entity_claim(claim)

    def _annotate_text(
        self,
        text: str,
        session: AnnotationSession,
        xpath: str = "/text()[1]",
    ) -> None:
        """
        Annotate entities in plain text.

        Args:
            text: Text to search
            session: Session that receives the claims; its source URL and
                source file are used for provenance.
            xpath: Provenance path recorded on each claim
        """
        entities = self._find_entities_in_text(
            text,
            xpath,
            session.source_url,
            session.source_file,
        )

        for claim in entities:
            session.add_entity_claim(claim)

    def _find_entities_in_text(
        self,
        text: str,
        xpath: str,
        source_url: Optional[str] = None,
        source_file: Optional[str] = None,
    ) -> List[EntityClaim]:
        """
        Find entities in text using patterns.

        Patterns are applied in descending priority order. A match is
        dropped only when an earlier pattern already claimed exactly the
        same (start, end) span; matches that merely overlap an earlier one
        (e.g. a year inside a full date) are still returned.

        Args:
            text: Text to search
            xpath: XPath location of text
            source_url: Source URL for provenance
            source_file: Source file for provenance

        Returns:
            List of EntityClaim objects with offsets relative to ``text``
        """
        claims: List[EntityClaim] = []
        seen_spans = set()  # Exact spans already claimed

        for pattern, compiled in self._compiled_patterns:
            # The namespace depends only on the pattern, not on the match.
            namespace = self._get_namespace(pattern)

            for match in compiled.finditer(text):
                span_key = match.span()

                # Skip exact-duplicate spans; the higher-priority pattern
                # that got there first wins.
                if span_key in seen_spans:
                    continue
                seen_spans.add(span_key)

                start, end = span_key
                matched_text = match.group(0)

                # Create provenance
                provenance = self.create_provenance(
                    namespace=namespace,
                    path=xpath,
                    confidence=0.8,  # Pattern-based confidence
                    source_url=source_url,
                    source_file=source_file,
                )

                # Create claim
                claims.append(
                    EntityClaim(
                        hypernym=pattern.hypernym,
                        hyponym=pattern.hyponym,
                        claim_value=matched_text,
                        text_content=matched_text,
                        start_offset=start,
                        end_offset=end,
                        provenance=provenance,
                        cidoc_class=pattern.cidoc_class,
                        tei_element=pattern.tei_element,
                        recognition_confidence=0.8,
                    )
                )

        return claims

    def _get_namespace(self, pattern: EntityPattern) -> str:
        """
        Get ontology namespace prefix for pattern.

        Returns "crm" when the pattern has a CIDOC class, otherwise "tei"
        when it has a TEI element, otherwise "glam".
        """
        if pattern.cidoc_class:
            return "crm"
        elif pattern.tei_element:
            return "tei"
        return "glam"

    def add_pattern(self, pattern: EntityPattern) -> None:
        """
        Add a new entity pattern to this annotator.

        The pattern is appended to this instance's own pattern list and
        compiled. Among patterns of equal priority it is applied last.
        """
        self.patterns.append(pattern)
        self._compiled_patterns.append((pattern, self._compile(pattern)))
        # Re-sort by priority (stable, so the new pattern stays behind
        # existing patterns of the same priority).
        self._compiled_patterns.sort(key=lambda x: -x[0].priority)


def create_heritage_entity_annotator() -> EntityAnnotator:
    """
    Create an entity annotator optimized for heritage documents.

    Includes patterns for:
    - Heritage institution names (museums, archives, libraries)
    - Dutch place names
    - ISIL codes
    - Wikidata IDs
    - Common heritage roles (curator, archivist, etc.)

    Returns:
        A pattern-only EntityAnnotator (``use_llm=False``) seeded with a
        copy of ENTITY_PATTERNS.
    """
    # Hand over a copy rather than the module-level list itself, so that
    # add_pattern() on the returned annotator cannot modify the shared
    # ENTITY_PATTERNS defaults.
    return EntityAnnotator(
        patterns=list(ENTITY_PATTERNS),
        use_llm=False,
    )