""" Entity Annotator for GLAM documents. Recognizes entities using the 10 hypernyms from GLAM-NER v1.7.0: - AGT (Agent): Humans, animals, AI, fictional characters - GRP (Group): Organizations, collectives - TOP (Toponym): Place names, nominal references - GEO (Geometry): Coordinates, shapes - TMP (Temporal): Dates, durations, periods - APP (Appellation): Identifiers, codes - ROL (Role): Titles, positions, honorifics - WRK (Work): Textual references (FRBR) - QTY (Quantity): Measurements, counts - THG (Thing): Physical objects, artifacts """ import re from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple from .base import ( BaseAnnotator, AnnotationSession, EntityClaim, EntityHypernym, Provenance, ) from .html_parser import HTMLDocument, HTMLElement # ============================================================================= # ENTITY PATTERNS # ============================================================================= @dataclass class EntityPattern: """Pattern for entity recognition.""" hypernym: EntityHypernym hyponym: str pattern: str cidoc_class: Optional[str] = None tei_element: Optional[str] = None priority: int = 0 # Common patterns for heritage domain ENTITY_PATTERNS: List[EntityPattern] = [ # AGT - Agents (persons) EntityPattern( hypernym=EntityHypernym.AGT, hyponym="AGT.PER", pattern=r"\b(?:Dr\.?|Prof\.?|Mr\.?|Mrs\.?|Ms\.?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", cidoc_class="crm:E21_Person", tei_element="tei:persName", priority=10, ), # GRP - Organizations EntityPattern( hypernym=EntityHypernym.GRP, hyponym="GRP.ORG", pattern=r"\b(?:Stichting|Vereniging|Foundation|Association|Society|Institute|Museum|Library|Archive|Archief|Bibliotheek|Galerie|Gallery)\s+[A-Z][a-zA-Z\s]+\b", cidoc_class="crm:E74_Group", tei_element="tei:orgName", priority=10, ), # TOP - Toponyms (place names) EntityPattern( hypernym=EntityHypernym.TOP, hyponym="TOP.PPL", pattern=r"\b(?:Amsterdam|Rotterdam|Den Haag|Utrecht|Eindhoven|Groningen|Tilburg|Almere|Breda|Nijmegen|Haarlem|Arnhem|Zaanstad|Amersfoort|Apeldoorn|Enschede|Maastricht|Leiden|Dordrecht|Zoetermeer)\b", cidoc_class="crm:E53_Place", tei_element="tei:placeName", priority=5, ), # TMP - Temporal expressions EntityPattern( hypernym=EntityHypernym.TMP, hyponym="TMP.DAB", # Datable (absolute) pattern=r"\b(?:19|20)\d{2}\b", # Years 1900-2099 cidoc_class="crm:E52_Time-Span", tei_element="tei:date", priority=5, ), EntityPattern( hypernym=EntityHypernym.TMP, hyponym="TMP.DAB", pattern=r"\b\d{1,2}[-/]\d{1,2}[-/](?:19|20)\d{2}\b", # Full dates cidoc_class="crm:E52_Time-Span", tei_element="tei:date", priority=8, ), # APP - Appellations (identifiers) EntityPattern( hypernym=EntityHypernym.APP, hyponym="APP.ISIL", pattern=r"\b[A-Z]{2}-[A-Za-z0-9]+\b", # ISIL codes cidoc_class="crm:E42_Identifier", tei_element="tei:idno", priority=10, ), EntityPattern( hypernym=EntityHypernym.APP, hyponym="APP.WIKIDATA", pattern=r"\bQ\d+\b", # Wikidata IDs cidoc_class="crm:E42_Identifier", tei_element="tei:idno", priority=10, ), EntityPattern( hypernym=EntityHypernym.APP, hyponym="APP.URL", pattern=r"https?://[^\s<>\"']+", # URLs cidoc_class="crm:E42_Identifier", tei_element="tei:ref", priority=5, ), EntityPattern( hypernym=EntityHypernym.APP, hyponym="APP.EMAIL", pattern=r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", # Emails cidoc_class="crm:E42_Identifier", tei_element="tei:email", priority=8, ), EntityPattern( hypernym=EntityHypernym.APP, hyponym="APP.PHONE", pattern=r"\b(?:\+31|0)[\s-]?\d{1,3}[\s-]?\d{3,4}[\s-]?\d{3,4}\b", # Dutch phones cidoc_class="crm:E42_Identifier", tei_element="tei:phone", priority=7, ), # ROL - Roles (titles, positions) EntityPattern( hypernym=EntityHypernym.ROL, hyponym="ROL.OCU", pattern=r"\b(?:directeur|director|curator|conservator|archivist|archivaris|bibliothecaris|librarian|manager|voorzitter|chairman|secretaris|secretary|penningmeester|treasurer)\b", cidoc_class="crm:E55_Type", tei_element="tei:roleName", priority=6, ), # QTY - Quantities EntityPattern( hypernym=EntityHypernym.QTY, hyponym="QTY.CNT", pattern=r"\b\d+(?:\.\d+)?\s*(?:items?|objects?|documents?|photos?|pieces?|stuks?|objecten|documenten)\b", cidoc_class="crm:E54_Dimension", tei_element="tei:measure", priority=6, ), ] class EntityAnnotator(BaseAnnotator): """ Entity annotator for heritage documents. Uses pattern matching and (optionally) LLM-based extraction to identify entities following GLAM-NER v1.7.0 taxonomy. """ def __init__( self, patterns: Optional[List[EntityPattern]] = None, use_llm: bool = False, model_id: Optional[str] = None, ): """ Initialize entity annotator. Args: patterns: Custom entity patterns (defaults to ENTITY_PATTERNS) use_llm: Use LLM for enhanced recognition model_id: LLM model identifier (e.g., "glm-4-flash") """ super().__init__( agent_name="EntityAnnotator", agent_version="1.0.0", model_id=model_id, ) self.patterns = patterns or ENTITY_PATTERNS self.use_llm = use_llm # Compile patterns self._compiled_patterns = [ (p, re.compile(p.pattern, re.IGNORECASE if p.priority < 10 else 0)) for p in sorted(self.patterns, key=lambda x: -x.priority) ] def annotate( self, document: Any, session: Optional[AnnotationSession] = None, ) -> AnnotationSession: """ Annotate entities in a document. Args: document: HTMLDocument or text string session: Existing session to add claims to Returns: AnnotationSession with entity claims """ if session is None: source_url = None source_file = None if isinstance(document, HTMLDocument): source_url = document.source_url source_file = document.source_file session = self.create_session( source_url=source_url, source_file=source_file, ) # Get text and element mappings if isinstance(document, HTMLDocument): self._annotate_html_document(document, session) elif isinstance(document, str): self._annotate_text(document, session) else: raise ValueError(f"Unsupported document type: {type(document)}") return session def _annotate_html_document( self, document: HTMLDocument, session: AnnotationSession, ): """Annotate entities in HTML document with XPath provenance.""" for element in document.elements: if not element.text_content.strip(): continue # Find entities in this element entities = self._find_entities_in_text( element.text_content, element.xpath, document.source_url, document.source_file, ) for claim in entities: # Adjust offsets to be relative to element if element.start_offset is not None and claim.start_offset is not None: claim.start_offset += element.start_offset claim.end_offset += element.start_offset session.add_entity_claim(claim) def _annotate_text( self, text: str, session: AnnotationSession, xpath: str = "/text()[1]", ): """Annotate entities in plain text.""" entities = self._find_entities_in_text( text, xpath, session.source_url, session.source_file, ) for claim in entities: session.add_entity_claim(claim) def _find_entities_in_text( self, text: str, xpath: str, source_url: Optional[str] = None, source_file: Optional[str] = None, ) -> List[EntityClaim]: """ Find entities in text using patterns. Args: text: Text to search xpath: XPath location of text source_url: Source URL for provenance source_file: Source file for provenance Returns: List of EntityClaim objects """ claims = [] seen_spans = set() # Avoid duplicate matches for pattern, compiled in self._compiled_patterns: for match in compiled.finditer(text): start, end = match.span() # Skip if overlapping with higher priority match span_key = (start, end) if span_key in seen_spans: continue seen_spans.add(span_key) matched_text = match.group(0) # Create provenance provenance = self.create_provenance( namespace=self._get_namespace(pattern), path=xpath, confidence=0.8, # Pattern-based confidence source_url=source_url, source_file=source_file, ) # Create claim claim = EntityClaim( hypernym=pattern.hypernym, hyponym=pattern.hyponym, claim_value=matched_text, text_content=matched_text, start_offset=start, end_offset=end, provenance=provenance, cidoc_class=pattern.cidoc_class, tei_element=pattern.tei_element, recognition_confidence=0.8, ) claims.append(claim) return claims def _get_namespace(self, pattern: EntityPattern) -> str: """Get ontology namespace prefix for pattern.""" if pattern.cidoc_class: return "crm" elif pattern.tei_element: return "tei" return "glam" def add_pattern(self, pattern: EntityPattern): """Add a new entity pattern.""" self.patterns.append(pattern) self._compiled_patterns.append( (pattern, re.compile(pattern.pattern, re.IGNORECASE if pattern.priority < 10 else 0)) ) # Re-sort by priority self._compiled_patterns.sort(key=lambda x: -x[0].priority) def create_heritage_entity_annotator() -> EntityAnnotator: """ Create an entity annotator optimized for heritage documents. Includes patterns for: - Heritage institution names (museums, archives, libraries) - Dutch place names - ISIL codes - Wikidata IDs - Common heritage roles (curator, archivist, etc.) """ return EntityAnnotator( patterns=ENTITY_PATTERNS, use_llm=False, )