glam/archive/deprecated_heuristic_annotators/entity_annotator.py
2025-12-05 15:30:23 +01:00

368 lines
12 KiB
Python

"""
Entity Annotator for GLAM documents.
Recognizes entities using the 10 hypernyms from GLAM-NER v1.7.0:
- AGT (Agent): Humans, animals, AI, fictional characters
- GRP (Group): Organizations, collectives
- TOP (Toponym): Place names, nominal references
- GEO (Geometry): Coordinates, shapes
- TMP (Temporal): Dates, durations, periods
- APP (Appellation): Identifiers, codes
- ROL (Role): Titles, positions, honorifics
- WRK (Work): Textual references (FRBR)
- QTY (Quantity): Measurements, counts
- THG (Thing): Physical objects, artifacts
"""
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from .base import (
BaseAnnotator,
AnnotationSession,
EntityClaim,
EntityHypernym,
Provenance,
)
from .html_parser import HTMLDocument, HTMLElement
# =============================================================================
# ENTITY PATTERNS
# =============================================================================
@dataclass
class EntityPattern:
    """
    Declarative regex rule describing how to recognise one kind of entity.

    Attributes:
        hypernym: Top-level GLAM-NER category of the entity (e.g. AGT, GRP).
        hyponym: Dotted sub-type code under the hypernym (e.g. "AGT.PER").
        pattern: Regular-expression source string; compiled by the annotator.
        cidoc_class: Optional CIDOC-CRM class label (e.g. "crm:E21_Person").
        tei_element: Optional TEI element label (e.g. "tei:persName").
        priority: Ordering weight; the annotator tries higher values first.
    """

    # --- required fields -------------------------------------------------
    hypernym: EntityHypernym
    hyponym: str
    pattern: str

    # --- optional ontology mappings --------------------------------------
    cidoc_class: Optional[str] = None
    tei_element: Optional[str] = None

    # --- matching order (defaults to the lowest weight) -------------------
    priority: int = 0
# Common patterns for the heritage domain.
#
# How EntityAnnotator consumes these entries:
#   * patterns are tried in descending ``priority`` order;
#   * patterns with ``priority < 10`` are compiled with ``re.IGNORECASE``,
#     patterns with ``priority >= 10`` are compiled case-sensitively.
ENTITY_PATTERNS: List[EntityPattern] = [
    # AGT - Agents (persons): an honorific followed by two or more
    # capitalised words.
    EntityPattern(
        hypernym=EntityHypernym.AGT,
        hyponym="AGT.PER",
        pattern=r"\b(?:Dr\.?|Prof\.?|Mr\.?|Mrs\.?|Ms\.?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b",
        cidoc_class="crm:E21_Person",
        tei_element="tei:persName",
        priority=10,
    ),
    # GRP - Organizations: an institution keyword followed by a capitalised
    # name.
    # NOTE: the trailing character class contains ``\s``, so the match is
    # greedy across whitespace and keeps extending over following words
    # (including lowercase ones) until a non-letter, non-space character.
    EntityPattern(
        hypernym=EntityHypernym.GRP,
        hyponym="GRP.ORG",
        pattern=r"\b(?:Stichting|Vereniging|Foundation|Association|Society|Institute|Museum|Library|Archive|Archief|Bibliotheek|Galerie|Gallery)\s+[A-Z][a-zA-Z\s]+\b",
        cidoc_class="crm:E74_Group",
        tei_element="tei:orgName",
        priority=10,
    ),
    # TOP - Toponyms (place names): fixed list of Dutch city names.
    EntityPattern(
        hypernym=EntityHypernym.TOP,
        hyponym="TOP.PPL",
        pattern=r"\b(?:Amsterdam|Rotterdam|Den Haag|Utrecht|Eindhoven|Groningen|Tilburg|Almere|Breda|Nijmegen|Haarlem|Arnhem|Zaanstad|Amersfoort|Apeldoorn|Enschede|Maastricht|Leiden|Dordrecht|Zoetermeer)\b",
        cidoc_class="crm:E53_Place",
        tei_element="tei:placeName",
        priority=5,
    ),
    # TMP - Temporal expressions
    EntityPattern(
        hypernym=EntityHypernym.TMP,
        hyponym="TMP.DAB",  # Datable (absolute)
        pattern=r"\b(?:19|20)\d{2}\b",  # Years 1900-2099
        cidoc_class="crm:E52_Time-Span",
        tei_element="tei:date",
        priority=5,
    ),
    EntityPattern(
        hypernym=EntityHypernym.TMP,
        hyponym="TMP.DAB",
        pattern=r"\b\d{1,2}[-/]\d{1,2}[-/](?:19|20)\d{2}\b",  # Full dates
        cidoc_class="crm:E52_Time-Span",
        tei_element="tei:date",
        priority=8,
    ),
    # APP - Appellations (identifiers)
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.ISIL",
        pattern=r"\b[A-Z]{2}-[A-Za-z0-9]+\b",  # ISIL codes
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:idno",
        priority=10,
    ),
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.WIKIDATA",
        pattern=r"\bQ\d+\b",  # Wikidata IDs
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:idno",
        priority=10,
    ),
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.URL",
        pattern=r"https?://[^\s<>\"']+",  # URLs
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:ref",
        priority=5,
    ),
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.EMAIL",
        pattern=r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b",  # Emails
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:email",
        priority=8,
    ),
    # Dutch phone numbers, international (+31) or national (leading 0).
    # The word boundary is applied to the "0" branch only: ``\b`` directly
    # before a literal "+" succeeds only when the previous character is a
    # word character, so a leading ``\b`` would reject "+31 ..." at the
    # start of the text or after whitespace/punctuation.
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.PHONE",
        pattern=r"(?:\+31|\b0)[\s-]?\d{1,3}[\s-]?\d{3,4}[\s-]?\d{3,4}\b",  # Dutch phones
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:phone",
        priority=7,
    ),
    # ROL - Roles (titles, positions): Dutch and English occupation words.
    EntityPattern(
        hypernym=EntityHypernym.ROL,
        hyponym="ROL.OCU",
        pattern=r"\b(?:directeur|director|curator|conservator|archivist|archivaris|bibliothecaris|librarian|manager|voorzitter|chairman|secretaris|secretary|penningmeester|treasurer)\b",
        cidoc_class="crm:E55_Type",
        tei_element="tei:roleName",
        priority=6,
    ),
    # QTY - Quantities: a number followed by a countable collection noun.
    EntityPattern(
        hypernym=EntityHypernym.QTY,
        hyponym="QTY.CNT",
        pattern=r"\b\d+(?:\.\d+)?\s*(?:items?|objects?|documents?|photos?|pieces?|stuks?|objecten|documenten)\b",
        cidoc_class="crm:E54_Dimension",
        tei_element="tei:measure",
        priority=6,
    ),
]
class EntityAnnotator(BaseAnnotator):
    """
    Entity annotator for heritage documents.

    Identifies entities following the GLAM-NER v1.7.0 taxonomy by running a
    prioritised list of regular-expression patterns over the input text.
    A ``use_llm`` flag is accepted and stored for optional LLM-based
    extraction, but no method of this class reads it.
    """

    # Confidence attached to every pattern-based match (provenance and claim).
    _PATTERN_CONFIDENCE = 0.8

    # Patterns at or above this priority are compiled case-sensitively;
    # lower-priority patterns are compiled with re.IGNORECASE.
    _CASE_SENSITIVE_PRIORITY = 10

    def __init__(
        self,
        patterns: Optional[List[EntityPattern]] = None,
        use_llm: bool = False,
        model_id: Optional[str] = None,
    ):
        """
        Initialize entity annotator.

        Args:
            patterns: Custom entity patterns. ``None`` or an empty list
                falls back to ENTITY_PATTERNS.
            use_llm: Use LLM for enhanced recognition (stored only).
            model_id: LLM model identifier (e.g., "glm-4-flash")
        """
        super().__init__(
            agent_name="EntityAnnotator",
            agent_version="1.0.0",
            model_id=model_id,
        )
        # Keep a private copy: add_pattern() appends to this list, and it
        # must not mutate the caller's list or the module-level
        # ENTITY_PATTERNS shared by every other annotator instance.
        self.patterns = list(patterns or ENTITY_PATTERNS)
        self.use_llm = use_llm
        # (pattern, compiled regex) pairs, highest priority first.  sorted()
        # is stable, so equal-priority patterns keep their list order.
        self._compiled_patterns = [
            (p, self._compile(p))
            for p in sorted(self.patterns, key=lambda x: -x.priority)
        ]

    @staticmethod
    def _compile(pattern: EntityPattern) -> "re.Pattern":
        """Compile a pattern's regex; case-insensitive when priority < 10."""
        case_sensitive = pattern.priority >= EntityAnnotator._CASE_SENSITIVE_PRIORITY
        return re.compile(pattern.pattern, 0 if case_sensitive else re.IGNORECASE)

    def annotate(
        self,
        document: Any,
        session: Optional[AnnotationSession] = None,
    ) -> AnnotationSession:
        """
        Annotate entities in a document.

        Args:
            document: HTMLDocument or text string
            session: Existing session to add claims to; when omitted a new
                one is created via ``create_session``.

        Returns:
            AnnotationSession with entity claims

        Raises:
            ValueError: If ``document`` is neither an HTMLDocument nor a str.
        """
        if session is None:
            source_url = None
            source_file = None
            # Only HTML documents carry source metadata; plain text does not.
            if isinstance(document, HTMLDocument):
                source_url = document.source_url
                source_file = document.source_file
            session = self.create_session(
                source_url=source_url,
                source_file=source_file,
            )

        # Dispatch on the document type.
        if isinstance(document, HTMLDocument):
            self._annotate_html_document(document, session)
        elif isinstance(document, str):
            self._annotate_text(document, session)
        else:
            raise ValueError(f"Unsupported document type: {type(document)}")
        return session

    def _annotate_html_document(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ) -> None:
        """Annotate entities in HTML document with XPath provenance."""
        for element in document.elements:
            # Skip elements that contain only whitespace.
            if not element.text_content.strip():
                continue

            # Find entities in this element
            entities = self._find_entities_in_text(
                element.text_content,
                element.xpath,
                document.source_url,
                document.source_file,
            )
            for claim in entities:
                # Matches are found relative to the element's own text;
                # shift them by the element's start offset when it is known.
                # NOTE(review): presumably start_offset is the element's
                # position within the whole document text - confirm.
                if element.start_offset is not None and claim.start_offset is not None:
                    claim.start_offset += element.start_offset
                    claim.end_offset += element.start_offset
                session.add_entity_claim(claim)

    def _annotate_text(
        self,
        text: str,
        session: AnnotationSession,
        xpath: str = "/text()[1]",
    ) -> None:
        """Annotate entities in plain text, recording ``xpath`` as location."""
        entities = self._find_entities_in_text(
            text,
            xpath,
            session.source_url,
            session.source_file,
        )
        for claim in entities:
            session.add_entity_claim(claim)

    def _find_entities_in_text(
        self,
        text: str,
        xpath: str,
        source_url: Optional[str] = None,
        source_file: Optional[str] = None,
    ) -> List[EntityClaim]:
        """
        Find entities in text using patterns.

        Patterns are applied in descending priority order.  When two patterns
        match exactly the same (start, end) span, only the first (highest
        priority) one produces a claim.  Nested or partially overlapping
        matches are all kept.

        Args:
            text: Text to search
            xpath: XPath location of text
            source_url: Source URL for provenance
            source_file: Source file for provenance

        Returns:
            List of EntityClaim objects, grouped by pattern in priority
            order; offsets are relative to ``text``.
        """
        claims: List[EntityClaim] = []
        # Exact spans already claimed.  This de-duplicates identical spans
        # only; it is NOT an overlap check.
        seen_spans = set()

        for pattern, compiled in self._compiled_patterns:
            for match in compiled.finditer(text):
                span_key = match.span()
                # Skip if a higher-priority pattern already claimed this
                # exact span.
                if span_key in seen_spans:
                    continue
                seen_spans.add(span_key)

                start, end = span_key
                matched_text = match.group(0)

                # Create provenance
                provenance = self.create_provenance(
                    namespace=self._get_namespace(pattern),
                    path=xpath,
                    confidence=self._PATTERN_CONFIDENCE,
                    source_url=source_url,
                    source_file=source_file,
                )

                # Create claim
                claims.append(
                    EntityClaim(
                        hypernym=pattern.hypernym,
                        hyponym=pattern.hyponym,
                        claim_value=matched_text,
                        text_content=matched_text,
                        start_offset=start,
                        end_offset=end,
                        provenance=provenance,
                        cidoc_class=pattern.cidoc_class,
                        tei_element=pattern.tei_element,
                        recognition_confidence=self._PATTERN_CONFIDENCE,
                    )
                )
        return claims

    def _get_namespace(self, pattern: EntityPattern) -> str:
        """
        Get ontology namespace prefix for pattern.

        Returns "crm" when the pattern has a CIDOC class, otherwise "tei"
        when it has a TEI element, otherwise "glam".
        """
        if pattern.cidoc_class:
            return "crm"
        elif pattern.tei_element:
            return "tei"
        return "glam"

    def add_pattern(self, pattern: EntityPattern) -> None:
        """
        Add a new entity pattern to this annotator.

        The pattern is compiled with the same case-sensitivity rule used at
        construction time and the compiled list is re-sorted by priority.
        """
        self.patterns.append(pattern)
        self._compiled_patterns.append((pattern, self._compile(pattern)))
        # Re-sort by priority (stable: the new pattern lands after existing
        # patterns of equal priority).
        self._compiled_patterns.sort(key=lambda x: -x[0].priority)
def create_heritage_entity_annotator() -> EntityAnnotator:
    """
    Create an entity annotator optimized for heritage documents.

    The annotator is built from the module-level ENTITY_PATTERNS, which
    include patterns for:
    - Heritage institution names (museums, archives, libraries)
    - Dutch place names
    - ISIL codes
    - Wikidata IDs
    - Common heritage roles (curator, archivist, etc.)

    Returns:
        A pattern-only EntityAnnotator (``use_llm=False``).
    """
    return EntityAnnotator(
        # Pass a copy: the annotator stores the list it is given and
        # add_pattern() appends to it, which must not alter the shared
        # module-level ENTITY_PATTERNS.
        patterns=list(ENTITY_PATTERNS),
        use_llm=False,
    )