368 lines
12 KiB
Python
368 lines
12 KiB
Python
"""
Entity Annotator for GLAM documents.

Recognizes entities using the 10 hypernyms from GLAM-NER v1.7.0:
- AGT (Agent): Humans, animals, AI, fictional characters
- GRP (Group): Organizations, collectives
- TOP (Toponym): Place names, nominal references
- GEO (Geometry): Coordinates, shapes
- TMP (Temporal): Dates, durations, periods
- APP (Appellation): Identifiers, codes
- ROL (Role): Titles, positions, honorifics
- WRK (Work): Textual references (FRBR)
- QTY (Quantity): Measurements, counts
- THG (Thing): Physical objects, artifacts
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from .base import (
|
|
BaseAnnotator,
|
|
AnnotationSession,
|
|
EntityClaim,
|
|
EntityHypernym,
|
|
Provenance,
|
|
)
|
|
from .html_parser import HTMLDocument, HTMLElement
|
|
|
|
|
|
# =============================================================================
|
|
# ENTITY PATTERNS
|
|
# =============================================================================
|
|
|
|
@dataclass
class EntityPattern:
    """
    One regular-expression rule used to recognize an entity.

    Attributes:
        hypernym: Top-level entity category (an ``EntityHypernym`` member).
        hyponym: Dotted sub-type code, e.g. ``"AGT.PER"``.
        pattern: Regular-expression source string; it is compiled by the
            annotator, not here.
        cidoc_class: Optional CIDOC-CRM class label, e.g. ``"crm:E21_Person"``.
        tei_element: Optional TEI element label, e.g. ``"tei:persName"``.
        priority: Ordering weight. The annotator tries higher values first
            and compiles patterns whose priority is below 10 with
            ``re.IGNORECASE``.
    """

    # Required classification and matching data.
    hypernym: EntityHypernym
    hyponym: str
    pattern: str

    # Optional ontology mappings.
    cidoc_class: Optional[str] = None
    tei_element: Optional[str] = None

    # Ordering weight; defaults to the lowest tier.
    priority: int = 0
|
|
|
|
|
|
# Common patterns for heritage domain
|
|
# Default recognition rules. Each entry pairs a regular expression with its
# GLAM-NER hypernym/hyponym and optional CIDOC-CRM / TEI mappings.
ENTITY_PATTERNS: List[EntityPattern] = [
    # AGT - Agents (persons): an honorific followed by two or more
    # capitalized words.
    EntityPattern(
        hypernym=EntityHypernym.AGT,
        hyponym="AGT.PER",
        pattern=r"\b(?:Dr\.?|Prof\.?|Mr\.?|Mrs\.?|Ms\.?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b",
        cidoc_class="crm:E21_Person",
        tei_element="tei:persName",
        priority=10,
    ),

    # GRP - Organizations: an institution keyword followed by a capitalized
    # name.
    # NOTE(review): the trailing class ``[a-zA-Z\s]+`` is greedy and includes
    # whitespace and lowercase letters, so a match keeps running through the
    # following words until punctuation or a digit - confirm this is intended.
    EntityPattern(
        hypernym=EntityHypernym.GRP,
        hyponym="GRP.ORG",
        pattern=r"\b(?:Stichting|Vereniging|Foundation|Association|Society|Institute|Museum|Library|Archive|Archief|Bibliotheek|Galerie|Gallery)\s+[A-Z][a-zA-Z\s]+\b",
        cidoc_class="crm:E74_Group",
        tei_element="tei:orgName",
        priority=10,
    ),

    # TOP - Toponyms (place names): fixed list of Dutch cities.
    EntityPattern(
        hypernym=EntityHypernym.TOP,
        hyponym="TOP.PPL",
        pattern=r"\b(?:Amsterdam|Rotterdam|Den Haag|Utrecht|Eindhoven|Groningen|Tilburg|Almere|Breda|Nijmegen|Haarlem|Arnhem|Zaanstad|Amersfoort|Apeldoorn|Enschede|Maastricht|Leiden|Dordrecht|Zoetermeer)\b",
        cidoc_class="crm:E53_Place",
        tei_element="tei:placeName",
        priority=5,
    ),

    # TMP - Temporal expressions
    EntityPattern(
        hypernym=EntityHypernym.TMP,
        hyponym="TMP.DAB",  # Datable (absolute)
        pattern=r"\b(?:19|20)\d{2}\b",  # Years 1900-2099
        cidoc_class="crm:E52_Time-Span",
        tei_element="tei:date",
        priority=5,
    ),
    EntityPattern(
        hypernym=EntityHypernym.TMP,
        hyponym="TMP.DAB",
        pattern=r"\b\d{1,2}[-/]\d{1,2}[-/](?:19|20)\d{2}\b",  # Full dates
        cidoc_class="crm:E52_Time-Span",
        tei_element="tei:date",
        priority=8,
    ),

    # APP - Appellations (identifiers)
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.ISIL",
        pattern=r"\b[A-Z]{2}-[A-Za-z0-9]+\b",  # ISIL codes
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:idno",
        priority=10,
    ),
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.WIKIDATA",
        pattern=r"\bQ\d+\b",  # Wikidata IDs
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:idno",
        priority=10,
    ),
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.URL",
        pattern=r"https?://[^\s<>\"']+",  # URLs
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:ref",
        priority=5,
    ),
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.EMAIL",
        pattern=r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b",  # Emails
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:email",
        priority=8,
    ),
    EntityPattern(
        hypernym=EntityHypernym.APP,
        hyponym="APP.PHONE",
        # Dutch phones. The prefix is anchored with ``(?<!\w)`` rather than
        # ``\b``: a word boundary cannot occur between whitespace (or the
        # start of the text) and "+", so ``\b\+31`` never matched numbers
        # written in international form. For the "0" branch the lookbehind
        # is equivalent to the old ``\b``.
        pattern=r"(?<!\w)(?:\+31|0)[\s-]?\d{1,3}[\s-]?\d{3,4}[\s-]?\d{3,4}\b",
        cidoc_class="crm:E42_Identifier",
        tei_element="tei:phone",
        priority=7,
    ),

    # ROL - Roles (titles, positions)
    EntityPattern(
        hypernym=EntityHypernym.ROL,
        hyponym="ROL.OCU",
        pattern=r"\b(?:directeur|director|curator|conservator|archivist|archivaris|bibliothecaris|librarian|manager|voorzitter|chairman|secretaris|secretary|penningmeester|treasurer)\b",
        cidoc_class="crm:E55_Type",
        tei_element="tei:roleName",
        priority=6,
    ),

    # QTY - Quantities: a number followed by a counted-object noun.
    EntityPattern(
        hypernym=EntityHypernym.QTY,
        hyponym="QTY.CNT",
        pattern=r"\b\d+(?:\.\d+)?\s*(?:items?|objects?|documents?|photos?|pieces?|stuks?|objecten|documenten)\b",
        cidoc_class="crm:E54_Dimension",
        tei_element="tei:measure",
        priority=6,
    ),
]
|
|
|
|
|
|
class EntityAnnotator(BaseAnnotator):
    """
    Entity annotator for heritage documents.

    Identifies entities following the GLAM-NER v1.7.0 taxonomy by running a
    prioritized list of regular-expression patterns over plain text or over
    the elements of an ``HTMLDocument``.

    Note: ``use_llm`` is accepted and stored on the instance, but no method
    of this class reads it; recognition here is purely pattern-based.
    """

    # Confidence recorded on every pattern-based claim and its provenance.
    PATTERN_CONFIDENCE = 0.8

    def __init__(
        self,
        patterns: Optional[List[EntityPattern]] = None,
        use_llm: bool = False,
        model_id: Optional[str] = None,
    ):
        """
        Initialize entity annotator.

        Args:
            patterns: Custom entity patterns. ``None`` or an empty list
                falls back to ENTITY_PATTERNS.
            use_llm: Use LLM for enhanced recognition (stored only).
            model_id: LLM model identifier (e.g., "glm-4-flash")
        """
        super().__init__(
            agent_name="EntityAnnotator",
            agent_version="1.0.0",
            model_id=model_id,
        )
        # Take a private copy: add_pattern() appends to this list, and without
        # the copy it would mutate the caller's list or the module-level
        # ENTITY_PATTERNS shared by every other annotator.
        self.patterns = list(patterns or ENTITY_PATTERNS)
        self.use_llm = use_llm

        # Compile once, highest priority first.
        self._compiled_patterns = [
            (p, self._compile_pattern(p))
            for p in sorted(self.patterns, key=lambda x: -x.priority)
        ]

    @staticmethod
    def _compile_pattern(pattern: EntityPattern) -> "re.Pattern[str]":
        """Compile a pattern; priorities below 10 match case-insensitively."""
        flags = re.IGNORECASE if pattern.priority < 10 else 0
        return re.compile(pattern.pattern, flags)

    def annotate(
        self,
        document: Any,
        session: Optional[AnnotationSession] = None,
    ) -> AnnotationSession:
        """
        Annotate entities in a document.

        Args:
            document: HTMLDocument or text string
            session: Existing session to add claims to. When omitted, a new
                session is created, carrying the document's source URL and
                file if the document is an HTMLDocument.

        Returns:
            AnnotationSession with entity claims

        Raises:
            ValueError: If ``document`` is neither an HTMLDocument nor a str.
        """
        if session is None:
            source_url = None
            source_file = None
            if isinstance(document, HTMLDocument):
                source_url = document.source_url
                source_file = document.source_file
            session = self.create_session(
                source_url=source_url,
                source_file=source_file,
            )

        # Dispatch on the document type.
        if isinstance(document, HTMLDocument):
            self._annotate_html_document(document, session)
        elif isinstance(document, str):
            self._annotate_text(document, session)
        else:
            raise ValueError(f"Unsupported document type: {type(document)}")

        return session

    def _annotate_html_document(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ) -> None:
        """Annotate entities in HTML document with XPath provenance."""
        for element in document.elements:
            # Elements with no visible text cannot contain entities.
            if not element.text_content.strip():
                continue

            entities = self._find_entities_in_text(
                element.text_content,
                element.xpath,
                document.source_url,
                document.source_file,
            )

            for claim in entities:
                # Matches are found relative to the element's own text; shift
                # them by the element's start offset when it is known.
                if element.start_offset is not None and claim.start_offset is not None:
                    claim.start_offset += element.start_offset
                    claim.end_offset += element.start_offset

                session.add_entity_claim(claim)

    def _annotate_text(
        self,
        text: str,
        session: AnnotationSession,
        xpath: str = "/text()[1]",
    ) -> None:
        """Annotate entities in plain text, recording ``xpath`` as location."""
        entities = self._find_entities_in_text(
            text,
            xpath,
            session.source_url,
            session.source_file,
        )

        for claim in entities:
            session.add_entity_claim(claim)

    def _find_entities_in_text(
        self,
        text: str,
        xpath: str,
        source_url: Optional[str] = None,
        source_file: Optional[str] = None,
    ) -> List[EntityClaim]:
        """
        Find entities in text using patterns.

        Patterns are tried in descending priority. When two patterns match
        exactly the same span, only the first (higher-priority) one yields a
        claim. Spans that merely overlap or nest are all kept.

        Args:
            text: Text to search
            xpath: XPath location of text
            source_url: Source URL for provenance
            source_file: Source file for provenance

        Returns:
            List of EntityClaim objects with offsets relative to ``text``
        """
        claims: List[EntityClaim] = []
        seen_spans = set()  # Exact (start, end) spans already claimed

        for pattern, compiled in self._compiled_patterns:
            for match in compiled.finditer(text):
                start, end = match.span()

                # A zero-length match carries no entity text.
                if start == end:
                    continue

                # Skip a span already claimed by an earlier pattern.
                # NOTE(review): this only de-duplicates identical spans; if
                # partially overlapping matches should also be suppressed,
                # that needs an explicit interval check here.
                span_key = (start, end)
                if span_key in seen_spans:
                    continue
                seen_spans.add(span_key)

                matched_text = match.group(0)

                provenance = self.create_provenance(
                    namespace=self._get_namespace(pattern),
                    path=xpath,
                    confidence=self.PATTERN_CONFIDENCE,
                    source_url=source_url,
                    source_file=source_file,
                )

                claims.append(
                    EntityClaim(
                        hypernym=pattern.hypernym,
                        hyponym=pattern.hyponym,
                        claim_value=matched_text,
                        text_content=matched_text,
                        start_offset=start,
                        end_offset=end,
                        provenance=provenance,
                        cidoc_class=pattern.cidoc_class,
                        tei_element=pattern.tei_element,
                        recognition_confidence=self.PATTERN_CONFIDENCE,
                    )
                )

        return claims

    def _get_namespace(self, pattern: EntityPattern) -> str:
        """
        Get ontology namespace prefix for pattern.

        Returns "crm" when a CIDOC class is set, otherwise "tei" when a TEI
        element is set, otherwise "glam".
        """
        if pattern.cidoc_class:
            return "crm"
        elif pattern.tei_element:
            return "tei"
        return "glam"

    def add_pattern(self, pattern: EntityPattern) -> None:
        """Add a new entity pattern and keep the compiled list priority-sorted."""
        self.patterns.append(pattern)
        self._compiled_patterns.append((pattern, self._compile_pattern(pattern)))
        # Re-sort by priority (stable, so equal priorities keep insertion order)
        self._compiled_patterns.sort(key=lambda x: -x[0].priority)
|
|
|
|
|
|
def create_heritage_entity_annotator() -> EntityAnnotator:
    """
    Create an entity annotator optimized for heritage documents.

    Includes patterns for:
    - Heritage institution names (museums, archives, libraries)
    - Dutch place names
    - ISIL codes
    - Wikidata IDs
    - Common heritage roles (curator, archivist, etc.)

    Returns:
        An EntityAnnotator using the default ENTITY_PATTERNS with LLM
        support switched off.
    """
    return EntityAnnotator(
        # Hand over a copy: the annotator appends to its pattern list in
        # add_pattern(), which must not alter the module-level defaults.
        patterns=list(ENTITY_PATTERNS),
        use_llm=False,
    )
|