- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
384 lines
16 KiB
Python
384 lines
16 KiB
Python
"""
|
|
Data models for GLAM heritage custodian entities.
|
|
|
|
These Pydantic models correspond to the LinkML schema in schemas/heritage_custodian.yaml
|
|
"""
|
|
|
|
from datetime import date, datetime
|
|
from enum import Enum
|
|
from typing import Optional, List
|
|
from pydantic import BaseModel, Field, HttpUrl, AnyUrl, validator
|
|
|
|
# Import GHCID history model for provenance tracking
|
|
from glam_extractor.identifiers.ghcid import GHCIDHistoryEntry
|
|
|
|
|
|
# =============================================================================
# ENUMERATIONS
# =============================================================================
|
|
|
|
class InstitutionType(str, Enum):
    """
    Types of heritage institutions.

    Every member's value is its own name as a string, and because the class
    mixes in ``str`` each member compares equal to that plain string.

    The trailing comment on each member is the single-letter code used for
    GHCID generation:
    G(allery), L(ibrary), A(rchive), M(useum), O(fficial), R(esearch),
    C(orporation), U(ndefined), B(otanical/Zoo), E(ducation), P(ersonal),
    S(ociety).

    MIXED is the one member outside that list; its comment notes that the
    primary type is used when building the GHCID.
    """

    GALLERY = "GALLERY"  # G
    LIBRARY = "LIBRARY"  # L
    ARCHIVE = "ARCHIVE"  # A
    MUSEUM = "MUSEUM"  # M
    OFFICIAL_INSTITUTION = "OFFICIAL_INSTITUTION"  # O
    RESEARCH_CENTER = "RESEARCH_CENTER"  # R
    CORPORATION = "CORPORATION"  # C
    UNDEFINED = "UNDEFINED"  # U
    BOTANICAL_ZOO = "BOTANICAL_ZOO"  # B
    EDUCATION_PROVIDER = "EDUCATION_PROVIDER"  # E
    PERSONAL_COLLECTION = "PERSONAL_COLLECTION"  # P
    COLLECTING_SOCIETY = "COLLECTING_SOCIETY"  # S
    MIXED = "MIXED"  # M (use primary type for GHCID)
|
|
|
|
|
|
class OrganizationStatus(str, Enum):
    """
    Operational status of the organization.

    Every member's value is its own name as a string; the ``str`` mixin makes
    members compare equal to that plain string.
    """

    ACTIVE = "ACTIVE"
    INACTIVE = "INACTIVE"
    MERGED = "MERGED"
    SUSPENDED = "SUSPENDED"
    PLANNED = "PLANNED"
    UNKNOWN = "UNKNOWN"
|
|
|
|
|
|
class DataSource(str, Enum):
    """
    Source of the data (for provenance tracking).

    Every member's value is its own name as a string; the ``str`` mixin makes
    members compare equal to that plain string.
    """

    ISIL_REGISTRY = "ISIL_REGISTRY"
    DUTCH_ORG_CSV = "DUTCH_ORG_CSV"
    CONVERSATION_NLP = "CONVERSATION_NLP"
    WEB_CRAWL = "WEB_CRAWL"
    WIKIDATA = "WIKIDATA"
    MANUAL_ENTRY = "MANUAL_ENTRY"
    INFERRED = "INFERRED"
    CSV_REGISTRY = "CSV_REGISTRY"
|
|
|
|
|
|
class DataTier(str, Enum):
    """
    Data quality/authority tier.

    Members are declared from TIER_1 through TIER_4. Every member's value is
    its own name as a string; the ``str`` mixin makes members compare equal
    to that plain string.
    """

    TIER_1_AUTHORITATIVE = "TIER_1_AUTHORITATIVE"
    TIER_2_VERIFIED = "TIER_2_VERIFIED"
    TIER_3_CROWD_SOURCED = "TIER_3_CROWD_SOURCED"
    TIER_4_INFERRED = "TIER_4_INFERRED"
|
|
|
|
|
|
class MetadataStandard(str, Enum):
    """
    Metadata standards used by institutions.

    Every member's value is its own name as a string; the ``str`` mixin makes
    members compare equal to that plain string.
    """

    DUBLIN_CORE = "DUBLIN_CORE"
    MARC21 = "MARC21"
    EAD = "EAD"
    BIBFRAME = "BIBFRAME"
    LIDO = "LIDO"
    CIDOC_CRM = "CIDOC_CRM"
    SCHEMA_ORG = "SCHEMA_ORG"
    RIC_O = "RIC_O"
    MODS = "MODS"
    PREMIS = "PREMIS"
    SPECTRUM = "SPECTRUM"
    DACS = "DACS"
|
|
|
|
|
|
class DigitalPlatformType(str, Enum):
    """
    Types of digital platforms.

    Every member's value is its own name as a string; the ``str`` mixin makes
    members compare equal to that plain string.
    """

    COLLECTION_MANAGEMENT = "COLLECTION_MANAGEMENT"
    DIGITAL_REPOSITORY = "DIGITAL_REPOSITORY"
    DISCOVERY_PORTAL = "DISCOVERY_PORTAL"
    LINKED_DATA_ENDPOINT = "LINKED_DATA_ENDPOINT"
    API = "API"
    AGGREGATOR = "AGGREGATOR"
    WEBSITE = "WEBSITE"
    GENERIC = "GENERIC"  # General-purpose software (FileMaker, Access, etc.)
|
|
|
|
|
|
class ChangeType(str, Enum):
    """
    Types of organizational change events.

    Every member's value is its own name as a string; the ``str`` mixin makes
    members compare equal to that plain string.
    """

    FOUNDING = "FOUNDING"
    CLOSURE = "CLOSURE"
    MERGER = "MERGER"
    SPLIT = "SPLIT"
    ACQUISITION = "ACQUISITION"
    RELOCATION = "RELOCATION"
    NAME_CHANGE = "NAME_CHANGE"
    TYPE_CHANGE = "TYPE_CHANGE"
    STATUS_CHANGE = "STATUS_CHANGE"
    RESTRUCTURING = "RESTRUCTURING"
    LEGAL_CHANGE = "LEGAL_CHANGE"
    OTHER = "OTHER"
|
|
|
|
|
|
# =============================================================================
# DATA MODELS
# =============================================================================
|
|
|
|
class Provenance(BaseModel):
    """
    Provenance information for data quality tracking.

    ``data_source``, ``data_tier`` and ``extraction_date`` are required; every
    other field defaults to ``None``.
    """

    data_source: DataSource = Field(..., description="Source of this data record")
    data_tier: DataTier = Field(..., description="Data quality tier")
    extraction_date: datetime = Field(
        ..., description="Date the data was extracted or created"
    )
    extraction_method: Optional[str] = Field(
        None, description="Method used to extract data"
    )
    # Bounded to the closed interval [0.0, 1.0] by the ge/le constraints.
    confidence_score: Optional[float] = Field(
        None, ge=0.0, le=1.0, description="Confidence score (0.0-1.0)"
    )
    conversation_id: Optional[str] = Field(
        None, description="UUID of source conversation"
    )
    # Typed as a plain string (not HttpUrl) so non-HTTP schemes are accepted.
    source_url: Optional[str] = Field(
        None,
        description="URI of the source (any URI scheme including file://, http://, https://)",
    )
    verified_date: Optional[datetime] = Field(
        None, description="Date the data was verified"
    )
    verified_by: Optional[str] = Field(
        None, description="Person/system that verified data"
    )

    class Config:
        # Populate enum-typed fields with the enum's value, not the member.
        use_enum_values = True
|
|
|
|
|
|
class Location(BaseModel):
    """
    Physical or virtual location.

    All fields are optional. ``country`` is checked by
    :meth:`validate_country_code`, which enforces a two-character code and
    upper-cases it.
    """

    location_type: Optional[str] = Field(None, description="Type of location")
    street_address: Optional[str] = Field(None, description="Street address")
    city: Optional[str] = Field(None, description="City or town")
    postal_code: Optional[str] = Field(None, description="Postal code")
    region: Optional[str] = Field(None, description="State, province, or region")
    country: Optional[str] = Field(None, description="Country (ISO 3166-1 alpha-2)")
    latitude: Optional[float] = Field(None, description="Latitude coordinate")
    longitude: Optional[float] = Field(None, description="Longitude coordinate")
    geonames_id: Optional[str] = Field(None, description="GeoNames identifier")
    is_primary: Optional[bool] = Field(
        False, description="Is this the primary location?"
    )

    @validator("country")
    def validate_country_code(cls, v: Optional[str]) -> Optional[str]:
        """
        Check the length of a country code and normalise it to upper case.

        Falsy input (``None`` or an empty string) is returned unchanged.

        Raises:
            ValueError: if a non-empty value is not exactly two characters.
        """
        if not v:
            return v
        if len(v) != 2:
            raise ValueError("Country code must be 2 characters (ISO 3166-1 alpha-2)")
        return v.upper()

    class Config:
        use_enum_values = True
|
|
|
|
|
|
class ContactInfo(BaseModel):
    """
    Contact information for the organization.

    All fields are optional free-text strings; no format validation is
    applied to them here.
    """

    email: Optional[str] = Field(None, description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    fax: Optional[str] = Field(None, description="Fax number")
    contact_type: Optional[str] = Field(None, description="Type of contact")

    class Config:
        use_enum_values = True
|
|
|
|
|
|
class Identifier(BaseModel):
    """
    External identifier for the organization.

    ``identifier_scheme`` and ``identifier_value`` are required; the registry
    URL and assignment date are optional.
    """

    identifier_scheme: str = Field(..., description="Identifier scheme name")
    identifier_value: str = Field(..., description="The actual identifier value")
    identifier_url: Optional[HttpUrl] = Field(
        None, description="URL to the identifier registry"
    )
    assigned_date: Optional[date] = Field(
        None, description="Date identifier was assigned"
    )

    class Config:
        use_enum_values = True
|
|
|
|
|
|
class Collection(BaseModel):
    """
    A collection held by the heritage custodian.

    All fields are optional. ``subjects`` defaults to a fresh empty list per
    instance, and ``item_count`` must be non-negative when given.
    """

    collection_id: Optional[str] = Field(
        None, description="Unique identifier for collection"
    )
    collection_name: Optional[str] = Field(None, description="Name of the collection")
    collection_description: Optional[str] = Field(None, description="Description")
    collection_type: Optional[str] = Field(None, description="Type of collection")
    item_count: Optional[int] = Field(None, ge=0, description="Number of items")
    subjects: List[str] = Field(default_factory=list, description="Subject headings")
    # No ordering check between start and end is performed by this model.
    time_period_start: Optional[date] = Field(
        None, description="Start of time period covered"
    )
    time_period_end: Optional[date] = Field(
        None, description="End of time period covered"
    )
    access_rights: Optional[str] = Field(
        None, description="Access rights and restrictions"
    )
    catalog_url: Optional[HttpUrl] = Field(
        None, description="URL to collection catalog"
    )

    class Config:
        use_enum_values = True
|
|
|
|
|
|
class DigitalPlatform(BaseModel):
    """
    Digital platform or system used by the institution.

    All fields are optional; ``implemented_standards`` defaults to a fresh
    empty list per instance.
    """

    platform_name: Optional[str] = Field(None, description="Name of the platform")
    platform_type: Optional[DigitalPlatformType] = Field(
        None, description="Type of platform"
    )
    platform_url: Optional[HttpUrl] = Field(None, description="URL to the platform")
    vendor: Optional[str] = Field(None, description="Software vendor or provider")
    implemented_standards: List[MetadataStandard] = Field(
        default_factory=list,
        description="Standards implemented",
    )
    integration_method: Optional[str] = Field(
        None,
        description="Technical method for data integration (API, OAI-PMH, SPARQL, CSV export, etc.)",
    )

    class Config:
        # Populate enum-typed fields with the enum's value, not the member.
        use_enum_values = True
|
|
|
|
|
|
class Partnership(BaseModel):
    """
    Partnership or network membership.

    All fields are optional. No ordering check between ``start_date`` and
    ``end_date`` is performed by this model.
    """

    partner_name: Optional[str] = Field(
        None, description="Name of partner organization"
    )
    partner_id: Optional[str] = Field(None, description="Identifier for partner")
    partnership_type: Optional[str] = Field(None, description="Type of partnership")
    start_date: Optional[date] = Field(None, description="Start date")
    end_date: Optional[date] = Field(None, description="End date")
    description: Optional[str] = Field(None, description="Partnership description")

    class Config:
        use_enum_values = True
|
|
|
|
|
|
class ChangeEvent(BaseModel):
    """
    A significant organizational change event in an institution's lifecycle.

    Based on TOOI Wijzigingsgebeurtenis and W3C PROV-O Activity patterns.

    ``event_id``, ``change_type`` and ``event_date`` are required. The
    organization references are plain string IDs and are not resolved or
    checked by this model.
    """

    event_id: str = Field(..., description="Unique identifier for this change event")
    change_type: ChangeType = Field(..., description="Type of organizational change")
    event_date: date = Field(..., description="Date when the change event occurred")
    event_description: Optional[str] = Field(
        None, description="Textual description of the change event"
    )
    affected_organization: Optional[str] = Field(
        None, description="Organization ID that was affected"
    )
    resulting_organization: Optional[str] = Field(
        None, description="Organization ID resulting from change"
    )
    related_organizations: List[str] = Field(
        default_factory=list, description="Other organization IDs involved"
    )
    source_documentation: Optional[HttpUrl] = Field(
        None, description="URL to documentation of this event"
    )

    class Config:
        # Populate enum-typed fields with the enum's value, not the member.
        use_enum_values = True
|
|
|
|
|
|
class HeritageCustodian(BaseModel):
    """
    A heritage custodian organization (GLAM institution).

    ``id``, ``name``, ``institution_type`` and ``provenance`` are required.
    Every list-valued field defaults to a fresh empty list per instance and
    every other optional field defaults to ``None``, except
    ``organization_status`` which defaults to ``"UNKNOWN"``.
    """

    id: str = Field(..., description="Unique identifier for this record")
    name: str = Field(..., description="Official name of the organization")
    alternative_names: List[str] = Field(
        default_factory=list, description="Alternative names"
    )
    institution_type: InstitutionType = Field(
        ..., description="Primary type of institution"
    )
    # Default is given as the plain value, not the Enum member: pydantic does
    # not validate defaults, so `use_enum_values` would leave a member here
    # while explicitly supplied statuses are stored as strings.
    organization_status: Optional[OrganizationStatus] = Field(
        OrganizationStatus.UNKNOWN.value,
        description="Operational status",
    )
    description: Optional[str] = Field(None, description="Textual description")

    # Organizational hierarchy
    parent_organization: Optional[str] = Field(
        None, description="Parent organization ID"
    )
    parent_organization_name: Optional[str] = Field(
        None,
        description="Name of parent organization (unresolved reference)",
    )
    sub_organizations: List[str] = Field(
        default_factory=list, description="Sub-organization IDs"
    )

    # Temporal
    founded_date: Optional[date] = Field(None, description="Date founded")
    closed_date: Optional[date] = Field(None, description="Date closed")

    # Web presence
    homepage: Optional[HttpUrl] = Field(None, description="Official website URL")

    # GHCID (Global Heritage Custodian Identifier)
    # Four identifier formats for maximum interoperability and performance:
    ghcid_uuid: Optional[str] = Field(
        None,
        description="UUID v5 persistent identifier (RFC 4122, SHA-1 based) - PRIMARY for interoperability with Europeana, DPLA, IIIF, Wikidata",
    )
    ghcid_uuid_sha256: Optional[str] = Field(
        None,
        description="UUID with SHA-256 (version 8, custom) - SOTA cryptographic strength for future-proofing and security compliance",
    )
    record_id: Optional[str] = Field(
        None,
        description="UUID v7 database primary key (RFC 9562, time-ordered, random) - for database performance optimization and natural chronological ordering",
    )
    ghcid_numeric: Optional[int] = Field(
        None,
        description="Persistent numeric GHCID (SHA256→64bit) - for database primary keys and CSV exports",
    )
    ghcid: Optional[str] = Field(
        None,
        description="Current human-readable GHCID (ISO-based format) - for citations and references",
    )
    ghcid_original: Optional[str] = Field(
        None,
        description="Original GHCID string (frozen, never changes) - basis for all identifier generation",
    )
    ghcid_history: Optional[List[GHCIDHistoryEntry]] = Field(
        None,
        description="History of GHCID changes (name/location changes)",
    )

    # Complex objects
    contact_info: Optional[ContactInfo] = Field(None, description="Contact information")
    locations: List[Location] = Field(
        default_factory=list, description="Physical locations"
    )
    identifiers: List[Identifier] = Field(
        default_factory=list, description="External identifiers"
    )

    # Universal business and administrative identifiers
    chamber_of_commerce_number: Optional[str] = Field(
        None,
        description="Business registration number (KvK, CNPJ, EIN, etc.)",
    )
    municipality_identifier: Optional[str] = Field(
        None,
        description="Local government identifier for municipality",
    )

    collections: List[Collection] = Field(
        default_factory=list, description="Collections held"
    )
    digital_platforms: List[DigitalPlatform] = Field(
        default_factory=list,
        description="Digital platforms used",
    )
    metadata_standards: List[MetadataStandard] = Field(
        default_factory=list,
        description="Metadata standards used",
    )
    partnerships: List[Partnership] = Field(
        default_factory=list,
        description="Partnerships and memberships",
    )
    change_history: List[ChangeEvent] = Field(
        default_factory=list,
        description="Chronological list of significant organizational change events",
    )

    # Provenance (required)
    provenance: Provenance = Field(
        ..., description="Data provenance and quality metadata"
    )

    class Config:
        # Populate enum-typed fields with the enum's value, not the member.
        use_enum_values = True
        arbitrary_types_allowed = True  # Allow GHCIDHistoryEntry dataclass
|
|
|
|
|
|
class DutchHeritageCustodian(HeritageCustodian):
    """
    A heritage custodian organization based in the Netherlands.

    NOTE: This class is deprecated and will be removed in a future version.
    Most Dutch-specific fields have been migrated to universal patterns:
    - kvk_number → chamber_of_commerce_number (in base class)
    - gemeente_code → municipality_identifier (in base class)
    - provincie → locations[0].region (in base class)
    - Network memberships → partnerships[] with Partnership objects (in base class)

    The remaining Dutch-specific fields are kept temporarily for backwards compatibility.
    """

    # Dutch-specific fields (deprecated - will be removed)
    kvk_number: Optional[str] = Field(
        None,
        description="DEPRECATED: Use chamber_of_commerce_number instead. Dutch Chamber of Commerce number (8 digits)",
    )
    gemeente_code: Optional[str] = Field(
        None,
        description="DEPRECATED: Use municipality_identifier instead. Dutch municipality code (CBS gemeentecode)",
    )

    @validator("kvk_number")
    def validate_kvk_number(cls, v: Optional[str]) -> Optional[str]:
        """
        Require a KvK number to be exactly eight ASCII digits.

        Falsy input (``None`` or an empty string) is returned unchanged.

        Raises:
            ValueError: if a non-empty value is not exactly eight characters
                drawn from ``0``-``9``.
        """
        if not v:
            return v
        # str.isdigit() also accepts non-ASCII digit characters (superscripts,
        # Arabic-Indic numerals, ...), so test against the ASCII digits only.
        if len(v) != 8 or not all(ch in "0123456789" for ch in v):
            raise ValueError("KvK number must be exactly 8 digits")
        return v
|