"""
Data models for GLAM heritage custodian entities.

These Pydantic models correspond to the LinkML schema in
schemas/heritage_custodian.yaml
"""

from datetime import date, datetime
from enum import Enum
from typing import Optional, List

from pydantic import BaseModel, Field, HttpUrl, AnyUrl, validator

# Import GHCID history model for provenance tracking
from glam_extractor.identifiers.ghcid import GHCIDHistoryEntry


# =============================================================================
# ENUMERATIONS
# =============================================================================


class InstitutionType(str, Enum):
    """
    Types of heritage institutions.

    Uses single-letter codes for GHCID generation:
    G(allery), L(ibrary), A(rchive), M(useum), O(fficial), R(esearch),
    C(orporation), U(ndefined), B(otanical/Zoo), E(ducation), P(ersonal),
    S(ociety).
    """

    GALLERY = "GALLERY"  # G
    LIBRARY = "LIBRARY"  # L
    ARCHIVE = "ARCHIVE"  # A
    MUSEUM = "MUSEUM"  # M
    OFFICIAL_INSTITUTION = "OFFICIAL_INSTITUTION"  # O
    RESEARCH_CENTER = "RESEARCH_CENTER"  # R
    CORPORATION = "CORPORATION"  # C
    UNDEFINED = "UNDEFINED"  # U
    BOTANICAL_ZOO = "BOTANICAL_ZOO"  # B
    EDUCATION_PROVIDER = "EDUCATION_PROVIDER"  # E
    PERSONAL_COLLECTION = "PERSONAL_COLLECTION"  # P
    COLLECTING_SOCIETY = "COLLECTING_SOCIETY"  # S
    MIXED = "MIXED"  # M (use primary type for GHCID)


class OrganizationStatus(str, Enum):
    """Operational status of the organization"""

    ACTIVE = "ACTIVE"
    INACTIVE = "INACTIVE"
    MERGED = "MERGED"
    SUSPENDED = "SUSPENDED"
    PLANNED = "PLANNED"
    UNKNOWN = "UNKNOWN"


class DataSource(str, Enum):
    """Source of the data (for provenance tracking)"""

    ISIL_REGISTRY = "ISIL_REGISTRY"
    DUTCH_ORG_CSV = "DUTCH_ORG_CSV"
    CONVERSATION_NLP = "CONVERSATION_NLP"
    WEB_CRAWL = "WEB_CRAWL"
    WIKIDATA = "WIKIDATA"
    MANUAL_ENTRY = "MANUAL_ENTRY"
    INFERRED = "INFERRED"
    CSV_REGISTRY = "CSV_REGISTRY"


class DataTier(str, Enum):
    """Data quality/authority tier"""

    TIER_1_AUTHORITATIVE = "TIER_1_AUTHORITATIVE"
    TIER_2_VERIFIED = "TIER_2_VERIFIED"
    TIER_3_CROWD_SOURCED = "TIER_3_CROWD_SOURCED"
    TIER_4_INFERRED = "TIER_4_INFERRED"


class MetadataStandard(str, Enum):
    """Metadata standards used by institutions"""

    DUBLIN_CORE = "DUBLIN_CORE"
    MARC21 = "MARC21"
    EAD = "EAD"
    BIBFRAME = "BIBFRAME"
    LIDO = "LIDO"
    CIDOC_CRM = "CIDOC_CRM"
    SCHEMA_ORG = "SCHEMA_ORG"
    RIC_O = "RIC_O"
    MODS = "MODS"
    PREMIS = "PREMIS"
    SPECTRUM = "SPECTRUM"
    DACS = "DACS"


class DigitalPlatformType(str, Enum):
    """Types of digital platforms"""

    COLLECTION_MANAGEMENT = "COLLECTION_MANAGEMENT"
    DIGITAL_REPOSITORY = "DIGITAL_REPOSITORY"
    DISCOVERY_PORTAL = "DISCOVERY_PORTAL"
    LINKED_DATA_ENDPOINT = "LINKED_DATA_ENDPOINT"
    API = "API"
    AGGREGATOR = "AGGREGATOR"
    WEBSITE = "WEBSITE"
    GENERIC = "GENERIC"  # General-purpose software (FileMaker, Access, etc.)


class ChangeType(str, Enum):
    """Types of organizational change events"""

    FOUNDING = "FOUNDING"
    CLOSURE = "CLOSURE"
    MERGER = "MERGER"
    SPLIT = "SPLIT"
    ACQUISITION = "ACQUISITION"
    RELOCATION = "RELOCATION"
    NAME_CHANGE = "NAME_CHANGE"
    TYPE_CHANGE = "TYPE_CHANGE"
    STATUS_CHANGE = "STATUS_CHANGE"
    RESTRUCTURING = "RESTRUCTURING"
    LEGAL_CHANGE = "LEGAL_CHANGE"
    OTHER = "OTHER"


# =============================================================================
# DATA MODELS
# =============================================================================


class Provenance(BaseModel):
    """Provenance information for data quality tracking"""

    data_source: DataSource = Field(..., description="Source of this data record")
    data_tier: DataTier = Field(..., description="Data quality tier")
    extraction_date: datetime = Field(
        ..., description="Date the data was extracted or created"
    )
    extraction_method: Optional[str] = Field(
        None, description="Method used to extract data"
    )
    confidence_score: Optional[float] = Field(
        None, ge=0.0, le=1.0, description="Confidence score (0.0-1.0)"
    )
    conversation_id: Optional[str] = Field(
        None, description="UUID of source conversation"
    )
    # Deliberately a plain string rather than HttpUrl: the description allows
    # any URI scheme, including file://.
    source_url: Optional[str] = Field(
        None,
        description=(
            "URI of the source (any URI scheme including "
            "file://, http://, https://)"
        ),
    )
    verified_date: Optional[datetime] = Field(
        None, description="Date the data was verified"
    )
    verified_by: Optional[str] = Field(
        None, description="Person/system that verified data"
    )

    class Config:
        use_enum_values = True


class Location(BaseModel):
    """Physical or virtual location"""

    location_type: Optional[str] = Field(None, description="Type of location")
    street_address: Optional[str] = Field(None, description="Street address")
    city: Optional[str] = Field(None, description="City or town")
    postal_code: Optional[str] = Field(None, description="Postal code")
    region: Optional[str] = Field(None, description="State, province, or region")
    country: Optional[str] = Field(None, description="Country (ISO 3166-1 alpha-2)")
    latitude: Optional[float] = Field(None, description="Latitude coordinate")
    longitude: Optional[float] = Field(None, description="Longitude coordinate")
    geonames_id: Optional[str] = Field(None, description="GeoNames identifier")
    is_primary: Optional[bool] = Field(
        False, description="Is this the primary location?"
    )

    @validator("country")
    def validate_country_code(cls, v: Optional[str]) -> Optional[str]:
        """Validate and upper-case an ISO 3166-1 alpha-2 country code.

        Falsy values (``None`` or an empty string) are returned unchanged.

        Raises:
            ValueError: If the value is not exactly two ASCII letters.
        """
        if not v:
            return v
        # Length alone is not enough: "12" is not an alpha-2 code, and some
        # non-ASCII characters grow when upper-cased (e.g. "ŉ" -> "ʼN"), which
        # would let a 3-character result slip through.
        if len(v) != 2 or not (v.isascii() and v.isalpha()):
            raise ValueError("Country code must be 2 characters (ISO 3166-1 alpha-2)")
        return v.upper()

    class Config:
        use_enum_values = True


class ContactInfo(BaseModel):
    """Contact information for the organization"""

    email: Optional[str] = Field(None, description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    fax: Optional[str] = Field(None, description="Fax number")
    contact_type: Optional[str] = Field(None, description="Type of contact")

    class Config:
        use_enum_values = True


class Identifier(BaseModel):
    """External identifier for the organization"""

    identifier_scheme: str = Field(..., description="Identifier scheme name")
    identifier_value: str = Field(..., description="The actual identifier value")
    identifier_url: Optional[HttpUrl] = Field(
        None, description="URL to the identifier registry"
    )
    assigned_date: Optional[date] = Field(
        None, description="Date identifier was assigned"
    )

    class Config:
        use_enum_values = True


class Collection(BaseModel):
    """A collection held by the heritage custodian"""

    collection_id: Optional[str] = Field(
        None, description="Unique identifier for collection"
    )
    collection_name: Optional[str] = Field(None, description="Name of the collection")
    collection_description: Optional[str] = Field(None, description="Description")
    collection_type: Optional[str] = Field(None, description="Type of collection")
    item_count: Optional[int] = Field(None, ge=0, description="Number of items")
    subjects: List[str] = Field(default_factory=list, description="Subject headings")
    time_period_start: Optional[date] = Field(
        None, description="Start of time period covered"
    )
    time_period_end: Optional[date] = Field(
        None, description="End of time period covered"
    )
    access_rights: Optional[str] = Field(
        None, description="Access rights and restrictions"
    )
    catalog_url: Optional[HttpUrl] = Field(
        None, description="URL to collection catalog"
    )

    class Config:
        use_enum_values = True


class DigitalPlatform(BaseModel):
    """Digital platform or system used by the institution"""

    platform_name: Optional[str] = Field(None, description="Name of the platform")
    platform_type: Optional[DigitalPlatformType] = Field(
        None, description="Type of platform"
    )
    platform_url: Optional[HttpUrl] = Field(None, description="URL to the platform")
    vendor: Optional[str] = Field(None, description="Software vendor or provider")
    implemented_standards: List[MetadataStandard] = Field(
        default_factory=list, description="Standards implemented"
    )
    integration_method: Optional[str] = Field(
        None,
        description=(
            "Technical method for data integration "
            "(API, OAI-PMH, SPARQL, CSV export, etc.)"
        ),
    )

    class Config:
        use_enum_values = True


class Partnership(BaseModel):
    """Partnership or network membership"""

    partner_name: Optional[str] = Field(
        None, description="Name of partner organization"
    )
    partner_id: Optional[str] = Field(None, description="Identifier for partner")
    partnership_type: Optional[str] = Field(None, description="Type of partnership")
    start_date: Optional[date] = Field(None, description="Start date")
    end_date: Optional[date] = Field(None, description="End date")
    description: Optional[str] = Field(None, description="Partnership description")

    class Config:
        use_enum_values = True


class ChangeEvent(BaseModel):
    """
    A significant organizational change event in an institution's lifecycle.

    Based on TOOI Wijzigingsgebeurtenis and W3C PROV-O Activity patterns.
    """

    event_id: str = Field(..., description="Unique identifier for this change event")
    change_type: ChangeType = Field(..., description="Type of organizational change")
    event_date: date = Field(..., description="Date when the change event occurred")
    event_description: Optional[str] = Field(
        None, description="Textual description of the change event"
    )
    affected_organization: Optional[str] = Field(
        None, description="Organization ID that was affected"
    )
    resulting_organization: Optional[str] = Field(
        None, description="Organization ID resulting from change"
    )
    related_organizations: List[str] = Field(
        default_factory=list, description="Other organization IDs involved"
    )
    source_documentation: Optional[HttpUrl] = Field(
        None, description="URL to documentation of this event"
    )

    class Config:
        use_enum_values = True


class HeritageCustodian(BaseModel):
    """A heritage custodian organization (GLAM institution)"""

    id: str = Field(..., description="Unique identifier for this record")
    name: str = Field(..., description="Official name of the organization")
    alternative_names: List[str] = Field(
        default_factory=list, description="Alternative names"
    )
    institution_type: InstitutionType = Field(
        ..., description="Primary type of institution"
    )
    # Pydantic does not run defaults through validation, so with
    # ``use_enum_values = True`` an enum-member default would stay an enum
    # member while explicitly supplied statuses are stored as plain strings.
    # Defaulting to the raw value keeps the field's runtime type uniform; it
    # still compares equal to ``OrganizationStatus.UNKNOWN`` (a ``str`` enum).
    organization_status: Optional[OrganizationStatus] = Field(
        OrganizationStatus.UNKNOWN.value, description="Operational status"
    )
    description: Optional[str] = Field(None, description="Textual description")

    # Organizational hierarchy
    parent_organization: Optional[str] = Field(
        None, description="Parent organization ID"
    )
    parent_organization_name: Optional[str] = Field(
        None, description="Name of parent organization (unresolved reference)"
    )
    sub_organizations: List[str] = Field(
        default_factory=list, description="Sub-organization IDs"
    )

    # Temporal
    founded_date: Optional[date] = Field(None, description="Date founded")
    closed_date: Optional[date] = Field(None, description="Date closed")

    # Web presence
    homepage: Optional[HttpUrl] = Field(None, description="Official website URL")

    # GHCID (Global Heritage Custodian Identifier)
    # Four identifier formats for maximum interoperability and performance:
    ghcid_uuid: Optional[str] = Field(
        None,
        description=(
            "UUID v5 persistent identifier (RFC 4122, SHA-1 based) - "
            "PRIMARY for interoperability with Europeana, DPLA, IIIF, Wikidata"
        ),
    )
    ghcid_uuid_sha256: Optional[str] = Field(
        None,
        description=(
            "UUID with SHA-256 (version 8, custom) - "
            "SOTA cryptographic strength for future-proofing and security compliance"
        ),
    )
    record_id: Optional[str] = Field(
        None,
        description=(
            "UUID v7 database primary key (RFC 9562, time-ordered, random) - "
            "for database performance optimization and natural chronological ordering"
        ),
    )
    ghcid_numeric: Optional[int] = Field(
        None,
        description=(
            "Persistent numeric GHCID (SHA256→64bit) - "
            "for database primary keys and CSV exports"
        ),
    )
    ghcid: Optional[str] = Field(
        None,
        description=(
            "Current human-readable GHCID (ISO-based format) - "
            "for citations and references"
        ),
    )
    ghcid_original: Optional[str] = Field(
        None,
        description=(
            "Original GHCID string (frozen, never changes) - "
            "basis for all identifier generation"
        ),
    )
    ghcid_history: Optional[List[GHCIDHistoryEntry]] = Field(
        None, description="History of GHCID changes (name/location changes)"
    )

    # Complex objects
    contact_info: Optional[ContactInfo] = Field(
        None, description="Contact information"
    )
    locations: List[Location] = Field(
        default_factory=list, description="Physical locations"
    )
    identifiers: List[Identifier] = Field(
        default_factory=list, description="External identifiers"
    )

    # Universal business and administrative identifiers
    chamber_of_commerce_number: Optional[str] = Field(
        None, description="Business registration number (KvK, CNPJ, EIN, etc.)"
    )
    municipality_identifier: Optional[str] = Field(
        None, description="Local government identifier for municipality"
    )

    collections: List[Collection] = Field(
        default_factory=list, description="Collections held"
    )
    digital_platforms: List[DigitalPlatform] = Field(
        default_factory=list, description="Digital platforms used"
    )
    metadata_standards: List[MetadataStandard] = Field(
        default_factory=list, description="Metadata standards used"
    )
    partnerships: List[Partnership] = Field(
        default_factory=list, description="Partnerships and memberships"
    )
    change_history: List[ChangeEvent] = Field(
        default_factory=list,
        description="Chronological list of significant organizational change events",
    )

    # Provenance (required)
    provenance: Provenance = Field(
        ..., description="Data provenance and quality metadata"
    )

    class Config:
        use_enum_values = True
        arbitrary_types_allowed = True  # Allow GHCIDHistoryEntry dataclass


class DutchHeritageCustodian(HeritageCustodian):
    """
    A heritage custodian organization based in the Netherlands.

    NOTE: This class is deprecated and will be removed in a future version.
    Most Dutch-specific fields have been migrated to universal patterns:
    - kvk_number → chamber_of_commerce_number (in base class)
    - gemeente_code → municipality_identifier (in base class)
    - provincie → locations[0].region (in base class)
    - Network memberships → partnerships[] with Partnership objects (in base class)

    The remaining Dutch-specific fields are kept temporarily for backwards
    compatibility.
    """

    # Dutch-specific fields (deprecated - will be removed)
    kvk_number: Optional[str] = Field(
        None,
        description=(
            "DEPRECATED: Use chamber_of_commerce_number instead. "
            "Dutch Chamber of Commerce number (8 digits)"
        ),
    )
    gemeente_code: Optional[str] = Field(
        None,
        description=(
            "DEPRECATED: Use municipality_identifier instead. "
            "Dutch municipality code (CBS gemeentecode)"
        ),
    )

    @validator("kvk_number")
    def validate_kvk_number(cls, v: Optional[str]) -> Optional[str]:
        """Check that a supplied KvK number is exactly eight ASCII digits.

        Falsy values (``None`` or an empty string) are returned unchanged.

        Raises:
            ValueError: If the value is not exactly 8 characters in ``0-9``.
        """
        # ``str.isdigit`` alone also accepts non-ASCII digits (superscripts,
        # Arabic-Indic digits, ...), so additionally require pure ASCII.
        if v and not (len(v) == 8 and v.isascii() and v.isdigit()):
            raise ValueError("KvK number must be exactly 8 digits")
        return v