# glam/docs/CUSTODIAN_CANONICAL_SCHEMA.md

# Canonical Custodian YAML Schema v1.0.0
#
# This document defines the standardized format for all heritage custodian files
# in data/custodian/. ALL files should conform to this structure.
#
# Created: 2025-12-09
# Author: GLAM Data Engineering

---
# =============================================================================
# CANONICAL SCHEMA DEFINITION
# =============================================================================
#
# The canonical format organizes data into these sections (in order):
#
# 1. IDENTITY (who is this institution?)
#    - custodian_name: Consensus name with provenance
#    - identifiers: All identifiers (GHCID, ISIL, Wikidata, etc.)
#    - institution_type: Single canonical type (GALLERY, LIBRARY, etc.)
#    - description: Optional description
#
# 2. LOCATION (where is this institution?)
#    - location: Single canonical location object with coordinates
#    - service_area: Geographic service area (optional)
#
# 3. TEMPORAL (when did events occur?)
#    - timespan: Begin/end dates of operation
#    - conflict_status: Destruction/damage info (if applicable)
#
# 4. ENRICHMENTS (data from external sources)
#    - wikidata_enrichment: From Wikidata API
#    - google_maps_enrichment: From Google Maps API
#    - web_enrichment: From website scraping
#    - youtube_enrichment: From YouTube API
#    - (other source-specific enrichments)
#
# 5. PROVENANCE (how was this data created?)
#    - provenance: Data source, tier, extraction info
#    - ghcid: GHCID generation details and history
#
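# 6. ARCHIVE (preserve original data)
#    - original_entry: Raw data from source (immutable)
#    - entry_index: Entry index for batch imports (optional)
#    - processing_timestamp: Processing timestamp
#    - enrichment_status: Enrichment status (optional)
#    - ch_annotator: CH-Annotator metadata (if applicable)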
#
# =============================================================================

# -----------------------------------------------------------------------------
# 1. IDENTITY
# -----------------------------------------------------------------------------

# Consensus name - the authoritative institution name, stored as a claim
# together with the evidence behind it (source, confidence, agreement counts)
custodian_name:
  # Literal value rather than a type placeholder (every other value in this
  # mapping is a type descriptor)
  claim_type: custodian_name
  claim_value: string              # Required: Canonical name
  emic_name: string | null         # Native language name (if different)
  name_language: string | null     # ISO 639-1 language code
  alternative_names: list[string]  # Other known names
  source: string                   # Which source provided this name
  confidence: float                # 0.0-1.0 confidence score
  # NOTE(review): named like a method but typed bool - confirm the key name
  consensus_method: bool           # True if multiple sources agree
  sources_checked: int             # Number of sources consulted
  sources_matched: int             # Number of sources that agree
  extraction_timestamp: datetime   # When this was extracted
  # Element type is not specified here (plain `list`)
  matching_sources: list           # Details of each matching source

# All identifiers in normalized format.
# Each list entry pairs an identifier_scheme with its identifier_value. The
# extra keys (identifier_url, assigned_date, source) are shown only on the
# schemes that list them below.
# identifier_value is always a string: quote digit-only values in data files
# so a YAML parser does not re-type them as integers.
identifiers:
  - identifier_scheme: GHCID
    identifier_value: string       # e.g., "NL-ZH-SCH-A-GAS"
  - identifier_scheme: GHCID_UUID
    identifier_value: string       # UUID v5
    identifier_url: string | null  # urn:uuid:...
  - identifier_scheme: GHCID_UUID_SHA256
    identifier_value: string       # UUID v8
    identifier_url: string | null
  - identifier_scheme: GHCID_NUMERIC
    identifier_value: string       # 64-bit integer as string
  - identifier_scheme: RECORD_ID
    identifier_value: string       # UUID v7 for database
    identifier_url: string | null
  - identifier_scheme: ISIL
    identifier_value: string       # e.g., "NL-SdmGA"
    identifier_url: string | null  # https://isil.org/...
    assigned_date: date | null
    source: string | null
  - identifier_scheme: Wikidata
    identifier_value: string       # e.g., "Q62069649"
    identifier_url: string         # https://www.wikidata.org/wiki/...
  # ... additional identifiers as needed

# Institution type - exactly one value from the enumeration below.
# Written as a folded scalar (>-): the lines are joined with single spaces
# into one type-descriptor string.
institution_type: >-
  enum[GALLERY, LIBRARY, ARCHIVE, MUSEUM, OFFICIAL_INSTITUTION,
  RESEARCH_CENTER, CORPORATION, UNKNOWN, BOTANICAL_ZOO,
  EDUCATION_PROVIDER, COLLECTING_SOCIETY, FEATURES,
  INTANGIBLE_HERITAGE_GROUP, MIXED, PERSONAL_COLLECTION,
  HOLY_SITES, DIGITAL_PLATFORM, NGO, TASTE_SMELL]

# Optional description; null when none is available
description: string | null

# -----------------------------------------------------------------------------
# 2. LOCATION
# -----------------------------------------------------------------------------

# Single canonical location - the AUTHORITATIVE coordinates and address.
# This is the ONLY place the institution's canonical coordinates are stored.
# Coordinates that also appear in wikidata_enrichment, google_maps_enrichment,
# ghcid.location_resolution or original_entry.locations are source-specific
# copies or resolution inputs, not alternatives to this block.
location:
  # Geographic coordinates (needed for map display; null when not available)
  latitude: float | null           # WGS84 latitude
  longitude: float | null          # WGS84 longitude
  precision: float | null          # Coordinate precision in degrees

  # Address components
  city: string | null              # City/locality name
  region: string | null            # State/province/region name
  region_code: string | null       # ISO 3166-2 subdivision code
  # Quote the country code in data files: an unquoted NO (Norway) is read as
  # boolean false by YAML 1.1 parsers.
  country: string                  # ISO 3166-1 alpha-2 code (REQUIRED)
  # Quote postal codes in data files so digit-only values stay strings.
  postal_code: string | null
  street_address: string | null
  formatted_address: string | null # Full formatted address

  # GeoNames reference
  geonames_id: int | null
  geonames_name: string | null
  feature_code: string | null      # PPL, PPLA, etc.

  # Provenance for location data
  # Nullable like address_source: there is no source to name when
  # latitude/longitude are null.
  coordinate_source: string | null # Which enrichment provided coordinates
  address_source: string | null    # Which enrichment provided address
  resolution_timestamp: datetime | null

# Service area (for institutions serving a geographic region).
# Every field is nullable.
service_area:
  # Presumably the level at which the area is defined - TODO confirm
  type: enum[country, admin1, admin2, city, custom] | null
  # Quote in data files: an unquoted NO is read as boolean false by
  # YAML 1.1 parsers.
  country_code: string | null
  admin1_code: string | null
  admin1_name: string | null
  admin2_name: string | null
  source: string | null
  notes: string | null

# -----------------------------------------------------------------------------
# 3. TEMPORAL
# -----------------------------------------------------------------------------

# Institution lifespan (CIDOC-CRM style).
# Founding and closure are each given as an interval: the *_of_the_begin pair
# bounds the founding date, the *_of_the_end pair bounds the closure date.
timespan:
  begin_of_the_begin: datetime | null  # Earliest possible founding
  end_of_the_begin: datetime | null    # Latest possible founding
  begin_of_the_end: datetime | null    # Earliest possible closure
  end_of_the_end: datetime | null      # Latest possible closure
  notes: string | null
  sources: list[string] | null

# Conflict/destruction status (for damaged/destroyed institutions).
# Every field is nullable.
conflict_status:
  status: enum[operational, damaged, destroyed, unknown] | null
  # Presumably when the damage/destruction occurred - TODO confirm
  date: date | null
  # Presumably when it was reported - TODO confirm
  reported_date: date | null
  description: string | null
  sources: list[string] | null

# -----------------------------------------------------------------------------
# 4. ENRICHMENTS (source-specific data preserved as-is)
# -----------------------------------------------------------------------------

# Wikidata enrichment (from Wikidata REST API).
# Source-specific data; the coordinates here are Wikidata's copy and do not
# replace location.latitude/longitude.
wikidata_enrichment:
  wikidata_id: string              # Q-number
  wikidata_url: string
  wikidata_label: string | null
  wikidata_description: string | null
  labels: dict[str, str]           # Language-tagged labels
  descriptions: dict[str, str]     # Language-tagged descriptions
  wikidata_coordinates:            # Coordinates from Wikidata
    latitude: float | null
    longitude: float | null
    precision: float | null
  # Presumably the entity's inception / dissolution dates - TODO confirm
  wikidata_inception: date | null
  wikidata_dissolved: date | null
  instance_of: list[string]        # Q-numbers
  located_in: list[string]         # Q-numbers
  country: string | null           # Q-number
  enrichment_timestamp: datetime
  verification:                    # LLM verification
    method: string | null
    confidence: float | null
    subtype: string | null
    reasoning: string | null

# Google Maps enrichment (from Places API).
# Source-specific data; the coordinates here are Google's copy and do not
# replace location.latitude/longitude.
google_maps_enrichment:
  place_id: string
  name: string
  fetch_timestamp: datetime
  api_status: string
  coordinates:                     # Non-nullable, unlike location.*
    latitude: float
    longitude: float
  formatted_address: string
  short_address: string | null
  address_components: list[dict]
  # Quote phone numbers in data files so they stay strings.
  phone_local: string | null
  phone_international: string | null
  website: string | null
  google_place_types: list[string]
  business_status: string | null
  opening_hours: dict | null
  rating: float | null
  total_ratings: int | null
  reviews: list[dict] | null
  google_maps_url: string | null
  street_view_url: string | null

# Google Maps status (quick check): top-level keys, siblings of
# google_maps_enrichment rather than nested inside it
google_maps_status: enum[SUCCESS, NO_MATCH, REJECTED, NOT_SEARCHED] | null
google_maps_rejected: dict | null  # Rejection details if rejected by LLM

# Web enrichment (from website scraping)
web_enrichment:
  # Element type is not specified here (plain `list`)
  web_archives: list
  full_site_archive_timestamp: datetime | null

# Web claims (extracted from website)
web_claims:
  extraction_timestamp: datetime
  source_archive: string
  # Presumably the length of `claims` - TODO confirm
  claims_count: int
  claims: list[dict]               # Each claim has XPath provenance

# YouTube enrichment (from YouTube API).
# The dict's inner structure is not specified in this schema.
youtube_enrichment: dict | null
youtube_status: enum[FOUND, NOT_FOUND, NOT_SEARCHED] | null

# ISIL registry enrichment (from national ISIL registries).
# The inner structure of the dict-typed enrichments below is not specified
# in this schema.
nan_isil_enrichment: dict | null   # Dutch ISIL registry
# (other national ISIL enrichments as needed)

# Genealogiewerkbalk enrichment (Dutch municipal archives)
genealogiewerkbalk_enrichment: dict | null

# Digital platforms (derived from enrichments)
digital_platforms: list[dict] | null

# UNESCO enrichments
unesco_mow_enrichment: dict | null
unesco_status: dict | null

# -----------------------------------------------------------------------------
# 5. PROVENANCE
# -----------------------------------------------------------------------------

# Record-level provenance: how this record was created
provenance:
  # Version of this schema, e.g. "1.0.0" (quote it so it stays a string)
  schema_version: string
  # Where the record originated
  data_source: enum[CSV_REGISTRY, CONVERSATION_NLP, WIKIDATA, WEB_SCRAPE]
  # Data tier; folded scalar, joined into one type-descriptor string
  data_tier: >-
    enum[TIER_1_AUTHORITATIVE, TIER_2_VERIFIED,
    TIER_3_CROWD_SOURCED, TIER_4_INFERRED]
  extraction_date: datetime
  extraction_method: string
  confidence_score: float | null
  generated_at: datetime | null
  # Detailed source provenance
  sources: dict | null
  # Summary by tier
  data_tier_summary: dict | null
  notes: list[string] | null

# GHCID generation and history.
# The same identifiers also appear as entries in the top-level `identifiers`
# list; note that ghcid_numeric is an int here, while the GHCID_NUMERIC
# identifier_value is a string.
ghcid:
  ghcid_current: string            # Current GHCID
  ghcid_original: string           # Original GHCID (may differ)
  ghcid_uuid: string               # UUID v5
  ghcid_uuid_sha256: string        # UUID v8
  ghcid_numeric: int               # 64-bit integer
  record_id: string                # UUID v7
  generation_timestamp: datetime
  location_resolution:             # How location was resolved for GHCID
    method: string
    # Quote in data files: an unquoted NO is read as boolean false by
    # YAML 1.1 parsers.
    country_code: string
    region_code: string | null
    city_code: string | null
    city_name: string | null
    geonames_id: int | null
    geonames_name: string | null
    feature_code: string | null
    population: int | null
    admin1_code: string | null
    latitude: float | null         # Coordinates used for GHCID resolution
    longitude: float | null
    source_coordinates: dict | null
    resolution_date: datetime | null
  # Element type is not specified here (plain `list`)
  ghcid_history: list              # GHCID changes over time

# -----------------------------------------------------------------------------
# 6. ARCHIVE (original/source data preserved immutably)
# -----------------------------------------------------------------------------

# Original entry - preserve raw source data exactly as received.
# Key names presumably follow the source's own field names, which is why they
# are not normalized to this schema's snake_case (e.g. isil-code_na).
original_entry:
  # NDE CSV fields (if from Dutch NDE registry)
  plaatsnaam_bezoekadres: string | null  # Dutch: place name of visiting address
  organisatie: string | null             # Dutch: organisation
  isil-code_na: string | null
  wikidata_id: string | null
  type: list[string] | null
  # ... other source-specific fields

  # CH-Annotator fields (if from CH-Annotator)
  name: string | null
  institution_type: string | null
  source: string | null
  identifiers: list[dict] | null
  locations: list[dict] | null     # Original location data (may have coordinates)

# Entry index (for batch imports); may be an int or a string
entry_index: int | string | null

# Processing timestamp (non-nullable)
processing_timestamp: datetime

# Enrichment status
enrichment_status: enum[pending, enriched, failed, partial] | null

# CH-Annotator metadata (if applicable).
# The inner structure of the dict-typed fields is not specified in this
# schema.
ch_annotator:
  convention_id: string
  convention_version: string
  entity_classification: dict
  extraction_provenance: dict
  annotation_provenance: dict
  annotation_metadata: dict
  entity_claims: list[dict]
  integration_note: dict | null