glam/docs/CUSTODIAN_CANONICAL_SCHEMA.md
2025-12-09 07:56:35 +01:00

331 lines
12 KiB
Markdown

# Canonical Custodian YAML Schema v1.0.0
#
# This document defines the standardized format for all heritage custodian files
# in data/custodian/. ALL files should conform to this structure.
#
# Created: 2025-12-09
# Author: GLAM Data Engineering
---
# =============================================================================
# CANONICAL SCHEMA DEFINITION
# =============================================================================
#
# The canonical format organizes data into these sections (in order):
#
# 1. IDENTITY (who is this institution?)
#    - custodian_name: Consensus name with provenance
#    - identifiers: All identifiers (GHCID, ISIL, Wikidata, etc.)
#    - institution_type: Single canonical type (GALLERY, LIBRARY, etc.)
#
# 2. LOCATION (where is this institution?)
#    - location: Single canonical location object with coordinates
#    - service_area: Geographic service area (optional)
#
# 3. TEMPORAL (when did events occur?)
#    - timespan: Begin/end dates of operation
#    - conflict_status: Destruction/damage info (if applicable)
#
# 4. ENRICHMENTS (data from external sources)
#    - wikidata_enrichment: From Wikidata API
#    - google_maps_enrichment: From Google Maps API
#    - web_enrichment: From website scraping
#    - youtube_enrichment: From YouTube API
#    - (other source-specific enrichments)
#
# 5. PROVENANCE (how was this data created?)
#    - provenance: Data source, tier, extraction info
#    - ghcid: GHCID generation details and history
#
# 6. ARCHIVE (preserve original data)
#    - original_entry: Raw data from source (immutable)
#    - ch_annotator: CH-Annotator metadata (if applicable)
#
# =============================================================================
# -----------------------------------------------------------------------------
# 1. IDENTITY
# -----------------------------------------------------------------------------
# Consensus name - the authoritative institution name.
# One mapping; every field below is a child of `custodian_name`.
custodian_name:
  claim_type: custodian_name
  claim_value: string               # Required: Canonical name
  emic_name: string | null          # Native language name (if different)
  name_language: string | null      # ISO 639-1 language code
  alternative_names: list[string]   # Other known names
  source: string                    # Which source provided this name
  confidence: float                 # 0.0-1.0 confidence score
  consensus_method: bool            # True if multiple sources agree
  sources_checked: int              # Number of sources consulted
  sources_matched: int              # Number of sources that agree
  extraction_timestamp: datetime    # When this was extracted
  matching_sources: list            # Details of each matching source
# All identifiers in normalized format.
# A sequence of mappings: each entry has `identifier_scheme` and
# `identifier_value`, plus the optional fields shown for that scheme.
identifiers:
  - identifier_scheme: GHCID
    identifier_value: string        # e.g., "NL-ZH-SCH-A-GAS"
  - identifier_scheme: GHCID_UUID
    identifier_value: string        # UUID v5
    identifier_url: string | null   # urn:uuid:...
  - identifier_scheme: GHCID_UUID_SHA256
    identifier_value: string        # UUID v8
    identifier_url: string | null
  - identifier_scheme: GHCID_NUMERIC
    # Quote the value in data files so it stays a string, not a YAML int.
    identifier_value: string        # 64-bit integer as string
  - identifier_scheme: RECORD_ID
    identifier_value: string        # UUID v7 for database
    identifier_url: string | null
  - identifier_scheme: ISIL
    identifier_value: string        # e.g., "NL-SdmGA"
    identifier_url: string | null   # https://isil.org/...
    assigned_date: date | null
    source: string | null
  - identifier_scheme: Wikidata
    identifier_value: string        # e.g., "Q62069649"
    identifier_url: string          # https://www.wikidata.org/wiki/...
  # ... additional identifiers as needed
# Institution type - single canonical value.
# The enum is one plain scalar wrapped over several lines; continuation lines
# are indented deeper than the key so they stay part of the same value.
institution_type: enum[GALLERY, LIBRARY, ARCHIVE, MUSEUM, OFFICIAL_INSTITUTION,
                       RESEARCH_CENTER, CORPORATION, UNKNOWN, BOTANICAL_ZOO,
                       EDUCATION_PROVIDER, COLLECTING_SOCIETY, FEATURES,
                       INTANGIBLE_HERITAGE_GROUP, MIXED, PERSONAL_COLLECTION,
                       HOLY_SITES, DIGITAL_PLATFORM, NGO, TASTE_SMELL]
# Optional description (a string, or null when absent)
description: string | null
# -----------------------------------------------------------------------------
# 2. LOCATION
# -----------------------------------------------------------------------------
# Single canonical location - the AUTHORITATIVE coordinates and address.
# This is the ONLY place the institution's canonical coordinates should be
# stored. Coordinates inside the enrichment sections are source-specific data
# preserved as-is, and ghcid.location_resolution records the coordinates used
# for GHCID resolution.
location:
  # Geographic coordinates (REQUIRED for map display)
  latitude: float | null            # WGS84 latitude
  longitude: float | null           # WGS84 longitude
  precision: float | null           # Coordinate precision in degrees
  # Address components
  city: string | null               # City/locality name
  region: string | null             # State/province/region name
  region_code: string | null        # ISO 3166-2 subdivision code
  # In data files, quote the country code (e.g. "NO"): YAML 1.1 loaders read
  # a bare NO as boolean false.
  country: string                   # ISO 3166-1 alpha-2 code (REQUIRED)
  postal_code: string | null
  street_address: string | null
  formatted_address: string | null  # Full formatted address
  # GeoNames reference
  geonames_id: int | null
  geonames_name: string | null
  feature_code: string | null       # PPL, PPLA, etc.
  # Provenance for location data
  # NOTE(review): coordinate_source is non-nullable although latitude and
  # longitude may be null - confirm whether `string | null` is intended.
  coordinate_source: string         # Which enrichment provided coordinates
  address_source: string | null     # Which enrichment provided address
  resolution_timestamp: datetime | null
# Service area (for institutions serving a geographic region).
# One mapping; every field below is a child of `service_area`.
service_area:
  type: enum[country, admin1, admin2, city, custom] | null
  # In data files, quote country codes (e.g. "NO") so YAML 1.1 loaders do not
  # read them as booleans.
  country_code: string | null
  admin1_code: string | null
  admin1_name: string | null
  admin2_name: string | null
  source: string | null
  notes: string | null
# -----------------------------------------------------------------------------
# 3. TEMPORAL
# -----------------------------------------------------------------------------
# Institution lifespan (CIDOC-CRM style).
# Four boundary timestamps bracket the founding and closure intervals; any of
# them may be null.
timespan:
  begin_of_the_begin: datetime | null  # Earliest possible founding
  end_of_the_begin: datetime | null    # Latest possible founding
  begin_of_the_end: datetime | null    # Earliest possible closure
  end_of_the_end: datetime | null      # Latest possible closure
  notes: string | null
  sources: list[string] | null
# Conflict/destruction status (for damaged/destroyed institutions).
# One mapping; every field below is a child of `conflict_status`.
conflict_status:
  status: enum[operational, damaged, destroyed, unknown] | null
  date: date | null
  reported_date: date | null
  description: string | null
  sources: list[string] | null
# -----------------------------------------------------------------------------
# 4. ENRICHMENTS (source-specific data preserved as-is)
# -----------------------------------------------------------------------------
# Wikidata enrichment (from Wikidata REST API).
# `wikidata_coordinates` and `verification` are nested mappings; all other
# fields are direct children of `wikidata_enrichment`.
wikidata_enrichment:
  wikidata_id: string           # Q-number
  wikidata_url: string
  wikidata_label: string | null
  wikidata_description: string | null
  labels: dict[str, str]        # Language-tagged labels
  descriptions: dict[str, str]  # Language-tagged descriptions
  wikidata_coordinates:         # Coordinates from Wikidata
    latitude: float | null
    longitude: float | null
    precision: float | null
  wikidata_inception: date | null
  wikidata_dissolved: date | null
  instance_of: list[string]     # Q-numbers
  located_in: list[string]      # Q-numbers
  country: string | null        # Q-number
  enrichment_timestamp: datetime
  verification:                 # LLM verification
    method: string | null
    confidence: float | null
    subtype: string | null
    reasoning: string | null
# Google Maps enrichment (from Places API).
# `coordinates` is a nested mapping holding latitude/longitude; all other
# fields are direct children of `google_maps_enrichment`.
google_maps_enrichment:
  place_id: string
  name: string
  fetch_timestamp: datetime
  api_status: string
  coordinates:
    latitude: float
    longitude: float
  formatted_address: string
  short_address: string | null
  address_components: list[dict]
  phone_local: string | null
  phone_international: string | null
  website: string | null
  google_place_types: list[string]
  business_status: string | null
  opening_hours: dict | null
  rating: float | null
  total_ratings: int | null
  reviews: list[dict] | null
  google_maps_url: string | null
  street_view_url: string | null
# Google Maps status (quick check)
# Two separate keys: the status is one of the listed enum values or null, and
# the rejection details are a dict or null.
google_maps_status: enum[SUCCESS, NO_MATCH, REJECTED, NOT_SEARCHED] | null
google_maps_rejected: dict | null # Rejection details if rejected by LLM
# Web enrichment (from website scraping).
# One mapping; both fields below are children of `web_enrichment`.
web_enrichment:
  web_archives: list
  full_site_archive_timestamp: datetime | null
# Web claims (extracted from website).
# One mapping; every field below is a child of `web_claims`.
web_claims:
  extraction_timestamp: datetime
  source_archive: string
  claims_count: int
  claims: list[dict]  # Each claim has XPath provenance
# YouTube enrichment (from YouTube API)
# The enrichment payload is an untyped dict (or null); the status key is one
# of the listed enum values or null.
youtube_enrichment: dict | null
youtube_status: enum[FOUND, NOT_FOUND, NOT_SEARCHED] | null
# ISIL registry enrichment (from national ISIL registries)
nan_isil_enrichment: dict | null # Dutch ISIL registry
# (other national ISIL enrichments as needed)
# Genealogiewerkbalk enrichment (Dutch municipal archives)
genealogiewerkbalk_enrichment: dict | null
# Digital platforms (derived from enrichments)
digital_platforms: list[dict] | null
# UNESCO enrichments
# NOTE(review): "mow" presumably stands for Memory of the World - confirm.
unesco_mow_enrichment: dict | null
unesco_status: dict | null
# -----------------------------------------------------------------------------
# 5. PROVENANCE
# -----------------------------------------------------------------------------
# Overall provenance (how this record was created).
# One mapping; every field below is a child of `provenance`. The `data_tier`
# enum is a single plain scalar wrapped onto a second, deeper-indented line.
provenance:
  schema_version: string          # e.g., "1.0.0"
  data_source: enum[CSV_REGISTRY, CONVERSATION_NLP, WIKIDATA, WEB_SCRAPE]
  data_tier: enum[TIER_1_AUTHORITATIVE, TIER_2_VERIFIED,
                  TIER_3_CROWD_SOURCED, TIER_4_INFERRED]
  extraction_date: datetime
  extraction_method: string
  confidence_score: float | null
  generated_at: datetime | null
  sources: dict | null            # Detailed source provenance
  data_tier_summary: dict | null  # Summary by tier
  notes: list[string] | null
# GHCID generation and history.
# `location_resolution` is a nested mapping; `ghcid_history` and the
# identifier fields are direct children of `ghcid`.
ghcid:
  ghcid_current: string      # Current GHCID
  ghcid_original: string     # Original GHCID (may differ)
  ghcid_uuid: string         # UUID v5
  ghcid_uuid_sha256: string  # UUID v8
  ghcid_numeric: int         # 64-bit integer
  record_id: string          # UUID v7
  generation_timestamp: datetime
  location_resolution:       # How location was resolved for GHCID
    method: string
    # In data files, quote country codes (e.g. "NO") so YAML 1.1 loaders do
    # not read them as booleans.
    country_code: string
    region_code: string | null
    city_code: string | null
    city_name: string | null
    geonames_id: int | null
    geonames_name: string | null
    feature_code: string | null
    population: int | null
    admin1_code: string | null
    latitude: float | null   # Coordinates used for GHCID resolution
    longitude: float | null
    source_coordinates: dict | null
    resolution_date: datetime | null
  ghcid_history: list        # GHCID changes over time
# -----------------------------------------------------------------------------
# 6. ARCHIVE (original/source data preserved immutably)
# -----------------------------------------------------------------------------
# Original entry - preserve raw source data exactly as received.
# One mapping; which fields are present depends on the source the record came
# from (the two groups below).
original_entry:
  # NDE CSV fields (if from Dutch NDE registry)
  plaatsnaam_bezoekadres: string | null
  organisatie: string | null
  isil-code_na: string | null
  wikidata_id: string | null
  type: list[string] | null
  # ... other source-specific fields
  # CH-Annotator fields (if from CH-Annotator)
  name: string | null
  institution_type: string | null
  source: string | null
  identifiers: list[dict] | null
  locations: list[dict] | null  # Original location data (may have coordinates)
# Entry index (for batch imports)
# May be an int, a string, or null.
entry_index: int | string | null
# Processing timestamp
processing_timestamp: datetime
# Enrichment status
# One of the listed enum values, or null.
enrichment_status: enum[pending, enriched, failed, partial] | null
# CH-Annotator metadata (if applicable).
# One mapping; every field below is a child of `ch_annotator`.
ch_annotator:
  convention_id: string
  convention_version: string
  entity_classification: dict
  extraction_provenance: dict
  annotation_provenance: dict
  annotation_metadata: dict
  entity_claims: list[dict]
  integration_note: dict | null