331 lines
12 KiB
Markdown
331 lines
12 KiB
Markdown
# Canonical Custodian YAML Schema v1.0.0
|
|
#
|
|
# This document defines the standardized format for all heritage custodian files
|
|
# in data/custodian/. ALL files should conform to this structure.
|
|
#
|
|
# Created: 2025-12-09
|
|
# Author: GLAM Data Engineering
|
|
|
|
---
|
|
# =============================================================================
|
|
# CANONICAL SCHEMA DEFINITION
|
|
# =============================================================================
|
|
#
|
|
# The canonical format organizes data into these sections (in order):
|
|
#
|
|
# 1. IDENTITY (who is this institution?)
|
|
# - custodian_name: Consensus name with provenance
|
|
# - identifiers: All identifiers (GHCID, ISIL, Wikidata, etc.)
|
|
# - institution_type: Single canonical type (GALLERY, LIBRARY, etc.)
|
|
#
|
|
# 2. LOCATION (where is this institution?)
|
|
# - location: Single canonical location object with coordinates
|
|
# - service_area: Geographic service area (optional)
|
|
#
|
|
# 3. TEMPORAL (when did events occur?)
|
|
# - timespan: Begin/end dates of operation
|
|
# - conflict_status: Destruction/damage info (if applicable)
|
|
#
|
|
# 4. ENRICHMENTS (data from external sources)
|
|
# - wikidata_enrichment: From Wikidata API
|
|
# - google_maps_enrichment: From Google Maps API
|
|
# - web_enrichment: From website scraping
|
|
# - youtube_enrichment: From YouTube API
|
|
# - (other source-specific enrichments)
|
|
#
|
|
# 5. PROVENANCE (how was this data created?)
|
|
# - provenance: Data source, tier, extraction info
|
|
# - ghcid: GHCID generation details and history
|
|
#
|
|
# 6. ARCHIVE (preserve original data)
|
|
# - original_entry: Raw data from source (immutable)
|
|
# - ch_annotator: CH-Annotator metadata (if applicable)
|
|
#
|
|
# =============================================================================
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# 1. IDENTITY
|
|
# -----------------------------------------------------------------------------
|
|
|
|
# Consensus name - the authoritative institution name.
# The mapping holds the chosen name (claim_value) together with the fields
# that record where it came from: source, confidence and source counts.
custodian_name:
  claim_type: custodian_name
  claim_value: string                  # Required: Canonical name
  emic_name: string | null             # Native language name (if different)
  name_language: string | null         # ISO 639-1 language code
  alternative_names: list[string]      # Other known names
  source: string                       # Which source provided this name
  confidence: float                    # 0.0-1.0 confidence score
  consensus_method: bool               # True if multiple sources agree
  sources_checked: int                 # Number of sources consulted
  sources_matched: int                 # Number of sources that agree
  extraction_timestamp: datetime       # When this was extracted
  matching_sources: list               # Details of each matching source

# All identifiers in normalized format.
# A list of entries; every entry has identifier_scheme and identifier_value.
# Some schemes additionally carry identifier_url, and the ISIL entry also
# carries assigned_date and source.
identifiers:
  - identifier_scheme: GHCID
    identifier_value: string           # e.g., "NL-ZH-SCH-A-GAS"
  - identifier_scheme: GHCID_UUID
    identifier_value: string           # UUID v5
    identifier_url: string | null      # urn:uuid:...
  - identifier_scheme: GHCID_UUID_SHA256
    identifier_value: string           # UUID v8
    identifier_url: string | null
  - identifier_scheme: GHCID_NUMERIC
    identifier_value: string           # 64-bit integer as string
  - identifier_scheme: RECORD_ID
    identifier_value: string           # UUID v7 for database
    identifier_url: string | null
  - identifier_scheme: ISIL
    identifier_value: string           # e.g., "NL-SdmGA"
    identifier_url: string | null      # https://isil.org/...
    assigned_date: date | null
    source: string | null
  - identifier_scheme: Wikidata
    identifier_value: string           # e.g., "Q62069649"
    identifier_url: string             # https://www.wikidata.org/wiki/...
  # ... additional identifiers as needed

# Institution type - single canonical value.
# Written as a folded block scalar (>-): the lines below are joined with single
# spaces into one "enum[...]" string, the same value the multi-line plain
# scalar produced.
institution_type: >-
  enum[GALLERY, LIBRARY, ARCHIVE, MUSEUM, OFFICIAL_INSTITUTION,
  RESEARCH_CENTER, CORPORATION, UNKNOWN, BOTANICAL_ZOO,
  EDUCATION_PROVIDER, COLLECTING_SOCIETY, FEATURES,
  INTANGIBLE_HERITAGE_GROUP, MIXED, PERSONAL_COLLECTION,
  HOLY_SITES, DIGITAL_PLATFORM, NGO, TASTE_SMELL]

# Optional description (null when no description is available)
description: string | null

# -----------------------------------------------------------------------------
|
|
# 2. LOCATION
|
|
# -----------------------------------------------------------------------------
|
|
|
|
# Single canonical location - the AUTHORITATIVE coordinates and address.
# This is the ONLY place coordinates should be stored for the institution.
location:
  # Geographic coordinates (REQUIRED for map display; nullable in the schema,
  # so a record without coordinates stores null here)
  latitude: float | null               # WGS84 latitude
  longitude: float | null              # WGS84 longitude
  precision: float | null              # Coordinate precision in degrees

  # Address components
  city: string | null                  # City/locality name
  region: string | null                # State/province/region name
  region_code: string | null           # ISO 3166-2 subdivision code
  country: string                      # ISO 3166-1 alpha-2 code (REQUIRED)
  postal_code: string | null
  street_address: string | null
  formatted_address: string | null     # Full formatted address

  # GeoNames reference
  geonames_id: int | null
  geonames_name: string | null
  feature_code: string | null          # PPL, PPLA, etc.

  # Provenance for location data
  coordinate_source: string            # Which enrichment provided coordinates
  address_source: string | null        # Which enrichment provided address
  resolution_timestamp: datetime | null

# Service area (for institutions serving a geographic region).
# `type` selects the granularity; the code/name fields below are all nullable.
service_area:
  type: enum[country, admin1, admin2, city, custom] | null
  country_code: string | null
  admin1_code: string | null
  admin1_name: string | null
  admin2_name: string | null
  source: string | null
  notes: string | null

# -----------------------------------------------------------------------------
|
|
# 3. TEMPORAL
|
|
# -----------------------------------------------------------------------------
|
|
|
|
# Institution lifespan (CIDOC-CRM style).
# Founding and closure are each expressed as an earliest/latest pair of bounds.
timespan:
  begin_of_the_begin: datetime | null  # Earliest possible founding
  end_of_the_begin: datetime | null    # Latest possible founding
  begin_of_the_end: datetime | null    # Earliest possible closure
  end_of_the_end: datetime | null      # Latest possible closure
  notes: string | null
  sources: list[string] | null

# Conflict/destruction status (for damaged/destroyed institutions)
conflict_status:
  status: enum[operational, damaged, destroyed, unknown] | null
  date: date | null
  reported_date: date | null
  description: string | null
  sources: list[string] | null

# -----------------------------------------------------------------------------
|
|
# 4. ENRICHMENTS (source-specific data preserved as-is)
|
|
# -----------------------------------------------------------------------------
|
|
|
|
# Wikidata enrichment (from Wikidata REST API).
# Two nested mappings: wikidata_coordinates (latitude/longitude/precision)
# and verification (method/confidence/subtype/reasoning).
wikidata_enrichment:
  wikidata_id: string                  # Q-number
  wikidata_url: string
  wikidata_label: string | null
  wikidata_description: string | null
  labels: dict[str, str]               # Language-tagged labels
  descriptions: dict[str, str]         # Language-tagged descriptions
  wikidata_coordinates:                # Coordinates from Wikidata
    latitude: float | null
    longitude: float | null
    precision: float | null
  wikidata_inception: date | null
  wikidata_dissolved: date | null
  instance_of: list[string]            # Q-numbers
  located_in: list[string]             # Q-numbers
  country: string | null               # Q-number
  enrichment_timestamp: datetime
  verification:                        # LLM verification
    method: string | null
    confidence: float | null
    subtype: string | null
    reasoning: string | null

# Google Maps enrichment (from Places API).
# `coordinates` is a nested latitude/longitude mapping; all other keys sit
# directly under google_maps_enrichment.
google_maps_enrichment:
  place_id: string
  name: string
  fetch_timestamp: datetime
  api_status: string
  coordinates:
    latitude: float
    longitude: float
  formatted_address: string
  short_address: string | null
  address_components: list[dict]
  phone_local: string | null
  phone_international: string | null
  website: string | null
  google_place_types: list[string]
  business_status: string | null
  opening_hours: dict | null
  rating: float | null
  total_ratings: int | null
  reviews: list[dict] | null
  google_maps_url: string | null
  street_view_url: string | null

# Google Maps status (quick check).
# Top-level keys, kept outside google_maps_enrichment.
google_maps_status: enum[SUCCESS, NO_MATCH, REJECTED, NOT_SEARCHED] | null
google_maps_rejected: dict | null      # Rejection details if rejected by LLM

# Web enrichment (from website scraping)
web_enrichment:
  web_archives: list
  full_site_archive_timestamp: datetime | null

# Web claims (extracted from website)
web_claims:
  extraction_timestamp: datetime
  source_archive: string
  claims_count: int
  claims: list[dict]                   # Each claim has XPath provenance

# YouTube enrichment (from YouTube API).
# The enrichment payload is an untyped dict; youtube_status is a separate
# top-level key recording the lookup outcome.
youtube_enrichment: dict | null
youtube_status: enum[FOUND, NOT_FOUND, NOT_SEARCHED] | null

# ISIL registry enrichment (from national ISIL registries)
nan_isil_enrichment: dict | null       # Dutch ISIL registry
# (other national ISIL enrichments as needed)

# Genealogiewerkbalk enrichment (Dutch municipal archives)
genealogiewerkbalk_enrichment: dict | null

# Digital platforms (derived from enrichments)
digital_platforms: list[dict] | null

# UNESCO enrichments
unesco_mow_enrichment: dict | null
unesco_status: dict | null

# -----------------------------------------------------------------------------
|
|
# 5. PROVENANCE
|
|
# -----------------------------------------------------------------------------
|
|
|
|
# Overall provenance (how this record was created)
provenance:
  schema_version: string               # e.g., "1.0.0"
  data_source: enum[CSV_REGISTRY, CONVERSATION_NLP, WIKIDATA, WEB_SCRAPE]
  # Folded block scalar (>-): both lines join into one "enum[...]" string.
  data_tier: >-
    enum[TIER_1_AUTHORITATIVE, TIER_2_VERIFIED,
    TIER_3_CROWD_SOURCED, TIER_4_INFERRED]
  extraction_date: datetime
  extraction_method: string
  confidence_score: float | null
  generated_at: datetime | null
  sources: dict | null                 # Detailed source provenance
  data_tier_summary: dict | null       # Summary by tier
  notes: list[string] | null

# GHCID generation and history
ghcid:
  ghcid_current: string                # Current GHCID
  ghcid_original: string               # Original GHCID (may differ)
  ghcid_uuid: string                   # UUID v5
  ghcid_uuid_sha256: string            # UUID v8
  ghcid_numeric: int                   # 64-bit integer
  record_id: string                    # UUID v7
  generation_timestamp: datetime
  # NOTE(review): the extent of location_resolution is inferred from the field
  # names (everything from `method` through `resolution_date`); confirm
  # against the upstream file's indentation.
  location_resolution:                 # How location was resolved for GHCID
    method: string
    country_code: string
    region_code: string | null
    city_code: string | null
    city_name: string | null
    geonames_id: int | null
    geonames_name: string | null
    feature_code: string | null
    population: int | null
    admin1_code: string | null
    latitude: float | null             # Coordinates used for GHCID resolution
    longitude: float | null
    source_coordinates: dict | null
    resolution_date: datetime | null
  ghcid_history: list                  # GHCID changes over time

# -----------------------------------------------------------------------------
|
|
# 6. ARCHIVE (original/source data preserved immutably)
|
|
# -----------------------------------------------------------------------------
|
|
|
|
# Original entry - preserve raw source data exactly as received.
# Which keys are present depends on the source the record came from.
original_entry:
  # NDE CSV fields (if from Dutch NDE registry)
  plaatsnaam_bezoekadres: string | null
  organisatie: string | null
  isil-code_na: string | null
  wikidata_id: string | null
  type: list[string] | null
  # ... other source-specific fields

  # CH-Annotator fields (if from CH-Annotator)
  # NOTE(review): nested under original_entry because `institution_type` and
  # `identifiers` already exist as top-level keys and would otherwise be
  # duplicated; confirm against the upstream file's indentation.
  name: string | null
  institution_type: string | null
  source: string | null
  identifiers: list[dict] | null
  locations: list[dict] | null         # Original location data (may have coordinates)

# Entry index (for batch imports); may be an int or a string
entry_index: int | string | null

# Processing timestamp
processing_timestamp: datetime

# Enrichment status
enrichment_status: enum[pending, enriched, failed, partial] | null

# CH-Annotator metadata (if applicable).
# Apart from the two convention strings and the entity_claims list, the
# members are untyped dicts whose inner structure is not defined here.
ch_annotator:
  convention_id: string
  convention_version: string
  entity_classification: dict
  extraction_provenance: dict
  annotation_provenance: dict
  annotation_metadata: dict
  entity_claims: list[dict]
  integration_note: dict | null