# Canonical Custodian YAML Schema v1.0.0
#
# This document defines the standardized format for all heritage custodian files
# in data/custodian/. ALL files should conform to this structure.
#
# Notation used for values in this template:
#   string | null   -> the value may be absent/null
#   list[T]         -> sequence of T
#   dict[K, V]      -> mapping from K to V
#   enum[A, B, ...] -> exactly one of the listed literals
#
# Created: 2025-12-09
# Author: GLAM Data Engineering

---
# =============================================================================
# CANONICAL SCHEMA DEFINITION
# =============================================================================
#
# The canonical format organizes data into these sections (in order):
#
# 1. IDENTITY (who is this institution?)
#    - custodian_name: Consensus name with provenance
#    - identifiers: All identifiers (GHCID, ISIL, Wikidata, etc.)
#    - institution_type: Single canonical type (GALLERY, LIBRARY, etc.)
#    - description: Optional description
#
# 2. LOCATION (where is this institution?)
#    - location: Single canonical location object with coordinates
#    - service_area: Geographic service area (optional)
#
# 3. TEMPORAL (when did events occur?)
#    - timespan: Begin/end dates of operation
#    - conflict_status: Destruction/damage info (if applicable)
#
# 4. ENRICHMENTS (data from external sources)
#    - wikidata_enrichment: From Wikidata API
#    - google_maps_enrichment: From Google Maps API
#    - web_enrichment: From website scraping
#    - youtube_enrichment: From YouTube API
#    - (other source-specific enrichments)
#
# 5. PROVENANCE (how was this data created?)
#    - provenance: Data source, tier, extraction info
#    - ghcid: GHCID generation details and history
#
# 6. ARCHIVE (preserve original data)
#    - original_entry: Raw data from source (immutable)
#    - entry_index, processing_timestamp, enrichment_status: Import/processing info
#    - ch_annotator: CH-Annotator metadata (if applicable)
#
# =============================================================================

# -----------------------------------------------------------------------------
# 1. IDENTITY
# -----------------------------------------------------------------------------

# Consensus name - the authoritative institution name
custodian_name:
  claim_type: custodian_name
  claim_value: string                   # Required: Canonical name
  emic_name: string | null              # Native language name (if different)
  name_language: string | null          # ISO 639-1 language code
  alternative_names: list[string]       # Other known names
  source: string                        # Which source provided this name
  confidence: float                     # 0.0-1.0 confidence score
  consensus_method: bool                # True if multiple sources agree
  sources_checked: int                  # Number of sources consulted
  sources_matched: int                  # Number of sources that agree
  extraction_timestamp: datetime        # When this was extracted
  matching_sources: list                # Details of each matching source

# All identifiers in normalized format
identifiers:
  - identifier_scheme: GHCID
    identifier_value: string            # e.g., "NL-ZH-SCH-A-GAS"
  - identifier_scheme: GHCID_UUID
    identifier_value: string            # UUID v5
    identifier_url: string | null       # urn:uuid:...
  - identifier_scheme: GHCID_UUID_SHA256
    identifier_value: string            # UUID v8
    identifier_url: string | null
  - identifier_scheme: GHCID_NUMERIC
    identifier_value: string            # 64-bit integer as string
  - identifier_scheme: RECORD_ID
    identifier_value: string            # UUID v7 for database
    identifier_url: string | null
  - identifier_scheme: ISIL
    identifier_value: string            # e.g., "NL-SdmGA"
    identifier_url: string | null       # https://isil.org/...
    assigned_date: date | null
    source: string | null
  - identifier_scheme: Wikidata
    identifier_value: string            # e.g., "Q62069649"
    identifier_url: string              # https://www.wikidata.org/wiki/...
  # ... additional identifiers as needed

# Institution type - single canonical value
institution_type: enum[GALLERY, LIBRARY, ARCHIVE, MUSEUM, OFFICIAL_INSTITUTION,
  RESEARCH_CENTER, CORPORATION, UNKNOWN, BOTANICAL_ZOO, EDUCATION_PROVIDER,
  COLLECTING_SOCIETY, FEATURES, INTANGIBLE_HERITAGE_GROUP, MIXED,
  PERSONAL_COLLECTION, HOLY_SITES, DIGITAL_PLATFORM, NGO, TASTE_SMELL]

# Optional description
description: string | null

# -----------------------------------------------------------------------------
# 2. LOCATION
# -----------------------------------------------------------------------------

# Single canonical location - the AUTHORITATIVE coordinates and address
# This is the ONLY place coordinates should be stored for the institution
location:
  # Geographic coordinates (REQUIRED for map display)
  latitude: float | null                # WGS84 latitude
  longitude: float | null               # WGS84 longitude
  precision: float | null               # Coordinate precision in degrees

  # Address components
  city: string | null                   # City/locality name
  region: string | null                 # State/province/region name
  region_code: string | null            # ISO 3166-2 subdivision code
  country: string                       # ISO 3166-1 alpha-2 code (REQUIRED)
  postal_code: string | null
  street_address: string | null
  formatted_address: string | null      # Full formatted address

  # GeoNames reference
  geonames_id: int | null
  geonames_name: string | null
  feature_code: string | null           # PPL, PPLA, etc.

  # Provenance for location data
  coordinate_source: string             # Which enrichment provided coordinates
  address_source: string | null         # Which enrichment provided address
  resolution_timestamp: datetime | null

# Service area (for institutions serving a geographic region)
service_area:
  type: enum[country, admin1, admin2, city, custom] | null
  country_code: string | null
  admin1_code: string | null
  admin1_name: string | null
  admin2_name: string | null
  source: string | null
  notes: string | null

# -----------------------------------------------------------------------------
# 3. TEMPORAL
# -----------------------------------------------------------------------------

# Institution lifespan (CIDOC-CRM style)
timespan:
  begin_of_the_begin: datetime | null   # Earliest possible founding
  end_of_the_begin: datetime | null     # Latest possible founding
  begin_of_the_end: datetime | null     # Earliest possible closure
  end_of_the_end: datetime | null       # Latest possible closure
  notes: string | null
  sources: list[string] | null

# Conflict/destruction status (for damaged/destroyed institutions)
conflict_status:
  status: enum[operational, damaged, destroyed, unknown] | null
  date: date | null
  reported_date: date | null
  description: string | null
  sources: list[string] | null

# -----------------------------------------------------------------------------
# 4. ENRICHMENTS (source-specific data preserved as-is)
# -----------------------------------------------------------------------------

# Wikidata enrichment (from Wikidata REST API)
wikidata_enrichment:
  wikidata_id: string                   # Q-number
  wikidata_url: string
  wikidata_label: string | null
  wikidata_description: string | null
  labels: dict[str, str]                # Language-tagged labels
  descriptions: dict[str, str]          # Language-tagged descriptions
  wikidata_coordinates:                 # Coordinates from Wikidata
    latitude: float | null
    longitude: float | null
    precision: float | null
  wikidata_inception: date | null
  wikidata_dissolved: date | null
  instance_of: list[string]             # Q-numbers
  located_in: list[string]              # Q-numbers
  country: string | null                # Q-number
  enrichment_timestamp: datetime
  verification:                         # LLM verification
    method: string | null
    confidence: float | null
    subtype: string | null
    reasoning: string | null

# Google Maps enrichment (from Places API)
google_maps_enrichment:
  place_id: string
  name: string
  fetch_timestamp: datetime
  api_status: string
  coordinates:
    latitude: float
    longitude: float
  formatted_address: string
  short_address: string | null
  address_components: list[dict]
  phone_local: string | null
  phone_international: string | null
  website: string | null
  google_place_types: list[string]
  business_status: string | null
  opening_hours: dict | null
  rating: float | null
  total_ratings: int | null
  reviews: list[dict] | null
  google_maps_url: string | null
  street_view_url: string | null

# Google Maps status (quick check)
google_maps_status: enum[SUCCESS, NO_MATCH, REJECTED, NOT_SEARCHED] | null
google_maps_rejected: dict | null       # Rejection details if rejected by LLM

# Web enrichment (from website scraping)
web_enrichment:
  web_archives: list
  full_site_archive_timestamp: datetime | null

# Web claims (extracted from website)
# NOTE(review): reconstructed as a top-level key; confirm it is not meant to be
# nested under web_enrichment.
web_claims:
  extraction_timestamp: datetime
  source_archive: string
  claims_count: int
  claims: list[dict]                    # Each claim has XPath provenance

# YouTube enrichment (from YouTube API)
youtube_enrichment: dict | null
youtube_status: enum[FOUND, NOT_FOUND, NOT_SEARCHED] | null

# ISIL registry enrichment (from national ISIL registries)
nan_isil_enrichment: dict | null        # Dutch ISIL registry
# (other national ISIL enrichments as needed)

# Genealogiewerkbalk enrichment (Dutch municipal archives)
genealogiewerkbalk_enrichment: dict | null

# Digital platforms (derived from enrichments)
digital_platforms: list[dict] | null

# UNESCO enrichments
unesco_mow_enrichment: dict | null
unesco_status: dict | null

# -----------------------------------------------------------------------------
# 5. PROVENANCE
# -----------------------------------------------------------------------------

# Overall provenance (how this record was created)
provenance:
  schema_version: string                # e.g., "1.0.0"
  data_source: enum[CSV_REGISTRY, CONVERSATION_NLP, WIKIDATA, WEB_SCRAPE]
  data_tier: enum[TIER_1_AUTHORITATIVE, TIER_2_VERIFIED, TIER_3_CROWD_SOURCED,
    TIER_4_INFERRED]
  extraction_date: datetime
  extraction_method: string
  confidence_score: float | null
  generated_at: datetime | null
  sources: dict | null                  # Detailed source provenance
  data_tier_summary: dict | null        # Summary by tier
  notes: list[string] | null

# GHCID generation and history
ghcid:
  ghcid_current: string                 # Current GHCID
  ghcid_original: string                # Original GHCID (may differ)
  ghcid_uuid: string                    # UUID v5
  ghcid_uuid_sha256: string             # UUID v8
  ghcid_numeric: int                    # 64-bit integer
  record_id: string                     # UUID v7
  generation_timestamp: datetime
  location_resolution:                  # How location was resolved for GHCID
    method: string
    country_code: string
    region_code: string | null
    city_code: string | null
    city_name: string | null
    geonames_id: int | null
    geonames_name: string | null
    feature_code: string | null
    population: int | null
    admin1_code: string | null
    latitude: float | null              # Coordinates used for GHCID resolution
    longitude: float | null
    source_coordinates: dict | null
    resolution_date: datetime | null
  ghcid_history: list                   # GHCID changes over time

# -----------------------------------------------------------------------------
# 6. ARCHIVE (original/source data preserved immutably)
# -----------------------------------------------------------------------------

# Original entry - preserve raw source data exactly as received
original_entry:
  # NDE CSV fields (if from Dutch NDE registry)
  plaatsnaam_bezoekadres: string | null
  organisatie: string | null
  isil-code_na: string | null
  wikidata_id: string | null
  type: list[string] | null
  # ... other source-specific fields

  # CH-Annotator fields (if from CH-Annotator)
  name: string | null
  institution_type: string | null
  source: string | null
  identifiers: list[dict] | null
  locations: list[dict] | null          # Original location data (may have coordinates)

# Entry index (for batch imports)
entry_index: int | string | null

# Processing timestamp
processing_timestamp: datetime

# Enrichment status
enrichment_status: enum[pending, enriched, failed, partial] | null

# CH-Annotator metadata (if applicable)
ch_annotator:
  convention_id: string
  convention_version: string
  entity_classification: dict
  extraction_provenance: dict
  annotation_provenance: dict
  annotation_metadata: dict
  entity_claims: list[dict]
  # NOTE(review): nesting inferred from key order; confirm integration_note
  # belongs under ch_annotator rather than at the top level.
  integration_note: dict | null