# glam/data/nde/linkml/nde_enriched_source.yaml
# 2025-12-02 14:36:01 +01:00
#
# 1252 lines
# 31 KiB
# YAML

# Schema-level metadata: identity, CURIE prefixes, defaults and imports.
id: https://w3id.org/heritage/nde/enriched-source
name: nde-enriched-source
title: NDE Enriched Heritage Organizations Source Schema
# Folded scalar: the body must be indented deeper than its key to parse.
description: >-
  LinkML schema for enriched NDE heritage institution entries.
  This schema represents the structure of enriched YAML files in data/nde/enriched/entries/
  after Google Maps, Wikidata, website scraping, and GHCID enrichment.
version: 1.0.0
# Prefix map: each entry is a child of `prefixes`, not a top-level key.
prefixes:
  linkml: https://w3id.org/linkml/
  nde: https://w3id.org/heritage/nde/
  schema: http://schema.org/
  wdt: http://www.wikidata.org/prop/direct/
  wd: http://www.wikidata.org/entity/
default_prefix: nde
# Slots that omit `range` fall back to string.
default_range: string
imports:
  - linkml:types
# =============================================================================
# CLASSES
# =============================================================================
# Class definitions. Each class lists the globally defined slots it uses;
# slot ranges and cardinality live in the top-level `slots` section.
classes:
  # ---------------------------------------------------------------------------
  # ROOT CONTAINER
  # ---------------------------------------------------------------------------
  NDEEnrichedEntry:
    description: >-
      Root container for a single enriched NDE heritage institution entry.
      Contains all enrichment data from multiple sources.
    tree_root: true
    slots:
      - entry_index
      - processing_timestamp
      - enrichment_status
      - enrichment_timestamp
      - enrichment_source
      - original_entry
      - organization
      - location
      - contact
      - digital_presence
      - opening_hours
      - publications
      - collections
      - historical_documentation
      - activities
      - historical_sites
      - nde_metadata
      - google_maps_enrichment
      - wikidata_enrichment
      - nan_isil_enrichment
      - url_enrichment
      - web_enrichment
      - web_claims
      - custodian_name
      - identifiers
      - ghcid
      - provenance
  # ---------------------------------------------------------------------------
  # ORIGINAL ENTRY (from CSV)
  # ---------------------------------------------------------------------------
  OriginalEntry:
    description: Original entry data from NDE CSV source
    slots:
      - plaatsnaam_bezoekadres
      - straat_en_huisnummer_bezoekadres
      - organisatie
      - koepelorganisatie
      - webadres_organisatie
      - type_organisatie
      - systeem
      - isil_code_na
      - collectie_nederland
      - museum_register
      - rijkscollectie
      - wikidata_id
      - type
      - opmerkingen
  # ---------------------------------------------------------------------------
  # ORGANIZATION
  # ---------------------------------------------------------------------------
  Organization:
    description: Structured organization information derived from enrichment
    slots:
      - name
      - alternative_names
      - type
      - institution_type
      - description
      - legal_form
      - kvk_number
  # ---------------------------------------------------------------------------
  # LOCATION
  # ---------------------------------------------------------------------------
  Location:
    description: Physical location of the institution
    slots:
      - location_name
      - street_address
      - postal_code
      - city
      - municipality
      - sublocality
      - province
      - country
      - coordinates
  Coordinates:
    description: Geographic coordinates
    slots:
      - latitude
      - longitude
      - precision
      - globe
  # ---------------------------------------------------------------------------
  # CONTACT
  # ---------------------------------------------------------------------------
  Contact:
    description: Contact information for the institution
    slots:
      - address
      - email
      - phone
      - website
      - iban
  # ---------------------------------------------------------------------------
  # DIGITAL PRESENCE
  # ---------------------------------------------------------------------------
  DigitalPresence:
    description: Online presence information
    slots:
      - website
      - social_media
      - systems
  SocialMediaAccount:
    description: A social media account
    slots:
      - platform
      - name
      - url
      - username
  DigitalSystem:
    description: Digital system used by the institution
    slots:
      - name
      - type
      - url
  # ---------------------------------------------------------------------------
  # OPENING HOURS
  # ---------------------------------------------------------------------------
  OpeningHours:
    description: Opening hours information
    slots:
      - venue_hours
  VenueHours:
    description: Hours for a specific venue
    slots:
      - monday
      - tuesday
      - wednesday
      - thursday
      - friday
      - saturday
      - sunday
      - note
  # ---------------------------------------------------------------------------
  # PUBLICATIONS
  # ---------------------------------------------------------------------------
  Publication:
    description: Publication produced by the institution
    slots:
      - name
      - type
      - description
  # ---------------------------------------------------------------------------
  # COLLECTIONS
  # ---------------------------------------------------------------------------
  Collection:
    description: Collection held by the institution
    slots:
      - name
      - type
      - description
      - note
      - temporal_coverage
      - extent
  # ---------------------------------------------------------------------------
  # HISTORICAL DOCUMENTATION
  # ---------------------------------------------------------------------------
  HistoricalDocumentation:
    description: Historical documentation projects or resources
    slots:
      - name
      - description
  # ---------------------------------------------------------------------------
  # ACTIVITIES
  # ---------------------------------------------------------------------------
  Activity:
    description: Activity or service provided by the institution
    slots:
      - type
      - name
      - description
  # ---------------------------------------------------------------------------
  # GOOGLE MAPS ENRICHMENT
  # ---------------------------------------------------------------------------
  GoogleMapsEnrichment:
    description: Enrichment data from Google Maps Places API
    slots:
      - place_id
      - name
      - business_status
      - rating
      - total_ratings
      - photo_count
      - coordinates
  # ---------------------------------------------------------------------------
  # WIKIDATA ENRICHMENT
  # ---------------------------------------------------------------------------
  WikidataEnrichment:
    description: Enrichment data from Wikidata REST API
    slots:
      - wikidata_entity_id
      - api_metadata
      - wikidata_labels
      - wikidata_label_nl
      - wikidata_label_en
      - wikidata_descriptions
      - wikidata_description_nl
      - wikidata_description_en
      - wikidata_aliases
      - wikidata_identifiers
      - wikidata_instance_of
      - wikidata_country
      - wikidata_located_in
      - wikidata_coordinates
      - wikidata_inception
      - wikidata_official_website
      - wikidata_image
      - wikidata_logo
      - wikidata_sitelinks
      - wikidata_claims
  WikidataApiMetadata:
    description: API request metadata from Wikidata fetch
    slots:
      - api_endpoint
      - request_url
      - response_status
      - response_time_ms
      - fetch_timestamp
      - user_agent
      - authenticated
      - rate_limit_delay_used
  WikidataEntityReference:
    description: Reference to a Wikidata entity with labels
    slots:
      - id
      - label_en
      - label_nl
      - description_en
      - description_nl
      - instance_of
  WikidataCoordinates:
    description: Geographic coordinates from Wikidata
    slots:
      - latitude
      - longitude
      - precision
      - globe
  WikidataTime:
    description: Time value from Wikidata
    slots:
      - time
      - precision
      - calendarmodel
  WikidataIdentifiers:
    description: External identifiers from Wikidata
    slots:
      - gnd
      - viaf
      - lcnaf
      - geonames
      - isni
      - ringgold
      - bnf
      - idref
  # ---------------------------------------------------------------------------
  # NAN ISIL ENRICHMENT
  # ---------------------------------------------------------------------------
  NANISILEnrichment:
    description: Enrichment from Nationaal Archief ISIL Registry
    slots:
      - source
      - source_file
      - source_url
      - data_tier
      - extraction_date
      - enrichment_timestamp
      - enrichment_method
      - match_confidence
      - isil_code
      - nan_name
      - nan_city
      - nan_toegekend_op
      - nan_opmerking
  # ---------------------------------------------------------------------------
  # URL ENRICHMENT
  # ---------------------------------------------------------------------------
  URLEnrichment:
    description: URL discovery enrichment
    slots:
      - discovered_url
      - search_query
      - enrichment_timestamp
      - enrichment_method
      - status
  # ---------------------------------------------------------------------------
  # WEB ENRICHMENT
  # ---------------------------------------------------------------------------
  WebEnrichment:
    description: Website archiving and scraping enrichment
    slots:
      - web_archives
      - full_site_archive_timestamp
  WebArchive:
    description: Archived website information
    slots:
      - url
      - directory
      - pages_archived
      - archive_method
      - warc_file
      - warc_size_bytes
      - warc_format
  # ---------------------------------------------------------------------------
  # WEB CLAIMS
  # ---------------------------------------------------------------------------
  WebClaims:
    description: Claims extracted from archived websites with XPath provenance
    slots:
      - extraction_timestamp
      - source_archive
      - claims_count
      - claims
  WebClaim:
    description: A single claim extracted from a website with XPath provenance
    slots:
      - claim_type
      - claim_value
      - raw_value
      - source_url
      - retrieved_on
      - xpath
      - html_file
      - xpath_match_score
      - extraction_method
      - extraction_timestamp
  # ---------------------------------------------------------------------------
  # CUSTODIAN NAME (Consensus)
  # ---------------------------------------------------------------------------
  CustodianNameConsensus:
    description: Consensus custodian name derived from multiple sources
    slots:
      - claim_type
      - claim_value
      - source
      - confidence
      - consensus_method
      - sources_checked
      - sources_matched
      - extraction_timestamp
      - matching_sources
  MatchingSource:
    description: A source that matched in consensus derivation
    slots:
      - source
      - name
      - score
  # ---------------------------------------------------------------------------
  # IDENTIFIERS
  # ---------------------------------------------------------------------------
  Identifier:
    description: An identifier for the institution
    slots:
      - identifier_scheme
      - identifier_value
      - identifier_url
      - assigned_date
      - source
  # ---------------------------------------------------------------------------
  # GHCID (Global Heritage Custodian ID)
  # ---------------------------------------------------------------------------
  GHCIDInfo:
    description: GHCID persistent identifier information
    slots:
      - ghcid_current
      - ghcid_original
      - ghcid_uuid
      - ghcid_uuid_sha256
      - ghcid_numeric
      - record_id
      - generation_timestamp
      - ghcid_history
      - location_resolution
      - geonames_id
  GHCIDHistoryEntry:
    description: Historical GHCID entry for tracking changes
    slots:
      - ghcid
      - ghcid_numeric
      - valid_from
      - valid_to
      - reason
    # The global `ghcid` slot has range GHCIDInfo (it is the root-level GHCID
    # container). Without this override every history entry would embed a
    # whole GHCIDInfo record, which itself contains `ghcid_history`.
    slot_usage:
      ghcid:
        description: GHCID string recorded by this history entry
        range: string
        inlined: false
  LocationResolution:
    description: GeoNames location resolution information
    slots:
      - method
      - geonames_id
      - geonames_name
      - feature_code
      - admin1_code
      - region_code
      - country_code
      - source_coordinates
      - distance_km
  SourceCoordinates:
    description: Source coordinates used for location resolution
    slots:
      - latitude
      - longitude
      - source
  # ---------------------------------------------------------------------------
  # PROVENANCE
  # ---------------------------------------------------------------------------
  Provenance:
    description: Provenance and data quality tracking
    slots:
      - schema_version
      - generated_at
      - sources
      - data_tier_summary
      - notes
  ProvenanceSources:
    description: Sources used in enrichment
    slots:
      - google_maps
      - wikidata_api
      - website_scrape
  ProvenanceSourceEntry:
    description: A single provenance source entry
    slots:
      - source_type
      - api_endpoint
      - place_id
      - entity_id
      - claims_extracted
  DataTierSummary:
    description: Summary of data tier classifications
    slots:
      - tier_1_authoritative
      - tier_2_verified
      - tier_3_crowd_sourced
      - tier_4_inferred
# =============================================================================
# SLOTS
# =============================================================================
# Global slot definitions. Classes reference these by name, so a slot's range
# and cardinality apply to every class that lists it unless the class
# overrides them with slot_usage.
slots:
  # Root level slots
  entry_index:
    description: Index number from original NDE CSV
    range: integer
  processing_timestamp:
    description: When entry was processed
    range: datetime
  enrichment_status:
    description: Status of enrichment (enriched, pending, failed)
    range: string
  enrichment_timestamp:
    description: When enrichment was performed
    range: datetime
  enrichment_source:
    description: Source of enrichment data
    range: string
  # Original entry slots
  plaatsnaam_bezoekadres:
    description: City/town from original CSV
    range: string
  straat_en_huisnummer_bezoekadres:
    description: Street address from original CSV
    range: string
  organisatie:
    description: Organization name from original CSV
    range: string
  koepelorganisatie:
    description: Parent organization from original CSV
    range: string
  webadres_organisatie:
    description: Website URL from original CSV
    range: string
  type_organisatie:
    description: Organization type from original CSV
    range: string
  systeem:
    description: Collection management system from original CSV
    range: string
  isil_code_na:
    description: ISIL code from original CSV
    range: string
  collectie_nederland:
    description: Collectie Nederland participation
    range: string
  museum_register:
    description: Museum register participation
    range: string
  rijkscollectie:
    description: Rijkscollectie participation
    range: string
  wikidata_id:
    description: Wikidata ID from original CSV
    range: string
  # Shared by OriginalEntry, Organization, DigitalSystem, Publication,
  # Collection and Activity; it is multivalued in all of them.
  type:
    description: Institution type codes (list)
    range: string
    multivalued: true
  opmerkingen:
    description: Remarks from original CSV
    range: string
  # Organization slots
  name:
    description: Name of the entity
    range: string
  alternative_names:
    description: Alternative names or aliases
    range: string
    multivalued: true
  institution_type:
    description: GLAMORCUBESFIXPHDNT institution type code
    range: string
  description:
    description: Description text
    range: string
  legal_form:
    description: Legal form (vereniging, stichting, etc.)
    range: string
  kvk_number:
    description: Dutch Chamber of Commerce number
    range: string
  # Location slots
  location_name:
    description: Name of location/venue
    range: string
  street_address:
    description: Street address
    range: string
  postal_code:
    description: Postal code
    range: string
  city:
    description: City name
    range: string
  municipality:
    description: Municipality name
    range: string
  sublocality:
    description: Sublocality or district
    range: string
  province:
    description: Province name
    range: string
  country:
    description: Country code (ISO 3166-1)
    range: string
  coordinates:
    description: Geographic coordinates
    range: Coordinates
  latitude:
    description: Latitude coordinate
    range: float
  longitude:
    description: Longitude coordinate
    range: float
  # Also listed by WikidataTime, where it shares this float range.
  precision:
    description: Coordinate precision
    range: float
  globe:
    description: Reference globe (typically Earth)
    range: string
  # Contact slots
  address:
    description: Full address string
    range: string
  email:
    description: Email address
    range: string
  phone:
    description: Phone number
    range: string
  website:
    description: Website URL
    range: uri
  iban:
    description: IBAN bank account number
    range: string
  # Digital presence slots
  social_media:
    description: Social media accounts
    range: SocialMediaAccount
    multivalued: true
    inlined_as_list: true
  systems:
    description: Digital systems used
    range: DigitalSystem
    multivalued: true
    inlined_as_list: true
  platform:
    description: Platform name
    range: string
  url:
    description: URL
    range: uri
  username:
    description: Username on platform
    range: string
  # Opening hours slots
  venue_hours:
    description: Hours for specific venues
    range: VenueHours
    inlined: true
  # Per-day slots of VenueHours; each value is a plain string.
  monday:
    range: string
  tuesday:
    range: string
  wednesday:
    range: string
  thursday:
    range: string
  friday:
    range: string
  saturday:
    range: string
  sunday:
    range: string
  note:
    range: string
  # Collection/publication slots
  publications:
    description: Publications by the institution
    range: Publication
    multivalued: true
    inlined_as_list: true
  collections:
    description: Collections held
    range: Collection
    multivalued: true
    inlined_as_list: true
  temporal_coverage:
    description: Time period covered
    range: string
  extent:
    description: Size/extent of collection
    range: string
  # Historical slots
  historical_documentation:
    description: Historical documentation projects
    range: HistoricalDocumentation
    multivalued: true
    inlined_as_list: true
  historical_sites:
    description: Historical sites associated with institution
    range: string
    multivalued: true
  activities:
    description: Activities and services
    range: Activity
    multivalued: true
    inlined_as_list: true
  # NDE metadata
  nde_metadata:
    description: NDE-specific metadata
    range: OriginalEntry
    inlined: true
  # Google Maps enrichment slots
  google_maps_enrichment:
    description: Google Maps enrichment data
    range: GoogleMapsEnrichment
    inlined: true
  place_id:
    description: Google Maps place ID
    range: string
  business_status:
    description: Business operational status
    range: string
  rating:
    description: Google Maps rating
    range: float
  total_ratings:
    description: Total number of ratings
    range: integer
  photo_count:
    description: Number of photos
    range: integer
  # Wikidata enrichment slots
  wikidata_enrichment:
    description: Wikidata enrichment data
    range: WikidataEnrichment
    inlined: true
  wikidata_entity_id:
    description: Wikidata entity ID (Q-number)
    range: string
  api_metadata:
    description: API request metadata
    range: WikidataApiMetadata
    inlined: true
  api_endpoint:
    description: API endpoint URL
    range: string
  request_url:
    description: Full request URL
    range: string
  response_status:
    description: HTTP response status
    range: integer
  response_time_ms:
    description: Response time in milliseconds
    range: float
  fetch_timestamp:
    description: When data was fetched
    range: datetime
  user_agent:
    description: User agent string used
    range: string
  authenticated:
    description: Whether request was authenticated
    range: boolean
  rate_limit_delay_used:
    description: Rate limit delay in seconds
    range: float
  # Wikidata label/description slots
  wikidata_labels:
    description: Labels in multiple languages
    # Note: This is actually a dict in source data
    range: string
  wikidata_label_nl:
    description: Dutch label
    range: string
  wikidata_label_en:
    description: English label
    range: string
  wikidata_descriptions:
    description: Descriptions in multiple languages
    # Note: This is actually a dict in source data
    range: string
  wikidata_description_nl:
    description: Dutch description
    range: string
  wikidata_description_en:
    description: English description
    range: string
  wikidata_aliases:
    description: Aliases in multiple languages
    # Note: This is actually a dict of lists in source data
    range: string
  # Wikidata identifier slots
  wikidata_identifiers:
    description: External identifiers from Wikidata
    range: WikidataIdentifiers
    inlined: true
  gnd:
    description: GND (German National Library) ID
    range: string
  viaf:
    description: VIAF ID
    range: string
  lcnaf:
    description: Library of Congress Name Authority File ID
    range: string
  geonames:
    description: GeoNames ID
    range: string
  isni:
    description: ISNI ID
    range: string
  ringgold:
    description: Ringgold ID
    range: string
  bnf:
    description: BnF (French National Library) ID
    range: string
  idref:
    description: IdRef ID
    range: string
  # Wikidata entity reference slots
  wikidata_instance_of:
    description: Instance of (P31) values
    range: WikidataEntityReference
    multivalued: true
    inlined_as_list: true
  wikidata_country:
    description: Country (P17) value
    range: WikidataEntityReference
    inlined: true
  wikidata_located_in:
    description: Located in (P131) value
    range: WikidataEntityReference
    inlined: true
  wikidata_coordinates:
    description: Coordinates from Wikidata
    range: WikidataCoordinates
    inlined: true
  wikidata_inception:
    description: Inception date
    range: WikidataTime
    inlined: true
  wikidata_official_website:
    description: Official website from Wikidata
    range: uri
  wikidata_image:
    description: Wikimedia Commons image filename
    range: string
  wikidata_logo:
    description: Wikimedia Commons logo filename
    range: string
  wikidata_sitelinks:
    description: Wikipedia sitelinks
    # Note: This is actually a dict in source data
    range: string
  wikidata_claims:
    description: Additional Wikidata claims
    # Note: This is actually a complex nested dict in source data
    range: string
  # Wikidata entity reference internal slots
  id:
    description: Entity ID (Q-number or P-number)
    range: string
  label_en:
    description: English label
    range: string
  label_nl:
    description: Dutch label
    range: string
  description_en:
    description: English description
    range: string
  description_nl:
    description: Dutch description
    range: string
  instance_of:
    description: Instance of values (Q-numbers)
    range: string
    multivalued: true
  # Wikidata time slots
  time:
    description: ISO 8601 time value
    range: string
  calendarmodel:
    description: Calendar model URI
    range: string
  # NAN ISIL enrichment slots
  nan_isil_enrichment:
    description: Nationaal Archief ISIL enrichment
    range: NANISILEnrichment
    inlined: true
  source:
    description: Source identifier
    range: string
  source_file:
    description: Source file path
    range: string
  source_url:
    description: Source URL
    range: uri
  data_tier:
    description: Data tier classification
    range: string
  extraction_date:
    description: When data was extracted
    range: datetime
  enrichment_method:
    description: Method used for enrichment
    range: string
  match_confidence:
    description: Confidence score for match
    range: float
  isil_code:
    description: ISIL code
    range: string
  nan_name:
    description: Name from NAN registry
    range: string
  nan_city:
    description: City from NAN registry
    range: string
  nan_toegekend_op:
    description: Date ISIL was assigned
    range: date
  nan_opmerking:
    description: Remarks from NAN registry
    range: string
  # URL enrichment slots
  url_enrichment:
    description: URL discovery enrichment
    range: URLEnrichment
    inlined: true
  discovered_url:
    description: Discovered URL
    range: uri
  search_query:
    description: Search query used
    range: string
  status:
    description: Status of operation
    range: string
  # Web enrichment slots
  web_enrichment:
    description: Web archiving enrichment
    range: WebEnrichment
    inlined: true
  web_archives:
    description: Archived websites
    range: WebArchive
    multivalued: true
    inlined_as_list: true
  full_site_archive_timestamp:
    description: When full site was archived
    range: datetime
  directory:
    description: Archive directory path
    range: string
  pages_archived:
    description: Number of pages archived
    range: integer
  archive_method:
    description: Archive method used
    range: string
  warc_file:
    description: WARC filename
    range: string
  warc_size_bytes:
    description: WARC file size in bytes
    range: integer
  warc_format:
    description: WARC format standard
    range: string
  # Web claims slots
  web_claims:
    description: Claims extracted from website
    range: WebClaims
    inlined: true
  source_archive:
    description: Source archive directory
    range: string
  claims_count:
    description: Number of claims
    range: integer
  claims:
    description: List of claims
    range: WebClaim
    multivalued: true
    inlined_as_list: true
  claim_type:
    description: Type of claim
    range: string
  claim_value:
    description: Extracted claim value
    range: string
  raw_value:
    description: Raw value before processing
    range: string
  retrieved_on:
    description: When source was retrieved
    range: datetime
  xpath:
    description: XPath to source element
    range: string
  html_file:
    description: Source HTML file path
    range: string
  xpath_match_score:
    description: XPath match confidence score
    range: float
  extraction_method:
    description: Method used for extraction
    range: string
  extraction_timestamp:
    description: When extraction occurred
    range: datetime
  # Custodian name consensus slots
  custodian_name:
    description: Consensus custodian name
    range: CustodianNameConsensus
    inlined: true
  confidence:
    description: Confidence score
    range: float
  # NOTE(review): the slot name reads like a method label, yet the range is
  # boolean and the description is a yes/no statement. Confirm against the
  # enriched entries which of the two is intended.
  consensus_method:
    description: Whether consensus was achieved
    range: boolean
  sources_checked:
    description: Number of sources checked
    range: integer
  sources_matched:
    description: Number of sources that matched
    range: integer
  matching_sources:
    description: Sources that matched
    range: MatchingSource
    multivalued: true
    inlined_as_list: true
  score:
    description: Match score
    range: float
  # Identifier slots
  identifiers:
    description: Institution identifiers
    range: Identifier
    multivalued: true
    inlined_as_list: true
  identifier_scheme:
    description: Identifier scheme (ISIL, GHCID, Wikidata, etc.)
    range: string
  identifier_value:
    description: Identifier value
    range: string
  identifier_url:
    description: URL for identifier
    range: uri
  assigned_date:
    description: When identifier was assigned
    range: date
  # GHCID slots
  # Also listed by GHCIDHistoryEntry, which therefore inherits this
  # GHCIDInfo range unless that class overrides it.
  ghcid:
    description: GHCID information
    range: GHCIDInfo
    inlined: true
  ghcid_current:
    description: Current GHCID string
    range: string
  ghcid_original:
    description: Original GHCID string
    range: string
  ghcid_uuid:
    description: GHCID as UUID v5
    range: string
  ghcid_uuid_sha256:
    description: GHCID as UUID v8 (SHA-256)
    range: string
  ghcid_numeric:
    description: GHCID as 64-bit numeric
    range: integer
  record_id:
    description: Database record UUID v7
    range: string
  generation_timestamp:
    description: When GHCID was generated
    range: datetime
  ghcid_history:
    description: GHCID change history
    range: GHCIDHistoryEntry
    multivalued: true
    inlined_as_list: true
  location_resolution:
    description: Location resolution details
    range: LocationResolution
    inlined: true
  geonames_id:
    description: GeoNames ID for location
    range: integer
  # GHCID history slots
  valid_from:
    description: Start of validity period
    range: datetime
  valid_to:
    description: End of validity period
    range: datetime
  reason:
    description: Reason for change
    range: string
  # Location resolution slots
  method:
    description: Resolution method used
    range: string
  geonames_name:
    description: Name from GeoNames
    range: string
  feature_code:
    description: GeoNames feature code
    range: string
  admin1_code:
    description: Admin level 1 code
    range: string
  region_code:
    description: Region code (ISO 3166-2)
    range: string
  country_code:
    description: Country code (ISO 3166-1)
    range: string
  source_coordinates:
    description: Source coordinates used
    range: SourceCoordinates
    inlined: true
  distance_km:
    description: Distance in kilometers
    range: float
  # Provenance slots
  provenance:
    description: Provenance information
    range: Provenance
    inlined: true
  schema_version:
    description: Schema version used
    range: string
  generated_at:
    description: When record was generated
    range: datetime
  sources:
    description: Sources used
    range: ProvenanceSources
    inlined: true
  data_tier_summary:
    description: Data tier classifications
    range: DataTierSummary
    inlined: true
  notes:
    description: Provenance notes
    range: string
    multivalued: true
  # Provenance source slots
  source_type:
    description: Type of source
    range: string
  entity_id:
    description: Entity ID at source
    range: string
  claims_extracted:
    description: Claims extracted from source
    range: string
    multivalued: true
  # Slots listed by ProvenanceSources. They were referenced by that class
  # without a definition here; ProvenanceSourceEntry is the class that
  # describes a single provenance record and is used as their range.
  # NOTE(review): the list-valued shape is assumed, so confirm it against the
  # enriched entries. Also check that these are not defined further down the
  # file, where they would become duplicate keys.
  google_maps:
    description: Provenance entries for the Google Maps source
    range: ProvenanceSourceEntry
    multivalued: true
    inlined_as_list: true
  wikidata_api:
    description: Provenance entries for the Wikidata API source
    range: ProvenanceSourceEntry
    multivalued: true
    inlined_as_list: true
  website_scrape:
    description: Provenance entries for the website scrape source
    range: ProvenanceSourceEntry
    multivalued: true
    inlined_as_list: true
  # Data tier summary slots
  tier_1_authoritative:
    description: Tier 1 authoritative sources
    range: string
    multivalued: true
  tier_2_verified:
    description: Tier 2 verified sources
    range: string
    multivalued: true
  tier_3_crowd_sourced:
    description: Tier 3 crowd-sourced sources
    range: string
    multivalued: true
  tier_4_inferred:
    description: Tier 4 inferred sources
    range: string
    multivalued: true
  # Cross-reference slots for nested structures
  original_entry:
    description: Original CSV entry data
    range: OriginalEntry
    inlined: true
  organization:
    description: Organization information
    range: Organization
    inlined: true
  location:
    description: Location information
    range: Location
    inlined: true
  contact:
    description: Contact information
    range: Contact
    inlined: true
  digital_presence:
    description: Digital presence information
    range: DigitalPresence
    inlined: true
  opening_hours:
    description: Opening hours
    range: OpeningHours
    inlined: true