glam/schemas/20251121/linkml/nde_enriched_entry.yaml
2025-12-06 19:50:04 +01:00

471 lines
14 KiB
YAML

# NDE Enriched Entry Schema
# This schema defines the structure of NDE (Netwerk Digitaal Erfgoed) enriched entry files.
# These are intermediate working files containing original CSV data + enrichments from
# Wikidata, Google Maps, OpenStreetMap, and other sources.
#
# Use: linkml-convert -s nde_enriched_entry.yaml -t json input.yaml -o output.json
# --- Schema identity ---------------------------------------------------------
id: https://nde.nl/ontology/enriched-entry
name: nde_enriched_entry
title: NDE Enriched Entry Schema
# Literal block scalar (|) so the bullet list stays one item per line.
description: |
  Schema for NDE (Netwerk Digitaal Erfgoed) enriched heritage custodian entries.
  These entries contain:
  - Original CSV data from NDE bronhouder spreadsheet
  - Wikidata enrichment (labels, descriptions, claims, coordinates)
  - Google Maps enrichment (ratings, reviews, place data)
  - OpenStreetMap enrichment (OSM IDs, tags)
  - GHCID persistent identifier assignment
  - Location resolution from GeoNames
  - Web scraping enrichment (website claims with XPath provenance)
# Quoted so the version is always read as a string.
version: '1.0.0'
license: 'https://creativecommons.org/licenses/by-sa/4.0/'

# --- CURIE prefixes ----------------------------------------------------------
prefixes:
  linkml: 'https://w3id.org/linkml/'
  schema: 'http://schema.org/'
  wd: 'http://www.wikidata.org/entity/'
  wdt: 'http://www.wikidata.org/prop/direct/'
  geo: 'http://www.w3.org/2003/01/geo/wgs84_pos#'
  prov: 'http://www.w3.org/ns/prov#'

# Attributes that do not declare a `range` fall back to this type.
default_range: string

# Built-in LinkML types; this schema uses integer, float, boolean, datetime
# and uri as ranges.
imports:
  - linkml:types
classes:
# Top-level record: one heritage custodian with every enrichment attached.
NDEEnrichedEntry:
  description: A single enriched heritage custodian entry from the NDE bronhouder dataset.
  # Declares this class as the root of a serialized entry document.
  tree_root: true
  attributes:
    # -- Bookkeeping --
    entry_index:
      description: Sequential index of the entry in the original CSV
      range: integer
    processing_timestamp:
      description: ISO 8601 timestamp when this entry was processed
      range: datetime
    enrichment_status:
      description: Current enrichment status
      range: EnrichmentStatusEnum

    # -- Source data --
    original_entry:
      description: Original data from NDE CSV
      range: OriginalEntry

    # -- Enrichment from external sources --
    wikidata_enrichment:
      description: Data enriched from Wikidata
      range: WikidataEnrichment
    google_maps_enrichment:
      description: Data enriched from Google Maps
      range: GoogleMapsEnrichment
    osm_enrichment:
      description: Data enriched from OpenStreetMap
      range: OSMEnrichment

    # -- Identifier and location --
    ghcid:
      description: Global Heritage Custodian Identifier assignment
      range: GHCIDAssignment
    location:
      description: Resolved location data
      range: Location
    location_resolution:
      description: GeoNames-based location resolution metadata
      range: LocationResolution

    # -- Website-derived data --
    web_enrichment:
      description: Data from website scraping with XPath provenance
      range: WebEnrichment
    custodian_name:
      description: Standardized emic name (legal form filtered)
      range: string
# Columns carried over from the NDE bronhouder spreadsheet. Attribute names
# keep the Dutch column names; attributes without a `range` use the schema's
# default_range.
OriginalEntry:
  description: Original data from NDE bronhouder CSV spreadsheet
  attributes:
    # -- Visiting address --
    plaatsnaam_bezoekadres:
      description: City/place of the visiting address
    straat_en_huisnummer_bezoekadres:
      description: Street and house number of visiting address

    # -- Organisation --
    organisatie:
      description: Organization name
    webadres_organisatie:
      description: Website URL
      range: uri
    type_organisatie:
      description: Type of organization (museum, archief, bibliotheek, etc.)
    systeem:
      description: Collection management system used

    # -- Programme / registry columns --
    versnellen:
      description: Part of Versnellen digitization project
    museum_register:
      description: Registered in Museum Register
    in_scope_voor_dc4eu:
      description: In scope for DC4EU project
    linked_data:
      description: Publishes linked data
    datasetregister:
      description: Dataset register category
    versnellen_project:
      description: Specific Versnellen project

    # -- Identifiers and classification --
    wikidata_id:
      description: Wikidata entity ID from original CSV
    type:
      description: GLAMORCUBESFIXPHDNT type codes
      # An entry may carry more than one type code.
      multivalued: true
# Everything fetched from Wikidata for one custodian: labels, descriptions,
# aliases, selected claims and the metadata of the API call itself.
WikidataEnrichment:
  description: Enrichment data from Wikidata REST API
  attributes:
    wikidata_entity_id:
      description: Wikidata Q-ID
      # Single-quoted so the regex is taken literally.
      pattern: '^Q[0-9]+$'
    api_metadata:
      description: API request metadata
      range: APIMetadata

    # -- Labels / descriptions / aliases --
    # `inlined_as_list` was dropped from the string-ranged slots below:
    # inlining only applies when the range is a class, so the flag had no
    # effect there.
    # NOTE(review): the descriptions ("in multiple languages", "by language")
    # suggest the enriched files may hold language-keyed mappings; a
    # multivalued string slot only accepts a list of strings. Confirm against
    # real entry files.
    wikidata_labels:
      description: Labels in multiple languages
      range: string
      multivalued: true
    wikidata_label_nl:
      description: Dutch label
    wikidata_label_en:
      description: English label
    wikidata_descriptions:
      description: Descriptions in multiple languages
      range: string
      multivalued: true
    wikidata_description_nl:
      description: Dutch description
    wikidata_description_en:
      description: English description
    wikidata_aliases:
      description: Alternative names by language
      range: string
      multivalued: true

    # -- Claims resolved to structured values --
    wikidata_instance_of:
      description: Instance of (P31) claims
      range: WikidataEntity
      multivalued: true
      # Class-ranged slot: entities are embedded as a list of objects.
      inlined_as_list: true
    wikidata_country:
      description: Country (P17) claim
      range: WikidataEntity
    wikidata_located_in:
      description: Located in (P131) claim
      range: WikidataEntity
    wikidata_coordinates:
      description: Coordinate location (P625)
      range: Coordinates
    wikidata_inception:
      description: Inception/founding date (P571)
      range: WikidataTime
    wikidata_official_website:
      description: Official website (P856)
      range: uri
    wikidata_image:
      description: Image filename (P18)
    wikidata_sitelinks:
      description: Wikipedia and other sitelinks
      range: string
      multivalued: true
    wikidata_claims:
      description: Additional Wikidata claims
      range: WikidataClaims
# Lightweight reference to a Wikidata item, used as the value of entity-valued
# claims.
WikidataEntity:
  description: A Wikidata entity reference with labels
  attributes:
    id:
      description: Wikidata Q-ID
      # Single-quoted so the regex is taken literally.
      pattern: '^Q[0-9]+$'
    label_en:
      description: English label
    label_nl:
      description: Dutch label
    description_en:
      description: English description
    description_nl:
      description: Dutch description
    instance_of:
      description: Instance of IDs
      range: string
      # `inlined_as_list` removed: inlining only applies to class ranges, so
      # it had no effect on this list of plain strings.
      multivalued: true
# Coordinate value as returned for a Wikidata coordinate claim.
Coordinates:
  description: Geographic coordinates from Wikidata
  attributes:
    latitude:
      range: float
    longitude:
      range: float
    # Float here; WikidataTime has its own, integer-valued `precision`.
    precision:
      range: float
    globe:
      description: Globe entity (usually Q2 for Earth)
      range: uri
# Time value of a Wikidata claim (e.g. an inception date).
WikidataTime:
  description: Wikidata time value
  attributes:
    # No explicit range: stored as a string via the schema's default_range.
    time:
      description: ISO 8601 time string with Wikidata prefix
    # Integer code, unlike the float `precision` on Coordinates.
    precision:
      description: Precision level (9=year, 10=month, 11=day)
      range: integer
    calendarmodel:
      description: Calendar model (usually Gregorian)
      range: uri
# Catch-all for Wikidata claims that have no dedicated slot on
# WikidataEnrichment.
WikidataClaims:
  description: Container for additional Wikidata claims
  attributes:
    commons_category:
      description: Wikimedia Commons category
    postal_code:
      description: Postal code from Wikidata
    phone:
      description: Phone number from Wikidata
    described_at_url:
      description: URLs describing this entity
      range: uri
      # `inlined_as_list` removed: `uri` is a type, not a class, and inlining
      # only applies to class ranges.
      multivalued: true
# Record of one API call made during enrichment.
APIMetadata:
  description: Metadata about API requests
  attributes:
    # -- Request --
    api_endpoint:
      range: uri
    request_url:
      range: uri
    user_agent:
      description: User agent string used
    authenticated:
      range: boolean
    rate_limit_delay_used:
      range: float

    # -- Response --
    response_status:
      range: integer
    response_time_ms:
      range: float
    fetch_timestamp:
      range: datetime
# Place data retrieved from Google Maps for one custodian.
GoogleMapsEnrichment:
  description: Enrichment data from Google Maps Places API
  attributes:
    # -- Identity --
    place_id:
      description: Google Place ID
    name:
      description: Name from Google Maps
    formatted_address:
      description: Full formatted address

    # -- Popularity figures --
    rating:
      description: Average rating (1-5)
      range: float
    user_ratings_total:
      description: Total number of ratings
      range: integer
    reviews_count:
      description: Number of reviews
      range: integer
    photo_count:
      description: Number of photos
      range: integer

    # -- Operational details --
    business_status:
      description: Business status (OPERATIONAL, CLOSED_TEMPORARILY, etc.)
    # `inlined_as_list` removed from the two string lists below: inlining
    # only applies to class ranges, so the flag had no effect.
    opening_hours:
      description: Opening hours
      range: string
      multivalued: true
    website:
      range: uri
    phone:
      description: Phone number
    types:
      description: Google place types
      range: string
      multivalued: true

    # -- Position --
    latitude:
      range: float
    longitude:
      range: float
# Data retrieved from OpenStreetMap for one custodian.
OSMEnrichment:
  description: Enrichment data from OpenStreetMap
  attributes:
    # -- Element identity --
    osm_id:
      description: OpenStreetMap ID
    osm_type:
      description: OSM element type (node, way, relation)
      range: OSMTypeEnum
    name:
      description: Name from OSM

    # -- Individual tag values --
    amenity:
      description: Amenity tag value
    building:
      description: Building tag value
    heritage:
      description: Heritage tag value
    wikidata:
      description: Wikidata tag from OSM
    wikipedia:
      description: Wikipedia tag from OSM
    website:
      range: uri

    # -- Everything else --
    # `inlined_as_list` removed: inlining only applies to class ranges, so
    # it had no effect on this list of plain strings.
    # NOTE(review): OSM tags are key/value pairs; confirm how the enriched
    # files encode them, since this slot accepts a flat list of strings.
    osm_tags:
      description: Additional OSM tags
      range: string
      multivalued: true
# The GHCID in its different representations, plus how it was generated.
GHCIDAssignment:
  description: Global Heritage Custodian Identifier assignment
  attributes:
    # -- Identifier forms --
    ghcid_string:
      description: Human-readable GHCID (e.g., NL-DR-BOR-M-HC)
    ghcid_uuid:
      description: UUID v5 derived from GHCID string
    ghcid_uuid_sha256:
      description: UUID v8 (SHA-256) for future-proofing
    ghcid_numeric:
      description: 64-bit numeric identifier
      range: integer

    # -- Generation details --
    generation_timestamp:
      range: datetime
    collision_resolution:
      description: Method used if collision occurred
# Final, resolved location of a custodian.
Location:
  description: Resolved geographic location
  attributes:
    # -- Postal address --
    city:
      description: City/settlement name
    address:
      description: Full street address
    postal_code:
      description: Postal code
    region:
      description: Province/state/region
    country:
      description: ISO 3166-1 alpha-2 country code
      # Exactly two upper-case letters; single-quoted so the regex is literal.
      pattern: '^[A-Z]{2}$'

    # -- Position --
    latitude:
      range: float
    longitude:
      range: float
# How the settlement behind a custodian's location was determined via GeoNames.
LocationResolution:
  description: GeoNames-based location resolution metadata
  attributes:
    method:
      description: Resolution method used
      range: LocationResolutionMethodEnum

    # -- GeoNames record --
    geonames_id:
      description: GeoNames ID of resolved settlement
      range: integer
    geonames_name:
      description: Settlement name from GeoNames
    feature_code:
      description: GeoNames feature code (PPL, PPLA, etc.)

    # -- Codes --
    settlement_code:
      description: 3-letter settlement code for GHCID
    admin1_code:
      description: GeoNames admin1 code
    region_code:
      description: ISO 3166-2 region code
    # NOTE(review): Location.country constrains the same kind of value with
    # the pattern ^[A-Z]{2}$; this slot has no pattern. Confirm whether that
    # difference is intended.
    country_code:
      description: ISO 3166-1 alpha-2 country code

    resolution_date:
      range: datetime
# Result of scraping a custodian's website: kept claims, dropped claims and
# where the snapshot lives.
WebEnrichment:
  description: Data from website scraping with XPath provenance
  attributes:
    # Both claim lists embed full WebClaim objects as YAML/JSON lists.
    claims:
      description: Verified claims with XPath provenance
      range: WebClaim
      multivalued: true
      inlined_as_list: true
    removed_unverified_claims:
      description: Claims removed due to missing XPath verification
      range: WebClaim
      multivalued: true
      inlined_as_list: true

    # -- Scrape bookkeeping --
    scrape_timestamp:
      range: datetime
    html_file:
      description: Path to archived HTML file
# One value lifted from a web page, together with where it was found.
WebClaim:
  description: A claim extracted from a webpage with XPath provenance
  attributes:
    # -- What was claimed --
    claim_type:
      description: Type of claim (full_name, description, email, etc.)
    claim_value:
      description: The extracted value

    # -- Where it came from --
    source_url:
      range: uri
    retrieved_on:
      range: datetime
    xpath:
      description: XPath to the element containing this value
    html_file:
      description: Relative path to archived HTML file
    xpath_match_score:
      description: Match confidence (1.0 = exact)
      range: float
# Wrapper that bundles many entries into one document.
NDEEnrichedEntryCollection:
  description: Collection of NDE enriched entries for batch processing
  attributes:
    # Entries are embedded in full, as a list of NDEEnrichedEntry objects.
    entries:
      range: NDEEnrichedEntry
      multivalued: true
      inlined_as_list: true

    # -- Batch bookkeeping --
    generated_at:
      range: datetime
    entry_count:
      range: integer
    source_directory:
      description: Source directory path
enums:
# Allowed values for NDEEnrichedEntry.enrichment_status.
EnrichmentStatusEnum:
  permissible_values:
    pending:
      description: Not yet enriched
    enriched:
      description: Successfully enriched
    partial:
      description: Partially enriched (some sources failed)
    failed:
      description: Enrichment failed
    unknown:
      description: Status unknown
# Allowed values for OSMEnrichment.osm_type.
OSMTypeEnum:
  permissible_values:
    node:
      description: OSM node
    way:
      description: OSM way
    relation:
      description: OSM relation
# Allowed values for LocationResolution.method.
LocationResolutionMethodEnum:
  permissible_values:
    REVERSE_GEOCODE:
      description: Resolved by reverse geocoding coordinates
    NAME_LOOKUP:
      description: Resolved by name lookup in GeoNames
    MANUAL:
      description: Manually assigned
    INHERITED:
      description: Inherited from parent entry