glam/schemas/20251121/linkml/modules/classes/SourceRecord.yaml
2026-01-19 00:09:28 +01:00

145 lines
5.2 KiB
YAML

# SourceRecord - Individual source record with claims
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: 2026-01-08
# Schema identity for this module.
id: https://nde.nl/ontology/hc/classes/SourceRecord
name: SourceRecord
title: SourceRecord

# CURIE prefixes usable in this module. The entries must be nested under
# `prefixes:`; written at column 0 they are read as separate top-level keys
# and the prefix map itself is left empty.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  xsd: http://www.w3.org/2001/XMLSchema#
  pav: http://purl.org/pav/
  dcat: http://www.w3.org/ns/dcat#

# linkml:types provides the built-in scalar ranges (string, uri, integer,
# datetime). ../enums/DataTierEnum presumably defines the DataTierEnum used as
# the range of `data_tier` - verify against that module.
imports:
  - linkml:types
  - ../enums/DataTierEnum

# Range used by any slot or attribute that does not declare its own.
default_range: string
classes:
  # SourceRecord: provenance metadata for one extraction from one source.
  # No attribute below is marked `required`, so every field is optional.
  SourceRecord:
    # Folded scalar (>-): the blank lines are significant. They keep the
    # rationale heading and each bullet on its own line; without them the
    # whole text folds into a single paragraph.
    description: >-
      Individual source record with claims, representing a data extraction from a specific
      source (API, registry, web scrape, etc.). Contains metadata about the source type,
      data tier, fetch timestamp, and extracted claims. Used to track provenance of
      individual data points.

      Ontology mapping rationale:

      - class_uri is prov:Entity because this represents a discrete data entity with
      provenance (when fetched, from where, by what method)

      - close_mappings includes dcat:Distribution as this is similar to a specific
      manifestation/representation of data from a source

      - related_mappings includes pav:retrievedFrom conceptually (the source was retrieved)
      and prov:PrimarySource (the record may be from a primary source)
    class_uri: prov:Entity
    close_mappings:
      - dcat:Distribution
    # NOTE(review): the description above also mentions pav:retrievedFrom, but
    # only prov:PrimarySource is declared here - confirm whether that is
    # intentional.
    related_mappings:
      - prov:PrimarySource
    attributes:
      # --- Source identity and quality ---
      source_type:
        range: string
        description: Type identifier (nde_csv_registry, google_maps_api, etc.)
      data_tier:
        range: DataTierEnum
        description: Quality tier of this source
      # Typed as a plain string, whereas `retrieved_at` (which describes itself
      # as an alias of this field) is typed datetime.
      # NOTE(review): confirm the differing ranges are intentional.
      fetch_timestamp:
        range: string
        description: When data was fetched (ISO datetime string)

      # --- API / location of the source ---
      has_or_had_api_endpoint:
        range: uri
        description: API endpoint used
      # Legacy spelling kept alongside has_or_had_api_endpoint.
      api_endpoint:
        range: uri
        description: API endpoint used (alias for has_or_had_api_endpoint for backward compatibility)
      place_id:
        range: string
        description: Google Maps place ID
      data_url:
        range: uri
        description: Data source URL
      match_method:
        range: string
        description: Method used for matching

      # --- Extracted claims ---
      # NOTE(review): `Any` is not declared in this module and is not one of
      # the linkml:types scalars; it presumably comes from another schema
      # loaded together with this one (a class with class_uri linkml:Any) -
      # confirm it resolves when this module is loaded.
      claims_extracted:
        range: Any
        multivalued: true
        inlined_as_list: true
        description: List of claim fields extracted (can be strings or structured objects)

      # --- Wikidata identifiers (two alternative keys for the same ID) ---
      entity_id:
        range: string
        description: Wikidata entity ID (Q-number)
      wikidata_id:
        range: string
        description: Wikidata entity ID (Q-number) - alternative key to entity_id

      # --- Retrieval and search details ---
      source_url:
        range: uri
        description: Source URL for the data
      extraction_source:
        range: string
        multivalued: true
        inlined_as_list: true
        description: List of extraction source methods (e.g., archiveslab_llm_extraction)
      retrieved_at:
        range: datetime
        description: When data was retrieved (alias for fetch_timestamp)
      search_result:
        range: string
        description: Result of search operation (found, not_found, etc.)
      # Plural, multivalued form; the single-valued `search_query` is declared
      # further down.
      search_queries:
        range: string
        multivalued: true
        inlined_as_list: true
        description: Search queries attempted
      # Singular `note`; a separate `notes` attribute is declared further down.
      note:
        range: string
        description: Additional notes about this source record
      source_file:
        range: string
        description: Source file name
      research_date:
        range: string
        description: Date of research (YYYY-MM-DD format)
      url:
        range: uri
        description: URL of the source (website URL, etc.)
      data_extracted:
        range: string
        multivalued: true
        inlined_as_list: true
        description: List of data types/fields extracted from this source
      merge_note:
        range: string
        description: Note about merge operations involving this source record
      extraction_timestamp:
        range: string
        description: When extraction was performed (ISO datetime string)
      api_version:
        range: string
        description: API version used for the request (e.g., v1, v2)
      search_query:
        range: string
        description: Search query used to find the source
      source_urls:
        range: uri
        multivalued: true
        inlined_as_list: true
        description: Multiple source URLs (for sources with multiple pages)

      # --- Archived files and extraction method ---
      archive_path:
        range: string
        description: Path to archived source data file
      extraction_method:
        range: string
        description: Method used for data extraction (e.g., docling_pdf_table_extraction, linkup_markdown_extraction)
      notes:
        range: string
        description: Additional notes about the source record extraction
      pdf_count:
        range: integer
        description: Number of PDF files processed in this source record