# WebEnrichment - Web archive metadata
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: 2026-01-08

# Schema identity: `id` is the schema URI; `name` and `title` both use the
# name of the single class this module declares.
id: https://nde.nl/ontology/hc/classes/WebEnrichment
name: WebEnrichment
title: WebEnrichment

# Namespace prefixes available to the CURIEs used in this schema
# (class_uri, close_mappings, related_mappings, imports).
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  xsd: http://www.w3.org/2001/XMLSchema#
  pav: http://purl.org/pav/

# linkml:types supplies the built-in types used as ranges (string, datetime,
# uri, boolean). The relative imports are sibling schema modules; each name
# matches a class used as an attribute range in WebEnrichment
# (presumably defined in the module of the same name - verify).
imports:
  - linkml:types
  - ./DuplicateEntry
  - ./OrganizationalChange
  - ./RawSource
  - ./WebArchive
  - ./WebArchiveFailure
  - ./WebClaim
  - ./WebCollection

# Attributes that do not declare a range fall back to string.
default_range: string

classes:
  # WebEnrichment: enrichment data gathered from an institution's web
  # presence. Attributes are declared inline (class-local) and cover the
  # enrichment run itself, search provenance, extracted claims, archive
  # attempts, scraped markdown, research findings, domain registration and
  # duplicate handling.
  WebEnrichment:
    description: >-
      Web archive metadata and enrichment data extracted from institutional websites.
      This class captures web scraping results, archived pages, extracted claims,
      domain registration info, and organizational status discovered through web research.

      Ontology mapping rationale:
      - class_uri is prov:Entity because this represents enrichment DATA with provenance
        (timestamps, methods, sources), not the institution or website itself
      - close_mappings includes pav:sourceAccessedAt because this captures when/how
        a web source was accessed and archived
      - related_mappings includes schema:WebPage because the enrichment is derived from
        web pages, though this class models the extracted data not the page itself
    class_uri: prov:Entity
    # PAV spells this term pav:sourceAccessedAt (lower camel case); CURIEs are
    # case-sensitive, so the capitalised form did not resolve to a PAV term.
    # NOTE(review): pav:sourceAccessedAt is a property while this is a class
    # mapping - confirm a class-to-property close mapping is intended.
    close_mappings:
      - pav:sourceAccessedAt
    # NOTE(review): prov:Derivation is not covered by the mapping rationale in
    # the description above - consider documenting why it is listed.
    related_mappings:
      - schema:WebPage
      - prov:Derivation
    attributes:
      # --- Archives and enrichment run ---
      # No description in the schema; holds a list of WebArchive objects.
      web_archives:
        range: WebArchive
        multivalued: true
        inlined_as_list: true
      # No description in the schema.
      # NOTE(review): presumably when the full-site archive was taken - confirm.
      full_site_archive_timestamp:
        range: datetime
      web_archive_timestamp:
        range: datetime
        description: When the web archive was created (alternative key)
      enrichment_timestamp:
        range: datetime
        description: When enrichment was performed
      enrichment_method:
        range: string
        description: Method used (website_scrape_with_claims, exa_search, etc.)
      source_url:
        range: uri
        description: URL that was enriched
      # Two keys carry the enrichment status; the second is documented as an
      # alternative key for the same information.
      web_enrichment_status:
        range: string
        description: Status of enrichment
      enrichment_status:
        range: string
        description: Status of enrichment (alternative key)
      enrichment_id:
        range: string
        description: Unique ID for this enrichment session
      enrichment_notes:
        range: string
        description: Notes about the enrichment

      # --- Search provenance ---
      search_query:
        range: string
        description: Query used to search
      search_timestamp:
        range: datetime
        description: When search was performed
      search_engine:
        range: string
        description: Search engine used (exa, google, etc.)
      source_method:
        range: string
        description: Method used to find source (exa_web_search_manual, google_search, etc.)

      # --- Claims and raw sources ---
      # Singular name, but multivalued: holds a list of WebClaim objects.
      claim:
        range: WebClaim
        multivalued: true
        inlined_as_list: true
        description: Claims extracted from web content
      # NOTE(review): the three any_of slots below declare no slot-level range
      # or multivalued, so default_range (string) applies at the slot level.
      # LinkML's union-range guidance pairs any_of with a slot-level
      # `range: Any` - confirm generators/validators handle these as intended.
      raw_sources:
        any_of:
          - range: string
            multivalued: true
          - range: RawSource
            multivalued: true
            inlined_as_list: true
        description: Raw source data (can be strings or structured objects)
      xpath_provenance_added:
        any_of:
          - range: boolean
          - range: datetime
        description: Whether/when XPath provenance was added to claims
      removed_unverified_claims:
        any_of:
          - range: string
            multivalued: true
          - range: WebClaim
            multivalued: true
            inlined_as_list: true
        description: Claims removed due to lack of XPath verification (can be strings or full claim objects)

      # --- Archive attempts ---
      archive_failures:
        range: WebArchiveFailure
        multivalued: true
        inlined_as_list: true
        description: Failed archive attempts
      archive_failure_timestamp:
        range: datetime
        description: When archive failure was recorded
      retry_timestamp:
        range: datetime
        description: Timestamp for retry attempt
      platform_archive_timestamp:
        range: datetime
        description: Timestamp for platform archive

      # --- Scraped markdown ---
      markdown_files:
        range: string
        multivalued: true
        inlined_as_list: true
        description: Markdown file paths from web scrape
      markdown_fetch_timestamp:
        range: datetime
        description: When markdown was fetched

      # --- Research findings ---
      # Typed as string rather than date/datetime, so no date format is enforced.
      research_date:
        range: string
        description: Date research was conducted
      status:
        range: string
        description: Status (CLOSED, ACTIVE, etc.)
      website_found:
        range: boolean
        description: Whether a website was found
      official_website:
        range: uri
        description: Official website URL found during research
      research_notes:
        range: string
        description: Notes from research
      organizational_change:
        range: OrganizationalChange
        description: Organizational change information (closures, mergers, etc.)

      # --- Domain registration ---
      domain:
        range: string
        description: Domain name of the website
      # Typed as string; the YYYY-MM-DD format is documented, not enforced.
      domain_registered:
        range: string
        description: Date domain was registered (YYYY-MM-DD)
      registrar:
        range: string
        description: Domain registrar name
      registration_country:
        range: string
        description: Country where domain is registered (ISO 3166-1 alpha-2)
      site_launched:
        range: string
        description: Year or date when site was launched

      # --- Collections and duplicate handling ---
      collections:
        range: WebCollection
        multivalued: true
        inlined_as_list: true
        description: Collections documented on the website
      is_canonical_entry:
        range: boolean
        description: Whether this is the canonical entry (vs duplicate)
      duplicate_entries:
        range: DuplicateEntry
        multivalued: true
        inlined_as_list: true
        description: References to duplicate entries of this institution

      # --- Additional research keys ---
      # NOTE(review): organization_status and website look like variants of
      # status and official_website above - confirm whether both are needed.
      organization_status:
        range: string
        description: Current status of the organization (ACTIVE, CLOSED, etc.)
      research_timestamp:
        range: datetime
        description: When research was performed
      website:
        range: uri
        description: Website URL found during research

      # --- Claims migration ---
      claims_migrated:
        range: boolean
        description: Whether claims were migrated from another format
      migration_timestamp:
        range: datetime
        description: When claims migration was performed