# glam/frontend/public/schemas/20251121/linkml/modules/classes/WebEnrichment.yaml
#
# 203 lines
# 6.9 KiB
# YAML

# WebEnrichment - Web archive metadata
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: 2026-01-08
# Schema identity: canonical IRI, machine name and display title of this module.
id: https://nde.nl/ontology/hc/classes/WebEnrichment
name: WebEnrichment
title: WebEnrichment

# CURIE prefix expansions. Every entry must be nested under `prefixes`;
# at column 0 they would be parsed as unrelated top-level schema keys.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  xsd: http://www.w3.org/2001/XMLSchema#
  pav: http://purl.org/pav/

# LinkML built-in types, followed by the sibling class modules that the
# WebEnrichment attributes use as ranges.
imports:
  - linkml:types
  - ./DuplicateEntry
  - ./OrganizationalChange
  - ./RawSource
  - ./WebArchive
  - ./WebArchiveFailure
  - ./WebClaim
  - ./WebCollection

# Range applied to any slot that does not declare one explicitly.
default_range: string
classes:
  # WebEnrichment: record of what was learned about an institution from the
  # web (archives, extracted claims, search/research metadata, domain
  # registration, duplicate tracking). Several attributes exist as
  # "alternative key" pairs; both spellings are kept on purpose.
  WebEnrichment:
    description: >-
      Web archive metadata and enrichment data extracted from institutional websites.
      This class captures web scraping results, archived pages, extracted claims,
      domain registration info, and organizational status discovered through web research.

      Ontology mapping rationale:

      - class_uri is prov:Entity because this represents enrichment DATA with provenance
      (timestamps, methods, sources), not the institution or website itself

      - close_mappings includes pav:sourceAccessedAt because this captures when/how
      a web source was accessed and archived

      - related_mappings includes schema:WebPage because the enrichment is derived from
      web pages, though this class models the extracted data not the page itself
    class_uri: prov:Entity
    # PAV spells this term in lowerCamelCase (it is a property, not a class);
    # the previous `pav:SourceAccessedAt` did not match any PAV term.
    close_mappings:
      - pav:sourceAccessedAt
    related_mappings:
      - schema:WebPage
      - prov:Derivation
    attributes:
      # --- Archive capture -------------------------------------------------
      # No description in the source for the next two slots; meaning is
      # presumed from their names and ranges -- TODO confirm with data owners.
      web_archives:
        range: WebArchive
        multivalued: true
        inlined_as_list: true
      full_site_archive_timestamp:
        range: datetime
      web_archive_timestamp:
        range: datetime
        description: When the web archive was created (alternative key)

      # --- Enrichment session ----------------------------------------------
      enrichment_timestamp:
        range: datetime
        description: When enrichment was performed
      enrichment_method:
        range: string
        description: Method used (website_scrape_with_claims, exa_search, etc.)
      source_url:
        range: uri
        description: URL that was enriched
      web_enrichment_status:
        range: string
        description: Status of enrichment
      enrichment_status:
        range: string
        description: Status of enrichment (alternative key)
      enrichment_id:
        range: string
        description: Unique ID for this enrichment session
      enrichment_notes:
        range: string
        description: Notes about the enrichment

      # --- Search provenance -----------------------------------------------
      search_query:
        range: string
        description: Query used to search
      search_timestamp:
        range: datetime
        description: When search was performed
      search_engine:
        range: string
        description: Search engine used (exa, google, etc.)
      source_method:
        range: string
        description: Method used to find source (exa_web_search_manual, google_search,
          etc.)

      # --- Claims and raw sources ------------------------------------------
      # `claim` is singular although multivalued; the name is the data key,
      # so it is left unchanged.
      claim:
        range: WebClaim
        multivalued: true
        inlined_as_list: true
        description: Claims extracted from web content
      # NOTE(review): nesting under `any_of` was reconstructed; `inlined_as_list`
      # is placed on the class-ranged branch -- confirm against the original file.
      raw_sources:
        any_of:
          - range: string
            multivalued: true
          - range: RawSource
            multivalued: true
            inlined_as_list: true
        description: Raw source data (can be strings or structured objects)
      xpath_provenance_added:
        any_of:
          - range: boolean
          - range: datetime
        description: Whether/when XPath provenance was added to claims
      # NOTE(review): same reconstruction caveat as `raw_sources` above.
      removed_unverified_claims:
        any_of:
          - range: string
            multivalued: true
          - range: WebClaim
            multivalued: true
            inlined_as_list: true
        description: Claims removed due to lack of XPath verification (can be strings
          or full claim objects)

      # --- Archive failures and retries ------------------------------------
      archive_failures:
        range: WebArchiveFailure
        multivalued: true
        inlined_as_list: true
        description: Failed archive attempts
      archive_failure_timestamp:
        range: datetime
        description: When archive failure was recorded
      retry_timestamp:
        range: datetime
        description: Timestamp for retry attempt
      platform_archive_timestamp:
        range: datetime
        description: Timestamp for platform archive

      # --- Markdown output -------------------------------------------------
      markdown_files:
        range: string
        multivalued: true
        inlined_as_list: true
        description: Markdown file paths from web scrape
      markdown_fetch_timestamp:
        range: datetime
        description: When markdown was fetched

      # --- Manual research -------------------------------------------------
      # `research_date` is a free-form string, unlike `research_timestamp`
      # further down, which is a datetime.
      research_date:
        range: string
        description: Date research was conducted
      status:
        range: string
        description: Status (CLOSED, ACTIVE, etc.)
      website_found:
        range: boolean
        description: Whether a website was found
      official_website:
        range: uri
        description: Official website URL found during research
      research_notes:
        range: string
        description: Notes from research
      organizational_change:
        range: OrganizationalChange
        description: Organizational change information (closures, mergers, etc.)

      # --- Domain registration ---------------------------------------------
      # Dates here are typed as strings, so the formats named in the
      # descriptions are not enforced by the schema.
      domain:
        range: string
        description: Domain name of the website
      domain_registered:
        range: string
        description: Date domain was registered (YYYY-MM-DD)
      registrar:
        range: string
        description: Domain registrar name
      registration_country:
        range: string
        description: Country where domain is registered (ISO 3166-1 alpha-2)
      site_launched:
        range: string
        description: Year or date when site was launched

      # --- Collections and duplicates --------------------------------------
      collections:
        range: WebCollection
        multivalued: true
        inlined_as_list: true
        description: Collections documented on the website
      is_canonical_entry:
        range: boolean
        description: Whether this is the canonical entry (vs duplicate)
      duplicate_entries:
        range: DuplicateEntry
        multivalued: true
        inlined_as_list: true
        description: References to duplicate entries of this institution

      # --- Organization status and claim migration -------------------------
      organization_status:
        range: string
        description: Current status of the organization (ACTIVE, CLOSED, etc.)
      research_timestamp:
        range: datetime
        description: When research was performed
      website:
        range: uri
        description: Website URL found during research
      claims_migrated:
        range: boolean
        description: Whether claims were migrated from another format
      migration_timestamp:
        range: datetime
        description: When claims migration was performed