glam/schemas/20251121/linkml/modules/classes/WebEnrichment.yaml
kempersc 626bd3a095 refactor(schemas): apply naming conventions to 261 class files
- Apply Rule 39: RiC-O style hasOrHad*/isOrWas* for temporal slots
- Apply Rule 43: Singular noun convention (keywords → keyword)
- Update slot references to match renamed slot files
- Maintain schema integrity across all class definitions
2026-01-10 15:36:33 +01:00

185 lines
6 KiB
YAML

# WebEnrichment - Web archive metadata
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: 2026-01-08
# Schema identity: `id` is the schema URI; `name` and `title` label it.
id: https://nde.nl/ontology/hc/classes/WebEnrichment
name: WebEnrichment
title: WebEnrichment

# CURIE prefix -> namespace URI map. Each entry must be nested under
# `prefixes`; at column 0 they would be read as unrelated top-level keys
# and `prefixes` itself would be null.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  xsd: http://www.w3.org/2001/XMLSchema#

# Imported modules: the LinkML built-in types plus the sibling class
# modules whose classes are used as attribute ranges in this file.
imports:
  - linkml:types
  - ./DuplicateEntry
  - ./OrganizationalChange
  - ./RawSource
  - ./WebArchive
  - ./WebArchiveFailure
  - ./WebClaim
  - ./WebCollection

# Range used for any attribute that does not declare its own `range`.
default_range: string
# Class definitions for this module. Nesting is
# classes -> <ClassName> -> attributes -> <attribute name> -> slot properties.
classes:
  WebEnrichment:
    description: Web archive metadata
    attributes:
      # --- Archive snapshots and their timestamps ---
      web_archives:
        range: WebArchive
        multivalued: true
        inlined_as_list: true
      full_site_archive_timestamp:
        range: datetime
      web_archive_timestamp:
        range: datetime
        description: When the web archive was created (alternative key)

      # --- Enrichment run bookkeeping ---
      enrichment_timestamp:
        range: datetime
        description: When enrichment was performed
      enrichment_method:
        range: string
        description: Method used (website_scrape_with_claims, exa_search, etc.)
      source_url:
        range: uri
        description: URL that was enriched
      # `web_enrichment_status` and `enrichment_status` are both declared;
      # the latter is described as an alternative key for the same status.
      web_enrichment_status:
        range: string
        description: Status of enrichment
      enrichment_status:
        range: string
        description: Status of enrichment (alternative key)
      enrichment_id:
        range: string
        description: Unique ID for this enrichment session
      enrichment_notes:
        range: string
        description: Notes about the enrichment

      # --- Search that located the source ---
      search_query:
        range: string
        description: Query used to search
      search_timestamp:
        range: datetime
        description: When search was performed
      search_engine:
        range: string
        description: Search engine used (exa, google, etc.)
      source_method:
        range: string
        description: >-
          Method used to find source (exa_web_search_manual, google_search,
          etc.)

      # --- Extracted claims and raw material ---
      claim:
        range: WebClaim
        multivalued: true
        inlined_as_list: true
        description: Claims extracted from web content
      # NOTE(review): this and the two other `any_of` attributes below rely on
      # union ranges while the schema sets `default_range: string`. LinkML's
      # documentation shows union-ranged slots declared with `range: Any`
      # (class_uri linkml:Any); confirm validators do not also apply the
      # default string range here.
      # NOTE(review): `inlined_as_list` is placed inside the class-ranged
      # branch; confirm it was not intended at attribute level.
      raw_sources:
        any_of:
          - range: string
            multivalued: true
          - range: RawSource
            multivalued: true
            inlined_as_list: true
        description: Raw source data (can be strings or structured objects)
      xpath_provenance_added:
        any_of:
          - range: boolean
          - range: datetime
        description: Whether/when XPath provenance was added to claims
      removed_unverified_claims:
        any_of:
          - range: string
            multivalued: true
          - range: WebClaim
            multivalued: true
            inlined_as_list: true
        description: >-
          Claims removed due to lack of XPath verification (can be strings
          or full claim objects)

      # --- Archive failures and retries ---
      archive_failures:
        range: WebArchiveFailure
        multivalued: true
        inlined_as_list: true
        description: Failed archive attempts
      archive_failure_timestamp:
        range: datetime
        description: When archive failure was recorded
      retry_timestamp:
        range: datetime
        description: Timestamp for retry attempt
      platform_archive_timestamp:
        range: datetime
        description: Timestamp for platform archive

      # --- Markdown output of the scrape ---
      # NOTE(review): `inlined_as_list` is declared on a string-ranged
      # attribute; inlining normally concerns class-ranged values, so it is
      # presumably redundant here. Kept as-is; confirm before removing.
      markdown_files:
        range: string
        multivalued: true
        inlined_as_list: true
        description: Markdown file paths from web scrape
      markdown_fetch_timestamp:
        range: datetime
        description: When markdown was fetched

      # --- Manual research results ---
      research_date:
        range: string
        description: Date research was conducted
      status:
        range: string
        description: Status (CLOSED, ACTIVE, etc.)
      website_found:
        range: boolean
        description: Whether a website was found
      official_website:
        range: uri
        description: Official website URL found during research
      research_notes:
        range: string
        description: Notes from research
      organizational_change:
        range: OrganizationalChange
        description: Organizational change information (closures, mergers, etc.)

      # --- Domain registration details (dates are string-ranged) ---
      domain:
        range: string
        description: Domain name of the website
      domain_registered:
        range: string
        description: Date domain was registered (YYYY-MM-DD)
      registrar:
        range: string
        description: Domain registrar name
      registration_country:
        range: string
        description: Country where domain is registered (ISO 3166-1 alpha-2)
      site_launched:
        range: string
        description: Year or date when site was launched

      # --- Website collections ---
      collections:
        range: WebCollection
        multivalued: true
        inlined_as_list: true
        description: Collections documented on the website

      # --- Duplicate handling ---
      is_canonical_entry:
        range: boolean
        description: Whether this is the canonical entry (vs duplicate)
      duplicate_entries:
        range: DuplicateEntry
        multivalued: true
        inlined_as_list: true
        description: References to duplicate entries of this institution

      # --- Additional research fields ---
      organization_status:
        range: string
        description: Current status of the organization (ACTIVE, CLOSED, etc.)
      research_timestamp:
        range: datetime
        description: When research was performed
      website:
        range: uri
        description: Website URL found during research

      # --- Claims migration ---
      claims_migrated:
        range: boolean
        description: Whether claims were migrated from another format
      migration_timestamp:
        range: datetime
        description: When claims migration was performed