# glam/frontend/public/schemas/20251121/linkml/modules/classes/WebEnrichment.yaml

# WebEnrichment - Web archive metadata
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: 2026-01-08

# Schema identity: `id` is the schema URI; `name` and `title` both carry the
# class name this module defines.
id: https://nde.nl/ontology/hc/classes/WebEnrichment
name: WebEnrichment
title: WebEnrichment

# CURIE prefixes available to this module.
prefixes:
  # Used by the `linkml:types` import below.
  linkml: https://w3id.org/linkml/
  # Same namespace root as this schema's `id`.
  hc: https://nde.nl/ontology/hc/
  # Used in WebEnrichment related_mappings.
  schema: http://schema.org/
  # Used for WebEnrichment class_uri and related_mappings.
  prov: http://www.w3.org/ns/prov#
  # Declared but not referenced directly in this file.
  xsd: http://www.w3.org/2001/XMLSchema#
  # Used in WebEnrichment close_mappings.
  pav: http://purl.org/pav/

imports:
  # LinkML built-in types (datetime, uri, boolean, ...).
  - linkml:types
  # Sibling class modules; each is used as a slot range by WebEnrichment.
  - ./DuplicateEntry
  - ./OrganizationalChange
  - ./RawSource
  - ./WebArchive
  - ./WebArchiveFailure
  - ./WebClaim
  - ./WebCollection

# Slots that declare no explicit range fall back to string.
default_range: string

classes:
  # WebEnrichment: a provenance-bearing record of data gathered from an
  # institution's web presence (archives, extracted claims, domain
  # registration details, organizational status).
  WebEnrichment:
    # Literal block scalar (|-) so the bullet list below keeps its line
    # structure; a folded scalar would merge the heading with the first bullet.
    description: |-
      Web archive metadata and enrichment data extracted from institutional websites.
      This class captures web scraping results, archived pages, extracted claims,
      domain registration info, and organizational status discovered through web research.

      Ontology mapping rationale:
      - class_uri is prov:Entity because this represents enrichment DATA with provenance
        (timestamps, methods, sources), not the institution or website itself
      - close_mappings includes pav:sourceAccessedAt because this captures when/how
        a web source was accessed and archived
      - related_mappings includes schema:WebPage because the enrichment is derived from
        web pages, though this class models the extracted data not the page itself
    class_uri: prov:Entity
    close_mappings:
      # PAV term names are lowerCamelCase (pav:sourceAccessedAt).
      - pav:sourceAccessedAt
    related_mappings:
      - schema:WebPage
      - prov:Derivation
    attributes:
      # --- Archives and enrichment session -------------------------------
      web_archives:
        range: WebArchive
        multivalued: true
        inlined_as_list: true
        description: Web archive records associated with this enrichment
      full_site_archive_timestamp:
        range: datetime
        description: Timestamp of the full-site archive
      web_archive_timestamp:
        range: datetime
        description: When the web archive was created (alternative key)
      enrichment_timestamp:
        range: datetime
        description: When enrichment was performed
      enrichment_method:
        range: string
        description: Method used (website_scrape_with_claims, exa_search, etc.)
      source_url:
        range: uri
        description: URL that was enriched
      web_enrichment_status:
        range: string
        description: Status of enrichment
      enrichment_status:
        range: string
        description: Status of enrichment (alternative key)
      enrichment_id:
        range: string
        description: Unique ID for this enrichment session
      enrichment_notes:
        range: string
        description: Notes about the enrichment

      # --- Search provenance ----------------------------------------------
      search_query:
        range: string
        description: Query used to search
      search_timestamp:
        range: datetime
        description: When search was performed
      search_engine:
        range: string
        description: Search engine used (exa, google, etc.)
      source_method:
        range: string
        description: >-
          Method used to find source (exa_web_search_manual, google_search,
          etc.)

      # --- Claims and raw sources -----------------------------------------
      # The slot name is singular although the slot is multivalued; kept as-is
      # because renaming it would change the data key.
      claim:
        range: WebClaim
        multivalued: true
        inlined_as_list: true
        description: Claims extracted from web content
      # NOTE(review): the any_of slots below inherit default_range string and
      # set `multivalued` only inside the any_of branches. LinkML guidance is
      # to give such union slots an explicit Any-typed range; confirm that the
      # generators/validators in use interpret these unions as intended.
      raw_sources:
        any_of:
          - range: string
            multivalued: true
          - range: RawSource
            multivalued: true
        inlined_as_list: true
        description: Raw source data (can be strings or structured objects)
      xpath_provenance_added:
        any_of:
          - range: boolean
          - range: datetime
        description: Whether/when XPath provenance was added to claims
      removed_unverified_claims:
        any_of:
          - range: string
            multivalued: true
          - range: WebClaim
            multivalued: true
        inlined_as_list: true
        description: >-
          Claims removed due to lack of XPath verification (can be strings
          or full claim objects)

      # --- Archive failures and retries -----------------------------------
      archive_failures:
        range: WebArchiveFailure
        multivalued: true
        inlined_as_list: true
        description: Failed archive attempts
      archive_failure_timestamp:
        range: datetime
        description: When archive failure was recorded
      retry_timestamp:
        range: datetime
        description: Timestamp for retry attempt
      platform_archive_timestamp:
        range: datetime
        description: Timestamp for platform archive

      # --- Markdown output ------------------------------------------------
      markdown_files:
        range: string
        multivalued: true
        inlined_as_list: true
        description: Markdown file paths from web scrape
      markdown_fetch_timestamp:
        range: datetime
        description: When markdown was fetched

      # --- Research findings ----------------------------------------------
      # NOTE(review): research_date, domain_registered and site_launched hold
      # date-like values but are typed as string; presumably to tolerate
      # partial or free-form dates - confirm before tightening the range.
      research_date:
        range: string
        description: Date research was conducted
      status:
        range: string
        description: Status (CLOSED, ACTIVE, etc.)
      website_found:
        range: boolean
        description: Whether a website was found
      official_website:
        range: uri
        description: Official website URL found during research
      research_notes:
        range: string
        description: Notes from research
      organizational_change:
        range: OrganizationalChange
        description: Organizational change information (closures, mergers, etc.)

      # --- Domain registration --------------------------------------------
      domain:
        range: string
        description: Domain name of the website
      domain_registered:
        range: string
        description: Date domain was registered (YYYY-MM-DD)
      registrar:
        range: string
        description: Domain registrar name
      registration_country:
        range: string
        description: Country where domain is registered (ISO 3166-1 alpha-2)
      site_launched:
        range: string
        description: Year or date when site was launched

      # --- Collections, duplicates, status --------------------------------
      collections:
        range: WebCollection
        multivalued: true
        inlined_as_list: true
        description: Collections documented on the website
      is_canonical_entry:
        range: boolean
        description: Whether this is the canonical entry (vs duplicate)
      duplicate_entries:
        range: DuplicateEntry
        multivalued: true
        inlined_as_list: true
        description: References to duplicate entries of this institution
      organization_status:
        range: string
        description: Current status of the organization (ACTIVE, CLOSED, etc.)
      research_timestamp:
        range: datetime
        description: When research was performed
      website:
        range: uri
        description: Website URL found during research

      # --- Claims migration -----------------------------------------------
      claims_migrated:
        range: boolean
        description: Whether claims were migrated from another format
      migration_timestamp:
        range: datetime
        description: When claims migration was performed