glam/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml
kempersc 0d5d48568d refactor(schema): centralize slot definitions per Rule 38
- Remove slot_uri, description, mappings from slot_usage sections
- Move these properties to centralized slot files in modules/slots/
- Keep only class-specific overrides in slot_usage (required, inlined, examples)
- Update 1,499 centralized slot files with enriched definitions
- Clean 188 class files

Violations fixed:
- slot_uri in slot_usage: 1,676 → 0
- description in slot_usage: 2,287 → 0 (moved to centralized)

Schema still validates: 816 classes, 2028 slots, 127 enums
2026-01-11 23:27:17 +01:00

134 lines
5.1 KiB
YAML

# LinkML schema module that declares the ExtractionMetadata class.
id: https://nde.nl/ontology/hc/class/ExtractionMetadata
name: extraction_metadata_class
title: Extraction Metadata Class
version: '1.0.0'

# CURIE prefixes. `prov`, `schema` and `dct` are referenced by the class URI
# and mappings of ExtractionMetadata; the entries must be nested under
# `prefixes` or they are read as unrelated top-level keys.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  dct: http://purl.org/dc/terms/
  xsd: http://www.w3.org/2001/XMLSchema#

# Imported modules: LinkML built-in types, the classes and enum used as slot
# ranges, and one centralized slot file per slot listed on the class.
# NOTE(review): the role of ../metadata is not visible from this file.
imports:
- linkml:types
- ../metadata
- ./LLMResponse
- ./SpecificityAnnotation
- ./TemplateSpecificityScores
- ../enums/ProfileExtractionMethodEnum
- ../slots/extraction_agent
- ../slots/extraction_method
- ../slots/cost_usd
- ../slots/source_file
- ../slots/staff_id
- ../slots/extraction_date
- ../slots/linkedin_url
- ../slots/request_id
- ../slots/llm_response
- ../slots/specificity_annotation
- ../slots/template_specificity

# Range applied to slots that do not declare one explicitly.
default_range: string
# Class definitions of this module.
classes:
  # Provenance record for one data-extraction run, modelled as a prov:Activity.
  ExtractionMetadata:
    class_uri: prov:Activity
    # Literal block scalar: the text below is Markdown and is kept line for line.
    description: |
      Provenance metadata for data extraction activities.

      Records how, when, and by what agent data was extracted from
      external sources (LinkedIn, web scraping, APIs).

      **PROV-O Alignment**:
      - ExtractionMetadata IS a prov:Activity (the extraction process)
      - The extracted data IS the prov:Entity (output of the activity)
      - extraction_agent IS the prov:Agent (software/AI that performed extraction)
      - source_file/linkedin_url IS prov:used (input to the activity)

      **Use Cases**:
      - LinkedIn profile extractions via Exa API
      - Web scraping provenance
      - Staff list parsing provenance
      - Connection network extraction

      **Example JSON Structure**:
      ```json
      {
        "extraction_metadata": {
          "source_file": "/path/to/source.json",
          "staff_id": "org_staff_0001_name",
          "extraction_date": "2025-12-12T22:00:00Z",
          "extraction_method": "exa_crawling_exa",
          "extraction_agent": "claude-opus-4.5",
          "linkedin_url": "https://www.linkedin.com/in/...",
          "cost_usd": 0.001
        }
      }
      ```
    exact_mappings:
    - prov:Activity
    close_mappings:
    - schema:Action
    - dct:ProvenanceStatement
    # Slots carried by the class; each one is defined in its own imported
    # slot file and only refined under slot_usage below.
    slots:
    - cost_usd
    - extraction_agent
    - extraction_date
    - extraction_method
    - linkedin_url
    - llm_response
    - request_id
    - source_file
    - specificity_annotation
    - staff_id
    - template_specificity
    # Class-specific overrides only (range, required, inlined, pattern,
    # examples). Every slot needs its own nested mapping; without the nesting
    # the repeated keys collide.
    slot_usage:
      source_file:
        range: string
        examples:
        - value: /data/custodian/person/affiliated/parsed/rijksmuseum_staff_20251210T155416Z.json
          description: Path to parsed staff list JSON
      staff_id:
        range: string
        # <org slug>_staff_<rest>; the trailing `-_` in the second character
        # class is intended as a literal hyphen and a literal underscore.
        pattern: '^[a-z0-9-]+_staff_[a-z0-9-_]+$'
        examples:
        - value: rijksmuseum_staff_0042_jan_van_der_berg
          description: Staff ID with org prefix, index, and name slug
      extraction_date:
        range: datetime
        required: true
        examples:
        - value: '2025-12-12T22:00:00Z'
          description: UTC timestamp of extraction
      extraction_method:
        range: ProfileExtractionMethodEnum
        required: true
        examples:
        - value: exa_crawling_exa
          description: Extracted via Exa AI crawling API
      extraction_agent:
        range: string
        examples:
        - value: claude-opus-4.5
          description: Extracted by Claude Opus 4.5
        - value: ''
          description: Empty string for fully automated extraction
      linkedin_url:
        range: uri
        # Accepts only https://www.linkedin.com/in/<slug> with an optional
        # trailing slash; the slug is lowercase letters, digits and hyphens.
        pattern: '^https://www\.linkedin\.com/in/[a-z0-9-]+/?$'
        examples:
        - value: https://www.linkedin.com/in/jan-van-der-berg-12345
          description: LinkedIn profile URL
      cost_usd:
        range: float
        # Costs cannot be negative.
        minimum_value: 0.0
        examples:
        - value: 0.001
          description: Exa API call cost
        - value: 0.0
          description: Free extraction (cached/local)
      request_id:
        range: string
        examples:
        - value: exa_12345678-abcd-efgh-ijkl-mnopqrstuv
          description: Exa API request ID
      llm_response:
        range: LLMResponse
        required: false
        inlined: true
        examples:
        # The example value is a JSON document stored as a string.
        - value: |
            {
              "content": "Extracted institution data...",
              "reasoning_content": "Analyzing the input for LinkML schema conformity...",
              "thinking_mode": "preserved",
              "clear_thinking": false,
              "model": "glm-4.7",
              "provider": "zai",
              "created": "2025-12-23T10:30:00Z",
              "prompt_tokens": 150,
              "completion_tokens": 450,
              "total_tokens": 600,
              "finish_reason": "stop",
              "cost_usd": 0.0
            }
          description: GLM 4.7 response with Preserved Thinking for extraction
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    # NOTE(review): comments and see_also are placed at class level because
    # their text describes the class as a whole - confirm against the repo.
    comments:
    - Every person entity file MUST have extraction_metadata
    - See AGENTS.md Rule 20 for required fields
    - extraction_agent should be 'claude-opus-4.5' for manual extraction
    - cost_usd enables budget tracking for API-heavy extractions
    see_also:
    - https://www.w3.org/TR/prov-o/
    - https://docs.exa.ai/