From 21c207c9dac1b6541032eb7e8faa4f89ce920585 Mon Sep 17 00:00:00 2001 From: kempersc Date: Wed, 14 Jan 2026 13:28:33 +0100 Subject: [PATCH] Refactor schema slots and classes for improved clarity and structure - Migrated `archived_at` to `is_or_was_archived_at` in AuxiliaryDigitalPlatform, WebObservation, and other relevant classes to better reflect historical archival status. - Removed `bold_id` slot and replaced it with `has_or_had_identifier` linked to the new `BOLDIdentifier` class in BiologicalObject. - Introduced `Bookplate` and `Approver` classes to enhance provenance tracking and ownership documentation. - Updated `InformationCarrier` to replace `bookplate` with `includes_or_included` for better representation of ownership marks. - Added new slots `is_or_was_approved_by` and `is_or_was_archived_at` to capture historical approval and archival locations. - Archived old slot definitions for `archived_at` and `bold_id` to maintain schema integrity. - Enhanced LinkedIn profile extraction functionality by integrating Linkup API alongside Exa API. 
--- .../src/lib/linkml/linkml-schema-service.ts | 84 ++++++ frontend/src/pages/LinkMLViewerPage.tsx | 256 +++++++++++------- schemas/20251121/linkml/manifest.json | 2 +- .../linkml/modules/classes/Approver.yaml | 72 +++++ .../classes/AuxiliaryDigitalPlatform.yaml | 7 +- .../modules/classes/BOLDIdentifier.yaml | 108 ++++++++ .../modules/classes/BiologicalObject.yaml | 20 +- .../linkml/modules/classes/Bookplate.yaml | 119 ++++++++ .../modules/classes/InformationCarrier.yaml | 7 +- .../modules/classes/WebObservation.yaml | 10 +- .../archived_at_archived_20260115.yaml} | 0 .../bold_id_archived_20260114.yaml} | 0 .../modules/slots/is_or_was_approved_by.yaml | 44 +++ .../modules/slots/is_or_was_archived_at.yaml | 44 +++ .../linkml/modules/slots/manifest.json | 1 - .../linkml/modules/slots/slot_fixes.yaml | 12 +- src/glam_extractor/api/entity_review.py | 106 +++++++- 17 files changed, 779 insertions(+), 113 deletions(-) create mode 100644 schemas/20251121/linkml/modules/classes/Approver.yaml create mode 100644 schemas/20251121/linkml/modules/classes/BOLDIdentifier.yaml create mode 100644 schemas/20251121/linkml/modules/classes/Bookplate.yaml rename schemas/20251121/linkml/modules/slots/{archived_at.yaml => archive/archived_at_archived_20260115.yaml} (100%) rename schemas/20251121/linkml/modules/slots/{bold_id.yaml => archive/bold_id_archived_20260114.yaml} (100%) create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_approved_by.yaml create mode 100644 schemas/20251121/linkml/modules/slots/is_or_was_archived_at.yaml diff --git a/frontend/src/lib/linkml/linkml-schema-service.ts b/frontend/src/lib/linkml/linkml-schema-service.ts index 45d058c00a..7927c0a053 100644 --- a/frontend/src/lib/linkml/linkml-schema-service.ts +++ b/frontend/src/lib/linkml/linkml-schema-service.ts @@ -269,6 +269,28 @@ export interface SlotExportInfo { }; } +/** + * Information about what a slot depends on (forward dependencies). 
+ * This represents the "imports" - what schema elements this slot references. + */ +export interface SlotImportInfo { + slotName: string; + + /** The range type if it's a class or enum (dependency) */ + rangeType?: { + name: string; + isClass: boolean; + isEnum: boolean; + }; + + /** Any_of types if the slot has union types */ + anyOfTypes: Array<{ + name: string; + isClass: boolean; + isEnum: boolean; + }>; +} + const SCHEMA_BASE_PATH = '/schemas/20251121/linkml'; /** @@ -1561,6 +1583,68 @@ class LinkMLSchemaService { return exportInfo; } + /** + * Get import/dependency information for a slot. + * This finds what a slot depends ON (forward dependencies). + * + * Analyzes: + * - Range type (class or enum this slot references) + * - Any_of types (union types if applicable) + * + * This is the inverse of getSlotExportInfo. + */ + async getSlotImportInfo(slotName: string): Promise { + await this.initialize(); + + const importInfo: SlotImportInfo = { + slotName, + rangeType: undefined, + anyOfTypes: [], + }; + + // Get the slot definition + const slotDef = this.slotSchemas.get(slotName); + if (!slotDef) { + return importInfo; + } + + // Check range type + if (slotDef.range) { + const isClass = this.classSchemas.has(slotDef.range); + const isEnum = this.enumSchemas.has(slotDef.range); + + if (isClass || isEnum) { + importInfo.rangeType = { + name: slotDef.range, + isClass, + isEnum, + }; + } + } + + // Check any_of types (union types) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const anyOf = (slotDef as any).any_of; + if (anyOf && Array.isArray(anyOf)) { + for (const item of anyOf) { + if (item.range) { + const isClass = this.classSchemas.has(item.range); + const isEnum = this.enumSchemas.has(item.range); + + if (isClass || isEnum) { + importInfo.anyOfTypes.push({ + name: item.range, + isClass, + isEnum, + }); + } + } + } + } + + return importInfo; + } + /** * Get import/dependency information for a class. 
* This finds what a class depends ON (forward dependencies). diff --git a/frontend/src/pages/LinkMLViewerPage.tsx b/frontend/src/pages/LinkMLViewerPage.tsx index 225e6b6545..97addc6294 100644 --- a/frontend/src/pages/LinkMLViewerPage.tsx +++ b/frontend/src/pages/LinkMLViewerPage.tsx @@ -27,7 +27,7 @@ import { extractSlots, extractEnums, } from '../lib/linkml/schema-loader'; -import { linkmlSchemaService, type ClassExportInfo, type ClassImportInfo, type ClassDependencyCounts, type SlotDefinition, type SlotExportInfo } from '../lib/linkml/linkml-schema-service'; +import { linkmlSchemaService, type ClassExportInfo, type ClassImportInfo, type ClassDependencyCounts, type SlotDefinition, type SlotExportInfo, type SlotImportInfo } from '../lib/linkml/linkml-schema-service'; import { useLanguage } from '../contexts/LanguageContext'; import { useSchemaLoadingProgress } from '../hooks/useSchemaLoadingProgress'; import { CustodianTypeBadge } from '../components/uml/CustodianTypeIndicator'; @@ -1130,6 +1130,11 @@ const LinkMLViewerPage: React.FC = () => { const [slotExports, setSlotExports] = useState>({}); const [loadingSlotExports, setLoadingSlotExports] = useState>(new Set()); + // State for expandable Imports section in slot details (what this slot depends on) + const [expandedSlotImports, setExpandedSlotImports] = useState>(new Set()); + const [slotImports, setSlotImports] = useState>({}); + const [loadingSlotImports, setLoadingSlotImports] = useState>(new Set()); + // State for expandable UML diagram section in class details const [expandedUML, setExpandedUML] = useState>(new Set()); @@ -1409,6 +1414,37 @@ const LinkMLViewerPage: React.FC = () => { } }, [slotExports, loadingSlotExports, isSchemaServiceComplete]); + // Toggle imports section for a slot and load import data on demand + const toggleSlotImports = useCallback(async (slotName: string) => { + // Toggle expansion state + setExpandedSlotImports(prev => { + const next = new Set(prev); + if (next.has(slotName)) { + 
next.delete(slotName); + } else { + next.add(slotName); + } + return next; + }); + + // Load import data if not already loaded and schema service is ready + if (!slotImports[slotName] && !loadingSlotImports.has(slotName) && isSchemaServiceComplete) { + setLoadingSlotImports(prev => new Set(prev).add(slotName)); + try { + const importInfo = await linkmlSchemaService.getSlotImportInfo(slotName); + setSlotImports(prev => ({ ...prev, [slotName]: importInfo })); + } catch (error) { + console.error(`Error loading import info for slot ${slotName}:`, error); + } finally { + setLoadingSlotImports(prev => { + const next = new Set(prev); + next.delete(slotName); + return next; + }); + } + } + }, [slotImports, loadingSlotImports, isSchemaServiceComplete]); + // Toggle UML diagram section for a class // Loads both exports AND imports data since UML diagram can show both directions const toggleUML = useCallback(async (className: string) => { @@ -3113,107 +3149,143 @@ const LinkMLViewerPage: React.FC = () => { )} - {/* Exports section - Only show for standalone slots (not within class context) */} - {!className && ( -
+ {/* Imports Section - Shows forward dependencies (what this slot depends on) */} + {!className && isSchemaServiceComplete && ( +
- - {expandedSlotExports.has(slot.name) && ( -
- {loadingSlotExports.has(slot.name) ? ( -
{t('loading')}...
- ) : slotExports[slot.name] ? ( - <> - {/* Range Type - Navigation to class/enum */} - {slotExports[slot.name].rangeType && ( -
- {t('range')} - +
+
+ )} + {/* Any_of Types (union types) */} + {slotImports[slot.name].anyOfTypes.length > 0 && ( +
+ {t('anyOfTypesLabel')} ({slotImports[slot.name].anyOfTypes.length}) +
+ {slotImports[slot.name].anyOfTypes.map(type => ( + -
- )} - - {/* Classes that use this slot */} - {slotExports[slot.name].classesUsingSlot.length > 0 && ( -
- {t('classesUsingSlot')} -
- {slotExports[slot.name].classesUsingSlot.map(cls => ( - - ))} -
-
- )} - - {/* Classes with slot_usage overrides */} - {slotExports[slot.name].classesWithSlotUsage.length > 0 && ( -
- {t('classesWithSlotUsage')} -
- {slotExports[slot.name].classesWithSlotUsage.map(({ className: cls, overrides }) => ( - - ))} -
-
- )} - - {/* Empty state */} - {slotExports[slot.name].classesUsingSlot.length === 0 && - slotExports[slot.name].classesWithSlotUsage.length === 0 && ( -
- {t('noClassesUsingSlot')} -
- )} - - ) : null} + ))} +
+
+ )} + {/* No imports message */} + {!slotImports[slot.name].rangeType && slotImports[slot.name].anyOfTypes.length === 0 && ( +
+ {t('noSlotImports')} +
+ )} +
+ )} + + )} + + {/* Exports Section - Shows reverse dependencies (what classes use this slot) */} + {!className && isSchemaServiceComplete && ( +
+ + {expandedSlotExports.has(slot.name) && slotExports[slot.name] && ( +
+ {/* Classes that use this slot */} + {slotExports[slot.name].classesUsingSlot.length > 0 && ( +
+ {t('classesUsingSlot')} ({slotExports[slot.name].classesUsingSlot.length}) +
+ {slotExports[slot.name].classesUsingSlot.map(cls => ( + + ))} +
+
+ )} + {/* Classes with slot_usage overrides */} + {slotExports[slot.name].classesWithSlotUsage.length > 0 && ( +
+ {t('classesWithSlotUsage')} ({slotExports[slot.name].classesWithSlotUsage.length}) +
+ {slotExports[slot.name].classesWithSlotUsage.map(({ className: cls, overrides }) => ( + + ))} +
+
+ )} + {/* No exports message */} + {slotExports[slot.name].classesUsingSlot.length === 0 && + slotExports[slot.name].classesWithSlotUsage.length === 0 && ( +
+ {t('noClassesUsingSlot')} +
+ )}
)}
diff --git a/schemas/20251121/linkml/manifest.json b/schemas/20251121/linkml/manifest.json index b51dadbc5c..2aea1d0187 100644 --- a/schemas/20251121/linkml/manifest.json +++ b/schemas/20251121/linkml/manifest.json @@ -1,5 +1,5 @@ { - "generated": "2026-01-14T12:09:32.173Z", + "generated": "2026-01-14T12:28:33.699Z", "schemaRoot": "/schemas/20251121/linkml", "totalFiles": 2884, "categoryCounts": { diff --git a/schemas/20251121/linkml/modules/classes/Approver.yaml b/schemas/20251121/linkml/modules/classes/Approver.yaml new file mode 100644 index 0000000000..0baca7835b --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/Approver.yaml @@ -0,0 +1,72 @@ +id: https://nde.nl/ontology/hc/class/Approver +name: approver_class +title: Approver Class +imports: +- linkml:types +- ../slots/has_or_had_label +- ../slots/has_or_had_identifier +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + prov: http://www.w3.org/ns/prov# + foaf: http://xmlns.com/foaf/0.1/ +default_prefix: hc +classes: + Approver: + class_uri: prov:Agent + description: >- + An agent (person or organization) that approves or authorizes something. + + **DEFINITION**: + + Approver represents the agent responsible for approving decisions, + policies, budgets, or other organizational actions. This replaces + simple string fields like `approved_by` with a structured class + that can link to person or organization entities. + + **ONTOLOGY ALIGNMENT**: + + - PROV-O: `prov:Agent` - entity that bears responsibility + - FOAF: `foaf:Agent` - agent (person or organization) + - Schema.org: `schema:Person` or `schema:Organization` + + **USE CASES**: + + 1. **Budget Approval**: Who approved the budget allocation + 2. **Policy Approval**: Who authorized the policy + 3. 
**Decision Records**: Documenting approval chains + + exact_mappings: + - prov:Agent + close_mappings: + - foaf:Agent + - schema:Person + - schema:Organization + slots: + - has_or_had_label + - has_or_had_identifier + slot_usage: + has_or_had_label: + range: string + examples: + - value: "Board of Directors" + description: Organizational approver + - value: "Museum Director" + description: Role-based approver + has_or_had_identifier: + range: uriorcurie + examples: + - value: "https://nde.nl/ontology/hc/person/jan-de-vries" + description: Link to person entity + comments: + - Generic approver class for approval provenance + - Can represent individuals or organizational bodies + - Aligns with PROV-O Agent for provenance tracking + see_also: + - https://www.w3.org/TR/prov-o/#Agent + examples: + - value: + has_or_had_label: "Museum Director" + has_or_had_identifier: "https://nde.nl/ontology/hc/person/example-director" + description: Individual approver diff --git a/schemas/20251121/linkml/modules/classes/AuxiliaryDigitalPlatform.yaml b/schemas/20251121/linkml/modules/classes/AuxiliaryDigitalPlatform.yaml index 5a2e139d34..ae7109b526 100644 --- a/schemas/20251121/linkml/modules/classes/AuxiliaryDigitalPlatform.yaml +++ b/schemas/20251121/linkml/modules/classes/AuxiliaryDigitalPlatform.yaml @@ -20,7 +20,7 @@ imports: - ../slots/technology_stack - ../slots/funding_source - ../slots/has_or_had_powered_by_cm -- ../slots/archived_at +- ../slots/is_or_was_archived_at # MIGRATED: was ../slots/archived_at (2026-01-15) - ../slots/serves_finding_aid - ../slots/has_or_had_data_service_endpoint - ../slots/has_or_had_documentation # MIGRATED: was ../slots/api_documentation (2026-01-15) @@ -113,7 +113,7 @@ classes: slots: - has_or_had_documentation # MIGRATED: was api_documentation (2026-01-15) - has_or_had_archival_status - - archived_at + - is_or_was_archived_at # MIGRATED: was archived_at (2026-01-15) - has_or_had_identifier - has_auxiliary_platform_type - cms_detected @@ -266,7 
+266,8 @@ classes: description: Still accessible but not maintained - value: MIGRATED description: Content migrated to successor platform - was_archived_at: + is_or_was_archived_at: # MIGRATED: was was_archived_at (2026-01-15) + description: URL where this platform is or was archived (e.g., Wayback Machine) range: uri examples: - value: https://web.archive.org/web/20211231/https://example.nl/exhibition/ diff --git a/schemas/20251121/linkml/modules/classes/BOLDIdentifier.yaml b/schemas/20251121/linkml/modules/classes/BOLDIdentifier.yaml new file mode 100644 index 0000000000..9a1b816577 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/BOLDIdentifier.yaml @@ -0,0 +1,108 @@ +id: https://nde.nl/ontology/hc/class/BOLDIdentifier +name: bold_identifier_class +title: BOLD Identifier Class +description: >- + Barcode of Life Data System (BOLD) identifier for biological specimens. + + Links heritage biological objects to their DNA barcode records in BOLD. + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + dcterms: http://purl.org/dc/terms/ + schema: http://schema.org/ + +default_prefix: hc + +imports: + - linkml:types + - ../slots/id + - ../slots/identifier_value + - ../slots/identifier_url + - ../slots/description + - ../slots/specificity_annotation + - ../slots/template_specificity + - ./SpecificityAnnotation + - ./TemplateSpecificityScores + +classes: + BOLDIdentifier: + class_uri: schema:PropertyValue + description: >- + A Barcode of Life Data System (BOLD) identifier linking a biological + specimen to its DNA barcode record. + + **WHAT IS BOLD?** + + BOLD (Barcode of Life Data System) is an online workbench and database + for DNA barcoding. It stores specimen data and DNA barcode sequences, + enabling species identification through DNA. + + **USE CASES**: + + 1. **Specimen Identification**: Link natural history specimens to DNA data + 2. **Species Verification**: Cross-reference morphological IDs with DNA barcodes + 3. 
**Research Provenance**: Document genetic sampling of collection objects + + **IDENTIFIER FORMAT**: + + BOLD identifiers follow the pattern: BOLD:XXXNNN + - Process IDs: BOLD:AAA0001 + - Sample IDs: Institution-specific prefixes + + **EXTERNAL LINKS**: + + - BOLD Systems: https://boldsystems.org/ + - Record URL pattern: https://boldsystems.org/index.php/Public_RecordView?processid={id} + + exact_mappings: + - schema:PropertyValue + + close_mappings: + - dcterms:identifier + + slots: + - id + - identifier_value + - identifier_url + - description + - specificity_annotation + - template_specificity + + slot_usage: + id: + identifier: true + required: true + range: uriorcurie + pattern: ^https://nde\.nl/ontology/hc/bold-id/[A-Z0-9-]+$ + examples: + - value: https://nde.nl/ontology/hc/bold-id/NLNAT001-21 + description: Dutch natural history specimen BOLD ID + identifier_value: + description: The BOLD process ID or sample ID value. + range: string + required: true + pattern: ^[A-Z]{2,5}[0-9]{3,7}(-[0-9]{2})?$ + examples: + - value: NLNAT001-21 + description: Netherlands natural history specimen 2021 + - value: GBMIN12345-19 + description: UK specimen from 2019 + identifier_url: + description: URL to the BOLD record page. 
+ range: uri + examples: + - value: https://boldsystems.org/index.php/Public_RecordView?processid=NLNAT001-21 + + comments: + - Used for DNA barcode identifiers in natural history collections + - Links physical specimens to molecular data + - Part of global biodiversity identification infrastructure + + examples: + - value: + id: https://nde.nl/ontology/hc/bold-id/NLNAT001-21 + identifier_value: NLNAT001-21 + identifier_url: https://boldsystems.org/index.php/Public_RecordView?processid=NLNAT001-21 + description: DNA barcode for Naturalis specimen + description: BOLD identifier for a Dutch natural history specimen diff --git a/schemas/20251121/linkml/modules/classes/BiologicalObject.yaml b/schemas/20251121/linkml/modules/classes/BiologicalObject.yaml index 3222d2f72d..48e92c6efe 100644 --- a/schemas/20251121/linkml/modules/classes/BiologicalObject.yaml +++ b/schemas/20251121/linkml/modules/classes/BiologicalObject.yaml @@ -21,7 +21,9 @@ imports: # associated_taxa REMOVED - migrated to is_or_was_associated_with (Rule 53) - ../slots/is_or_was_associated_with - ./Taxon -- ../slots/bold_id +# bold_id REMOVED - migrated to has_or_had_identifier with BOLDIdentifier class (Rule 53) +- ../slots/has_or_had_identifier +- ./BOLDIdentifier - ../slots/cites_appendix - ../slots/collection_date - ../slots/collection_locality_text @@ -104,7 +106,8 @@ classes: slots: # associated_taxa REMOVED - migrated to is_or_was_associated_with (Rule 53) - is_or_was_associated_with - - bold_id + # bold_id REMOVED - migrated to has_or_had_identifier with BOLDIdentifier (Rule 53) + - has_or_had_identifier - cites_appendix - collection_date - collection_locality_text @@ -152,6 +155,19 @@ classes: examples: - value: https://nde.nl/ontology/hc/taxon/raphus-cucullatus description: Associated with Dodo taxon + has_or_had_identifier: + description: >- + MIGRATED from bold_id (Rule 53). + BOLD (Barcode of Life Data System) identifier for DNA barcode records. + Range narrowed to BOLDIdentifier class. 
+ range: BOLDIdentifier + multivalued: true + inlined_as_list: true + examples: + - value: + id: https://nde.nl/ontology/hc/bold-id/NLNAT001-21 + identifier_value: NLNAT001-21 + description: BOLD identifier for specimen taxon_name: required: true range: string diff --git a/schemas/20251121/linkml/modules/classes/Bookplate.yaml b/schemas/20251121/linkml/modules/classes/Bookplate.yaml new file mode 100644 index 0000000000..e423e92ab5 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/Bookplate.yaml @@ -0,0 +1,119 @@ +id: https://nde.nl/ontology/hc/class/Bookplate +name: bookplate_class +title: Bookplate Class +description: >- + Bookplate (ex libris) marking ownership of a book or manuscript. + + Records provenance information through ownership marks in heritage library items. + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + crm: http://www.cidoc-crm.org/cidoc-crm/ + bf: http://id.loc.gov/ontologies/bibframe/ + +default_prefix: hc + +imports: + - linkml:types + - ../slots/id + - ../slots/has_or_had_label + - ../slots/description + - ../slots/has_or_had_owner + - ../slots/specificity_annotation + - ../slots/template_specificity + - ./SpecificityAnnotation + - ./TemplateSpecificityScores + +classes: + Bookplate: + class_uri: bf:Bookplate + description: >- + A bookplate (ex libris) or ownership mark found in a book, manuscript, + or other library material. + + **WHAT IS A BOOKPLATE?** + + A bookplate is a printed or decorative label pasted inside a book, + typically on the front endpaper, indicating ownership. Also known + as "ex libris" (Latin: "from the books of"). + + **PROVENANCE SIGNIFICANCE**: + + Bookplates are crucial for provenance research: + - Document historical ownership chains + - Connect items to notable collectors + - Evidence of institutional vs. personal ownership + - May indicate stolen/looted heritage + + **USE CASES**: + + 1. 
**Provenance Research**: Track ownership history of rare books + 2. **Collection Documentation**: Record all bookplates in a volume + 3. **Restitution Claims**: Document pre-war ownership evidence + + **TYPES OF BOOKPLATES**: + + - Printed pictorial bookplates + - Armorial bookplates (with coat of arms) + - Typographic bookplates (text only) + - Stamps and ink marks + - Manuscript ownership inscriptions + + exact_mappings: + - bf:Bookplate + + close_mappings: + - crm:E37_Mark + - schema:Thing + + slots: + - id + - has_or_had_label + - description + - has_or_had_owner + - specificity_annotation + - template_specificity + + slot_usage: + id: + identifier: true + required: true + range: uriorcurie + pattern: ^https://nde\.nl/ontology/hc/bookplate/[a-z0-9-]+$ + examples: + - value: https://nde.nl/ontology/hc/bookplate/kb-exlibris-001 + description: KB bookplate record + has_or_had_label: + description: Text or name on the bookplate. + range: string + required: true + examples: + - value: "Ex Libris Johann Wolfgang von Goethe" + description: Goethe's bookplate + - value: "Bibliotheca Regia" + description: Royal library bookplate + description: + range: string + examples: + - value: Armorial bookplate with three lions, gilt border, 18th century + has_or_had_owner: + description: Person or institution who owned the book according to this bookplate. + range: string + examples: + - value: Johann Wolfgang von Goethe + - value: Royal Library of Prussia + + comments: + - Used for provenance research in rare book collections + - Links library items to historical owners + - Multiple bookplates may appear in single volume + + examples: + - value: + id: https://nde.nl/ontology/hc/bookplate/kb-exlibris-goethe-001 + has_or_had_label: "Ex Libris J.W. 
von Goethe" + description: Armorial bookplate with oak wreath, early 19th century + has_or_had_owner: Johann Wolfgang von Goethe + description: Goethe bookplate in rare book collection diff --git a/schemas/20251121/linkml/modules/classes/InformationCarrier.yaml b/schemas/20251121/linkml/modules/classes/InformationCarrier.yaml index e6cc7f7a77..1c506e9158 100644 --- a/schemas/20251121/linkml/modules/classes/InformationCarrier.yaml +++ b/schemas/20251121/linkml/modules/classes/InformationCarrier.yaml @@ -26,7 +26,9 @@ imports: - ../slots/has_or_had_description - ../slots/has_or_had_provenance - ../slots/has_or_had_type -- ../slots/bookplate +# bookplate REMOVED - migrated to includes_or_included with Bookplate class (Rule 53) +- ../slots/includes_or_included +- ./Bookplate - ./BindingType - ../slots/call_number - ../slots/carrier_type @@ -112,7 +114,8 @@ classes: - has_or_had_description - has_or_had_provenance - has_or_had_type - - bookplate + # bookplate REMOVED - migrated to includes_or_included (Rule 53) + - includes_or_included - call_number - carrier_type - carries_information diff --git a/schemas/20251121/linkml/modules/classes/WebObservation.yaml b/schemas/20251121/linkml/modules/classes/WebObservation.yaml index b6de48f9ed..2ba880afdf 100644 --- a/schemas/20251121/linkml/modules/classes/WebObservation.yaml +++ b/schemas/20251121/linkml/modules/classes/WebObservation.yaml @@ -13,7 +13,7 @@ prefixes: imports: - linkml:types - ./WebClaim -- ../slots/archived_at +- ../slots/is_or_was_archived_at # MIGRATED: was ../slots/archived_at (2026-01-15) - ../slots/extraction_confidence - ../slots/extraction_note - ../slots/source_url @@ -58,7 +58,7 @@ classes: \ DETECTION**:\n\nWebObservation supports tracking changes over time:\n- Link to `previous_observation` for same URL\n\ - `content_changed` flag for quick change detection\n- `content_hash` for integrity verification\n- Compare `last_modified`\ \ and `etag` across observations\n\n**ARCHIVAL 
INTEGRATION**:\n\nFor long-term preservation, link to archived copies:\n\ - - `archived_at` can point to Wayback Machine, Archive.today, etc.\n- Ensures cited web content remains accessible\n\n\ + - `is_or_was_archived_at` can point to Wayback Machine, Archive.today, etc.\n- Ensures cited web content remains accessible\n\n\ **EXAMPLES**:\n\n1. **EU Funding Portal Observation**\n - source_url: https://ec.europa.eu/.../topic-details/horizon-cl2-2025-heritage-01\n\ \ - retrieved_on: 2025-11-29T10:30:00Z\n - retrieved_by: \"glam-harvester/1.0\"\n - extraction_confidence: 0.95\n\ \ \n2. **Heritage Organisation Website**\n - source_url: https://www.heritagefund.org.uk/funding/medium-grants\n\ @@ -75,7 +75,7 @@ classes: - pav:sourceAccessedAt - dcterms:source slots: - - archived_at + - is_or_was_archived_at # MIGRATED: was archived_at (2026-01-15) - claim - content_changed - content_hash @@ -99,7 +99,7 @@ classes: - WebObservation is a prov:Activity documenting web content retrieval - Integrates PROV-O for provenance and PAV for retrieval-specific properties - Supports change detection via content_hash, previous_observation, content_changed - - Links to archived copies via archived_at for long-term citation + - Links to archived copies via is_or_was_archived_at for long-term citation - observed_entities links observation to extracted data (prov:generated) see_also: - https://www.w3.org/TR/prov-o/ @@ -121,7 +121,7 @@ classes: stated. Eligibility criteria parsed from HTML sections. 
observed_entity: - https://nde.nl/ontology/hc/call/ec/cl2-2025-heritage-01 - archived_at: https://web.archive.org/web/20251129103000/https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01 + is_or_was_archived_at: https://web.archive.org/web/20251129103000/https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01 description: Web observation of Horizon Europe CL2 2025 heritage call - value: observation_id: https://nde.nl/ontology/hc/observation/web/2025-11-28/nlhf-medium-grants diff --git a/schemas/20251121/linkml/modules/slots/archived_at.yaml b/schemas/20251121/linkml/modules/slots/archive/archived_at_archived_20260115.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/archived_at.yaml rename to schemas/20251121/linkml/modules/slots/archive/archived_at_archived_20260115.yaml diff --git a/schemas/20251121/linkml/modules/slots/bold_id.yaml b/schemas/20251121/linkml/modules/slots/archive/bold_id_archived_20260114.yaml similarity index 100% rename from schemas/20251121/linkml/modules/slots/bold_id.yaml rename to schemas/20251121/linkml/modules/slots/archive/bold_id_archived_20260114.yaml diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_approved_by.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_approved_by.yaml new file mode 100644 index 0000000000..866eda51d7 --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_approved_by.yaml @@ -0,0 +1,44 @@ +id: https://nde.nl/ontology/hc/slot/is_or_was_approved_by +name: is_or_was_approved_by_slot +title: Is Or Was Approved By Slot +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + prov: http://www.w3.org/ns/prov# + schema: http://schema.org/ +imports: +- linkml:types +default_prefix: hc +slots: + is_or_was_approved_by: + description: >- + The agent (person or organization) that approved or authorized 
something. + + **SEMANTIC PATTERN**: + + This slot follows the RiC-O temporal predicate pattern (is_or_was_*) + to indicate that approval is a historical event - something was approved + by someone at some point in time. + + **REPLACES**: + + - `approved_by` - Simple string field for approver name + + **RANGE OPTIONS**: + + - string: Simple approver name (backwards compatible) + - Approver: Structured approver with identity link + + Classes should use slot_usage to specify appropriate range. + + slot_uri: prov:wasAttributedTo + range: string + exact_mappings: + - prov:wasAttributedTo + close_mappings: + - schema:author + examples: + - value: "Museum Director" + description: Role-based approver + - value: "Board of Directors" + description: Organizational body approver diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_archived_at.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_archived_at.yaml new file mode 100644 index 0000000000..ceb638d4fb --- /dev/null +++ b/schemas/20251121/linkml/modules/slots/is_or_was_archived_at.yaml @@ -0,0 +1,44 @@ +id: https://nde.nl/ontology/hc/slot/is_or_was_archived_at +name: is_or_was_archived_at_slot +title: Is Or Was Archived At Slot +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + prov: http://www.w3.org/ns/prov# +imports: +- linkml:types +default_prefix: hc +slots: + is_or_was_archived_at: + description: >- + Location or URL where content was archived or preserved. + + **SEMANTIC PATTERN**: + + This slot follows the RiC-O temporal predicate pattern (is_or_was_*) + to indicate that archival location may change over time or refer to + historical archival events. + + **REPLACES**: + + - `archived_at` - URL to archived version (e.g., Wayback Machine) + - `was_archived_at` - Similar pattern + + **USE CASES**: + + 1. **Web Archival**: Link to Internet Archive/Wayback Machine snapshots + 2. 
**Platform Preservation**: Where deprecated platforms are preserved + 3. **Content Snapshots**: Historical versions of web content + + slot_uri: schema:archivedAt + range: uri + exact_mappings: + - schema:archivedAt + close_mappings: + - prov:atLocation + examples: + - value: "https://web.archive.org/web/20211231/https://example.nl/exhibition/" + description: Wayback Machine archived URL + - value: "https://archive.org/details/example-collection" + description: Internet Archive collection diff --git a/schemas/20251121/linkml/modules/slots/manifest.json b/schemas/20251121/linkml/modules/slots/manifest.json index 527fcb7fd3..49becf2858 100644 --- a/schemas/20251121/linkml/modules/slots/manifest.json +++ b/schemas/20251121/linkml/modules/slots/manifest.json @@ -44,7 +44,6 @@ "bio_type_classification.yaml", "birth_date.yaml", "birth_place.yaml", - "bold_id.yaml", "booking_required.yaml", "bookplate.yaml", "borrower.yaml", diff --git a/schemas/20251121/linkml/modules/slots/slot_fixes.yaml b/schemas/20251121/linkml/modules/slots/slot_fixes.yaml index 0c4129cdb5..7206105005 100644 --- a/schemas/20251121/linkml/modules/slots/slot_fixes.yaml +++ b/schemas/20251121/linkml/modules/slots/slot_fixes.yaml @@ -851,10 +851,14 @@ fixes: - original_slot_id: https://nde.nl/ontology/hc/slot/bold_id processed: - status: false - timestamp: null - session: null - notes: "Requires BOLDIdentifier class creation" + status: true + timestamp: '2026-01-14T23:00:00Z' + session: "session-2026-01-14-identifier-migration" + notes: >- + FULLY MIGRATED: BiologicalObject - bold_id REMOVED. + Created BOLDIdentifier class (Barcode of Life Data System identifier). + Replaced with has_or_had_identifier slot with range BOLDIdentifier. + Slot archived to modules/slots/archive/bold_id_archived_20260114.yaml (Rule 53). 
revision: - label: has_or_had_identifier type: slot diff --git a/src/glam_extractor/api/entity_review.py b/src/glam_extractor/api/entity_review.py index be07527427..2090f438c8 100644 --- a/src/glam_extractor/api/entity_review.py +++ b/src/glam_extractor/api/entity_review.py @@ -32,12 +32,17 @@ from pydantic import BaseModel # Linkup API configuration LINKUP_API_KEY = os.getenv("LINKUP_API_KEY", "") LINKUP_API_URL = "https://api.linkup.so/v1/search" +LINKUP_FETCH_URL = "https://api.linkup.so/v1/fetch" # Exa API configuration for LinkedIn profile extraction EXA_API_KEY = os.getenv("EXA_API_KEY", "") EXA_API_URL = "https://api.exa.ai/contents" ENTITY_DIR = Path(os.getenv("ENTITY_DIR", "/Users/kempersc/apps/glam/data/custodian/person/entity")) +# LinkedIn profile fetch provider configuration +# Options: "exa", "linkup", "exa,linkup" (try exa first, fallback to linkup), "linkup,exa" +LINKEDIN_FETCH_PROVIDERS = os.getenv("LINKEDIN_FETCH_PROVIDERS", "exa,linkup") + # Email semantics for on-demand analysis try: from glam_extractor.entity_resolution.email_semantics import parse_email_semantics @@ -1075,7 +1080,7 @@ async def linkup_search(request: LinkupSearchRequest): # ============================================================================ -# LinkedIn Profile Extraction via Exa API +# LinkedIn Profile Extraction (Exa and Linkup providers) # ============================================================================ import re as regex_module # Avoid shadowing @@ -1087,6 +1092,7 @@ async def fetch_linkedin_profile_exa(linkedin_url: str) -> Optional[dict]: Returns parsed profile data or None if extraction fails. 
""" if not EXA_API_KEY: + print("Exa API: No API key configured") return None try: @@ -1111,13 +1117,15 @@ async def fetch_linkedin_profile_exa(linkedin_url: str) -> Optional[dict]: data = response.json() if not data.get('results') or len(data['results']) == 0: + print("Exa API: No results returned") return None result = data['results'][0] return { 'raw_result': result, 'request_id': data.get('requestId', ''), - 'cost': data.get('costDollars', {}).get('total', 0) + 'cost': data.get('costDollars', {}).get('total', 0), + 'provider': 'exa' } except Exception as e: @@ -1125,6 +1133,98 @@ async def fetch_linkedin_profile_exa(linkedin_url: str) -> Optional[dict]: return None +async def fetch_linkedin_profile_linkup(linkedin_url: str) -> Optional[dict]: + """ + Fetch LinkedIn profile data using Linkup Fetch API. + + Returns parsed profile data or None if extraction fails. + """ + if not LINKUP_API_KEY: + print("Linkup API: No API key configured") + return None + + try: + async with httpx.AsyncClient() as client: + response = await client.post( + LINKUP_FETCH_URL, + headers={ + "Authorization": f"Bearer {LINKUP_API_KEY}", + "Content-Type": "application/json", + }, + json={ + "url": linkedin_url, + "outputType": "markdown" + }, + timeout=60.0 + ) + + if response.status_code != 200: + print(f"Linkup API error: HTTP {response.status_code}") + return None + + data = response.json() + content = data.get('content', '') + + if not content: + print("Linkup API: No content returned") + return None + + # Transform to Exa-like format for consistent parsing + return { + 'raw_result': { + 'text': content, + 'url': linkedin_url, + 'title': data.get('title', ''), + 'author': '', # Will be extracted from content + 'image': '' + }, + 'request_id': '', + 'cost': 0, # Linkup doesn't report cost per request + 'provider': 'linkup' + } + + except Exception as e: + print(f"Linkup API exception: {e}") + return None + + +async def fetch_linkedin_profile(linkedin_url: str) -> Optional[dict]: + """ 
+ Fetch LinkedIn profile using configured providers with fallback. + + Uses LINKEDIN_FETCH_PROVIDERS setting to determine order. + Examples: + "exa" - Use only Exa + "linkup" - Use only Linkup + "exa,linkup" - Try Exa first, fallback to Linkup + "linkup,exa" - Try Linkup first, fallback to Exa + + Returns the raw response with 'provider' field indicating which was used. + """ + providers = [p.strip().lower() for p in LINKEDIN_FETCH_PROVIDERS.split(',')] + + for provider in providers: + print(f"Trying LinkedIn profile fetch with: {provider}") + + if provider == 'exa': + result = await fetch_linkedin_profile_exa(linkedin_url) + if result: + print(f"Successfully fetched profile with Exa") + return result + + elif provider == 'linkup': + result = await fetch_linkedin_profile_linkup(linkedin_url) + if result: + print(f"Successfully fetched profile with Linkup") + return result + + else: + print(f"Unknown provider: {provider}") + + print(f"All providers failed for: {linkedin_url}") + return None + + def parse_linkedin_profile_from_exa(raw_data: dict) -> dict: """Parse Exa response into structured profile data.""" result = raw_data.get('raw_result', {}) @@ -1283,7 +1383,7 @@ async def save_entity_profile( "source_file": "manual_add_candidate", "staff_id": f"manual_add_{linkedin_slug}", "extraction_date": datetime.now(timezone.utc).isoformat(), - "extraction_method": "exa_contents", + "extraction_method": f"{raw_response.get('provider', 'unknown')}_contents", "extraction_agent": "entity_review_api", "linkedin_url": source_info.get('linkedin_url', ''), "cost_usd": raw_response.get('cost', 0),