From 2f44857028ca560ce010f006a86953124da0d3bb Mon Sep 17 00:00:00 2001 From: kempersc Date: Thu, 29 Jan 2026 18:17:47 +0100 Subject: [PATCH] Refactor LinkML schemas and slots for consistency and clarity - Updated imports in FindingAid.yaml to remove unnecessary entries and added new slots for arrangement level and provenance path. - Replaced 'full_name' with 'has_or_had_label' in LegalName.yaml and ProfileData.yaml for uniformity. - Enhanced slot definitions in various YAML files, including ceases_or_ceased_through, has_or_had_arrangement_level, has_or_had_assessment, and others, to include metadata and improve structure. - Removed the script fix_linkml_metadata.py as it is no longer needed. - Added new script fix_specific_dead_links.py to handle specific mapping updates for extraction metadata and full name fields across multiple YAML files. --- schemas/20251121/linkml/manifest.json | 2 +- .../DigitalPlatformV2OrganizationStatus.yaml | 2 +- .../modules/classes/EducationCenter.yaml | 23 ++-- .../modules/classes/ExtractionMetadata.yaml | 81 ++++-------- .../linkml/modules/classes/FindingAid.yaml | 72 ++--------- .../linkml/modules/classes/LegalName.yaml | 2 +- .../linkml/modules/classes/ProfileData.yaml | 2 +- .../slots/ceases_or_ceased_through.yaml | 34 +++-- .../slots/has_or_had_arrangement_level.yaml | 36 ++++-- .../modules/slots/has_or_had_assessment.yaml | 32 +++-- .../modules/slots/has_or_had_citation.yaml | 30 ++++- .../modules/slots/has_or_had_city_code.yaml | 30 ++++- .../slots/has_or_had_embargo_end_date.yaml | 38 ++++-- .../slots/has_or_had_embargo_reason.yaml | 30 ++++- .../slots/has_or_had_exhibition_type.yaml | 34 +++-- .../modules/slots/has_or_had_extent_text.yaml | 34 +++-- .../slots/has_or_had_file_location.yaml | 30 ++++- .../modules/slots/has_or_had_parent.yaml | 30 ++++- .../slots/has_or_had_sequence_index.yaml | 34 +++-- .../slots/is_or_was_documented_by.yaml | 38 ++++-- .../modules/slots/is_or_was_observed_by.yaml | 34 +++-- scripts/fix_linkml_metadata.py | 111 ---------------- scripts/fix_specific_dead_links.py | 119 ++++++++++++++++++ scripts/trace_dead_links.py | 33 +++-- 24 files changed, 552 insertions(+), 359 deletions(-) delete mode 100644 scripts/fix_linkml_metadata.py create mode 100644 scripts/fix_specific_dead_links.py diff --git a/schemas/20251121/linkml/manifest.json b/schemas/20251121/linkml/manifest.json index 426dd9b3f8..985f8447b5 100644 --- a/schemas/20251121/linkml/manifest.json +++ b/schemas/20251121/linkml/manifest.json @@ -1,5 +1,5 @@ { - "generated": "2026-01-29T16:40:47.585Z", + "generated": "2026-01-29T17:17:48.016Z", "schemaRoot": "/schemas/20251121/linkml", "totalFiles": 3003, "categoryCounts": { diff --git a/schemas/20251121/linkml/modules/classes/DigitalPlatformV2OrganizationStatus.yaml b/schemas/20251121/linkml/modules/classes/DigitalPlatformV2OrganizationStatus.yaml index ab37dde770..b57a4af4d6 100644 --- a/schemas/20251121/linkml/modules/classes/DigitalPlatformV2OrganizationStatus.yaml +++ b/schemas/20251121/linkml/modules/classes/DigitalPlatformV2OrganizationStatus.yaml @@ -26,7 +26,7 @@ classes: legal_form: range: string description: Legal form of the organization (Municipal library, Foundation, etc.) - full_name: + has_or_had_label: range: string description: Full official name of the organization closure_date: diff --git a/schemas/20251121/linkml/modules/classes/EducationCenter.yaml b/schemas/20251121/linkml/modules/classes/EducationCenter.yaml index ec6e2ef750..ecfb1b2ffb 100644 --- a/schemas/20251121/linkml/modules/classes/EducationCenter.yaml +++ b/schemas/20251121/linkml/modules/classes/EducationCenter.yaml @@ -142,6 +142,7 @@ classes: ' range: EducationFacilityType inlined: true + equals_expression: '["hc:EducationProviderType"]' examples: - value: has_or_had_label: EDUCATION_CENTER @@ -265,8 +266,6 @@ classes: is_or_was_generated_by: range: ReconstructionActivity required: false - has_or_had_type: - equals_expression: '["hc:EducationProviderType"]' comments: - EducationCenter models educational facilities of heritage custodians - Schema.org EducationalOrganization for education facilities @@ -308,10 +307,16 @@ classes: av_equipped: true flexible_seating: true has_or_had_quantity: - value: 2 - unit: workshop_spaces + - numeric_value: 2 + has_or_had_unit: + unit_text: workshop_spaces + - numeric_value: 12 + has_or_had_unit: + unit_text: staff_members max_group_size: 30 - has_av_equipment: true + has_or_had_equipment: + - has_or_had_name: 4K Projector + has_or_had_type: Projector provides_or_provided: has_or_had_description: Hands-on learning lab has_or_had_accessibility_feature: @@ -321,10 +326,6 @@ classes: - has_or_had_quantity: 75000 has_or_had_time_interval: has_or_had_label: Annual - has_or_had_quantity: - - numeric_value: 12 - has_or_had_unit: - unit_text: staff_members is_or_was_required: true description: Major museum education center - value: @@ -353,7 +354,9 @@ classes: seating_capacity: 40 av_equipped: true max_group_size: 20 - has_av_equipment: true + has_or_had_equipment: + - has_or_had_name: Projector + has_or_had_type: Projector is_or_was_required: true description: Archive learning center annotations: diff --git a/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml b/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml index ba5ac1c14a..db60e02038 100644 --- a/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml +++ b/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml @@ -18,100 +18,78 @@ imports: - ./TemplateSpecificityType - ./TemplateSpecificityTypes - ../enums/ProfileExtractionMethodEnum -- ../slots/extraction_agent -- ../slots/extraction_method -- ../slots/cost_usd -- ../slots/source_file -- ../slots/staff_id -- ../slots/extraction_date -- ../slots/linkedin_url -- ../slots/request_id +- ../slots/is_or_was_retrieved_by +- ../slots/has_or_had_method +- ../slots/has_or_had_expense +- ../slots/has_or_had_source +- ../slots/has_or_had_identifier +- ../slots/retrieval_timestamp +- ../slots/has_or_had_url - ../slots/llm_response - ../slots/specificity_annotation - ../slots/has_or_had_score -- ../slots/cost_usd -- ../slots/extraction_agent -- ../slots/extraction_date -- ../slots/extraction_method -- ../slots/linkedin_url -- ../slots/llm_response -- ../slots/request_id -- ../slots/source_file -- ../slots/specificity_annotation -- ../slots/staff_id -- ../slots/has_or_had_score -- ../slots/cost_usd -- ../slots/extraction_agent -- ../slots/extraction_date -- ../slots/extraction_method -- ../slots/linkedin_url -- ../slots/llm_response -- ../slots/request_id -- ../slots/source_file -- ../slots/specificity_annotation -- ../slots/staff_id -- ../slots/has_or_had_score default_range: string classes: ExtractionMetadata: class_uri: prov:Activity - description: "Provenance metadata for data extraction activities.\n\nRecords how, when, and by what agent data was extracted from \nexternal sources (LinkedIn, web scraping, APIs).\n\n**PROV-O Alignment**:\n- ExtractionMetadata IS a prov:Activity (the extraction process)\n- The extracted data IS the prov:Entity (output of the activity)\n- extraction_agent IS the prov:Agent (software/AI that performed extraction)\n- source_file/linkedin_url IS prov:used (input to the activity)\n\n**Use Cases**:\n- LinkedIn profile extractions via Exa API\n- Web scraping provenance\n- Staff list parsing provenance\n- Connection network extraction\n\n**Example JSON Structure**:\n```json\n{\n \"extraction_metadata\": {\n \"source_file\": \"/path/to/source.json\",\n \"staff_id\": \"org_staff_0001_name\",\n \"extraction_date\": \"2025-12-12T22:00:00Z\",\n \"extraction_method\": \"exa_crawling_exa\",\n \"extraction_agent\": \"claude-opus-4.5\",\n \"linkedin_url\": \"https://www.linkedin.com/in/...\"\ - ,\n \"cost_usd\": 0.001\n }\n}\n```\n" + description: "Provenance metadata for data extraction activities.\n\nRecords how, when, and by what agent data was extracted from \nexternal sources (LinkedIn, web scraping, APIs).\n\n**PROV-O Alignment**:\n- ExtractionMetadata IS a prov:Activity (the extraction process)\n- The extracted data IS the prov:Entity (output of the activity)\n- is_or_was_retrieved_by IS the prov:Agent (software/AI that performed extraction)\n- has_or_had_source/has_or_had_url IS prov:used (input to the activity)\n\n**Use Cases**:\n- LinkedIn profile extractions via Exa API\n- Web scraping provenance\n- Staff list parsing provenance\n- Connection network extraction\n\n**Example JSON Structure**:\n```json\n{\n \"extraction_metadata\": {\n \"has_or_had_source\": \"/path/to/source.json\",\n \"has_or_had_identifier\": \"org_staff_0001_name\",\n \"retrieval_timestamp\": \"2025-12-12T22:00:00Z\",\n \"has_or_had_method\": \"exa_crawling_exa\",\n \"is_or_was_retrieved_by\": \"claude-opus-4.5\",\n \"has_or_had_url\": \"https://www.linkedin.com/in/...\"\ + ,\n \"has_or_had_expense\": 0.001\n }\n}\n```\n" exact_mappings: - prov:Activity close_mappings: - schema:Action - dct:ProvenanceStatement slots: - - cost_usd - - extraction_agent - - extraction_date - - extraction_method - - linkedin_url + - has_or_had_expense + - is_or_was_retrieved_by + - retrieval_timestamp + - has_or_had_method + - has_or_had_url - llm_response - - request_id - - source_file + - has_or_had_identifier + - has_or_had_source - specificity_annotation - - staff_id - has_or_had_score slot_usage: - source_file: + has_or_had_source: range: string examples: - value: /data/custodian/person/affiliated/parsed/rijksmuseum_staff_20251210T155416Z.json description: Path to parsed staff list JSON - staff_id: + has_or_had_identifier: range: string pattern: ^[a-z0-9-]+_staff_[a-z0-9-_]+$ examples: - value: rijksmuseum_staff_0042_jan_van_der_berg description: Staff ID with org prefix, index, and name slug - extraction_date: + - value: exa_12345678-abcd-efgh-ijkl-mnopqrstuv + description: Exa API request ID + retrieval_timestamp: range: datetime required: true examples: - value: '2025-12-12T22:00:00Z' description: UTC timestamp of extraction - extraction_method: + has_or_had_method: range: ProfileExtractionMethodEnum required: true examples: - value: exa_crawling_exa description: Extracted via Exa AI crawling API - extraction_agent: + is_or_was_retrieved_by: range: string examples: - value: claude-opus-4.5 description: Extracted by Claude Opus 4.5 - value: '' description: Empty string for fully automated extraction - linkedin_url: + has_or_had_url: range: uri pattern: ^https://www\.linkedin\.com/in/[a-z0-9-]+/?$ examples: - value: https://www.linkedin.com/in/jan-van-der-berg-12345 description: LinkedIn profile URL - cost_usd: + has_or_had_expense: range: float minimum_value: 0.0 examples: @@ -119,11 +97,6 @@ classes: description: Exa API call cost - value: 0.0 description: Free extraction (cached/local) - request_id: - range: string - examples: - - value: exa_12345678-abcd-efgh-ijkl-mnopqrstuv - description: Exa API request ID llm_response: range: LLMResponse required: false @@ -134,8 +107,8 @@ classes: comments: - Every person entity file MUST have extraction_metadata - See AGENTS.md Rule 20 for required fields - - extraction_agent should be 'claude-opus-4.5' for manual extraction - - cost_usd enables budget tracking for API-heavy extractions + - is_or_was_retrieved_by should be 'claude-opus-4.5' for manual extraction + - has_or_had_expense enables budget tracking for API-heavy extractions see_also: - https://www.w3.org/TR/prov-o/ - https://docs.exa.ai/ diff --git a/schemas/20251121/linkml/modules/classes/FindingAid.yaml b/schemas/20251121/linkml/modules/classes/FindingAid.yaml index b02d7de421..de33919593 100644 --- a/schemas/20251121/linkml/modules/classes/FindingAid.yaml +++ b/schemas/20251121/linkml/modules/classes/FindingAid.yaml @@ -47,55 +47,9 @@ imports: - ./ConfidenceMethod - ../slots/html_file - ../slots/has_or_had_identifier -- ./Identifier -- ../slots/source_url - ../slots/has_or_had_label -- ../slots/date -- ../slots/note -- ../slots/creator -- ../slots/has_or_had_publisher -- ./Publisher -- ../slots/is_or_was_published_at -- ./PublicationEvent -- ./Quantity -- ../slots/isbn -- ../slots/has_or_had_access_condition -- ../slots/is_or_was_access_restricted -- ../slots/has_or_had_overview -- ../slots/has_or_had_image -- ./Image -- ../slots/has_or_had_quantity -- ../slots/includes_or_included -- ./GeoSpatialPlace -- ../slots/is_or_was_categorized_as -- ./ColonialStatus -- ../slots/content_block -- ../slots/crawler_version -- ../slots/custodian -- ../slots/was_retrieved_at -- ./Timestamp -- ../slots/de -- ../slots/has_or_had_identifier -- ./Identifier -- ./EADIdentifier -- ../slots/ead_id -- ../slots/en -- ../slots/end -- ../slots/has_or_had_external_resource -- ../slots/has_or_had_featured_item -- ../slots/has_or_had_note -- ./Note -- ../slots/has_or_had_scope -- ./Scope -- ../slots/has_or_had_type -- ../slots/has_or_had_format -- ./Format -- ../slots/has_or_had_language -- ./Language -- ../slots/full_name -- ../slots/has_or_had_geographic_extent -- ../slots/has_or_had_identifier -- ./GHCIdentifier +- ../slots/has_or_had_level +- ../slots/has_or_had_provenance_path - ../slots/contains_or_contained - ../slots/contains_or_contained - ../slots/contains_or_contained_en @@ -535,36 +489,28 @@ classes: ' slots: - has_or_had_access_condition - - full_name + - has_or_had_label - geographic_coverage - has_or_had_identifier + - has_or_had_level - contains_or_contained - - contains_or_contained - - contains_or_contained_en - list_item - section_id - - contains_or_contained - specificity_annotation - - contains_or_contained - has_or_had_score - has_or_had_provenance_path slot_usage: - contains_or_contained: + has_or_had_level: range: integer minimum_value: 1 maximum_value: 6 + description: Heading level (1-6). MIGRATED from heading_level/contains_or_contained duplicate. contains_or_contained: - required: true - content_block: - multivalued: true - contains_or_contained: - range: PageSection - multivalued: true - inlined_as_list: true - contains_or_contained: - range: PageLink multivalued: true inlined_as_list: true + any_of: + - range: PageSection + - range: PageLink has_or_had_featured_item: range: FeaturedCard multivalued: true diff --git a/schemas/20251121/linkml/modules/classes/LegalName.yaml b/schemas/20251121/linkml/modules/classes/LegalName.yaml index e0dbd4dd33..6bd5634b59 100644 --- a/schemas/20251121/linkml/modules/classes/LegalName.yaml +++ b/schemas/20251121/linkml/modules/classes/LegalName.yaml @@ -56,7 +56,7 @@ classes: description: Unique identifier for this legal name record range: uriorcurie required: true - full_name: + has_or_had_label: slot_uri: rov:legalName description: 'Complete legal name including organizational type. diff --git a/schemas/20251121/linkml/modules/classes/ProfileData.yaml b/schemas/20251121/linkml/modules/classes/ProfileData.yaml index 2d5074ef8f..6acf1d1647 100644 --- a/schemas/20251121/linkml/modules/classes/ProfileData.yaml +++ b/schemas/20251121/linkml/modules/classes/ProfileData.yaml @@ -29,7 +29,7 @@ classes: - schema:Person - prov:Entity attributes: - full_name: + has_or_had_label: range: string description: Person's full name has_or_had_title: diff --git a/schemas/20251121/linkml/modules/slots/ceases_or_ceased_through.yaml b/schemas/20251121/linkml/modules/slots/ceases_or_ceased_through.yaml index 3f74aa91c0..c16f926f28 100644 --- a/schemas/20251121/linkml/modules/slots/ceases_or_ceased_through.yaml +++ b/schemas/20251121/linkml/modules/slots/ceases_or_ceased_through.yaml @@ -1,9 +1,27 @@ +id: https://nde.nl/ontology/hc/slot/ceases_or_ceased_through name: ceases_or_ceased_through -description: >- - The event through which an entity ceases or ceased to exist/operate. - MIGRATED from cessation_observed_in (Rule 53). -range: CeasingEvent -slot_uri: prov:wasInvalidatedBy -exact_mappings: - - crm:P93i_was_taken_out_of_existence_by -multivalued: true +title: Ceases Or Ceased Through +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + ceases_or_ceased_through: + description: >- + The event through which an entity ceases or ceased to exist/operate. + MIGRATED from cessation_observed_in (Rule 53). + range: CeasingEvent + slot_uri: prov:wasInvalidatedBy + exact_mappings: + - crm:P93i_was_taken_out_of_existence_by + multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_arrangement_level.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_arrangement_level.yaml index 40221ec683..c405af597a 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_arrangement_level.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_arrangement_level.yaml @@ -1,10 +1,28 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_arrangement_level name: has_or_had_arrangement_level -description: The level of arrangement of the record set or information carrier. -title: has or had arrangement level -slot_uri: rico:hasRecordSetType -range: ArrangementLevel -multivalued: false -exact_mappings: - - isad:level_of_description -close_mappings: - - rico:RecordSetType +title: Has Or Had Arrangement Level +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_arrangement_level: + description: The level of arrangement of the record set or information carrier. + title: has or had arrangement level + slot_uri: rico:hasRecordSetType + range: ArrangementLevel + multivalued: false + exact_mappings: + - isad:level_of_description + close_mappings: + - rico:RecordSetType diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_assessment.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_assessment.yaml index 6c1736cd35..fb7069ccae 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_assessment.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_assessment.yaml @@ -1,8 +1,26 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_assessment name: has_or_had_assessment -description: >- - Assessment associated with the entity. - Range should be an Assessment class. - MIGRATED from heritage_relevance (for LinkedInProfile) per Rule 53. -slot_uri: crm:P140i_was_attributed_by -range: Any -multivalued: true +title: Has Or Had Assessment +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_assessment: + description: >- + Assessment associated with the entity. + Range should be an Assessment class. + MIGRATED from heritage_relevance (for LinkedInProfile) per Rule 53. + slot_uri: crm:P140i_was_attributed_by + range: Any + multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_citation.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_citation.yaml index a2a58984a9..70cae23546 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_citation.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_citation.yaml @@ -1,7 +1,25 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_citation name: has_or_had_citation -description: >- - A bibliographic citation for the resource. - MIGRATED from citation (Rule 53). -range: string -slot_uri: schema:citation -multivalued: true +title: Has Or Had Citation +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_citation: + description: >- + A bibliographic citation for the resource. + MIGRATED from citation (Rule 53). + range: string + slot_uri: schema:citation + multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_city_code.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_city_code.yaml index 1def168017..bd3fdc2d37 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_city_code.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_city_code.yaml @@ -1,7 +1,25 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_city_code name: has_or_had_city_code -description: >- - The 3-letter city/settlement code (e.g., AMS for Amsterdam). - MIGRATED from city_code (Rule 53). -range: string -slot_uri: schema:code -multivalued: false +title: Has Or Had City Code +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_city_code: + description: >- + The 3-letter city/settlement code (e.g., AMS for Amsterdam). + MIGRATED from city_code (Rule 53). + range: string + slot_uri: schema:code + multivalued: false diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_embargo_end_date.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_embargo_end_date.yaml index 71ad50c3bf..5b719cdfc6 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_embargo_end_date.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_embargo_end_date.yaml @@ -1,11 +1,29 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_embargo_end_date name: has_or_had_embargo_end_date -description: >- - The date when an embargo or restriction ends. -title: has or had embargo end date -slot_uri: premis:endDate -range: date -multivalued: false -exact_mappings: - - premis:endDate -close_mappings: - - rico:hasEndDate +title: Has Or Had Embargo End Date +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_embargo_end_date: + description: >- + The date when an embargo or restriction ends. + title: has or had embargo end date + slot_uri: premis:endDate + range: date + multivalued: false + exact_mappings: + - premis:endDate + close_mappings: + - rico:hasEndDate diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_embargo_reason.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_embargo_reason.yaml index 354b04a358..0bde3ef03b 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_embargo_reason.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_embargo_reason.yaml @@ -1,7 +1,25 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_embargo_reason name: has_or_had_embargo_reason -description: >- - The reason for an embargo or restriction. -title: has or had embargo reason -slot_uri: rico:conditionsOfAccess -range: string -multivalued: true +title: Has Or Had Embargo Reason +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_embargo_reason: + description: >- + The reason for an embargo or restriction. + title: has or had embargo reason + slot_uri: rico:conditionsOfAccess + range: string + multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_exhibition_type.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_exhibition_type.yaml index 27cd868ab7..bab722343d 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_exhibition_type.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_exhibition_type.yaml @@ -1,9 +1,27 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_exhibition_type name: has_or_had_exhibition_type -description: >- - The type or category of an exhibition (e.g., Permanent, Temporary, Traveling). -title: has or had exhibition type -slot_uri: rico:hasEventType -close_mappings: - - crm:P2_has_type -range: string -multivalued: true +title: Has Or Had Exhibition Type +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_exhibition_type: + description: >- + The type or category of an exhibition (e.g., Permanent, Temporary, Traveling). + title: has or had exhibition type + slot_uri: rico:hasEventType + close_mappings: + - crm:P2_has_type + range: string + multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_extent_text.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_extent_text.yaml index a7836602a5..2479a48578 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_extent_text.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_extent_text.yaml @@ -1,9 +1,27 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_extent_text name: has_or_had_extent_text -description: >- - Textual description of the extent of an entity (e.g., '300 boxes', '2 linear meters'). -title: has or had extent text -slot_uri: rico:hasExtent -exact_mappings: - - rico:hasExtent -range: string -multivalued: true +title: Has Or Had Extent Text +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_extent_text: + description: >- + Textual description of the extent of an entity (e.g., '300 boxes', '2 linear meters'). + title: has or had extent text + slot_uri: rico:hasExtent + exact_mappings: + - rico:hasExtent + range: string + multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_file_location.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_file_location.yaml index d6b16b3a2c..db82040e98 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_file_location.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_file_location.yaml @@ -1,7 +1,25 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_file_location name: has_or_had_file_location -description: >- - The location of a file. - MIGRATED from html_snapshot_path (Rule 53). -range: FileLocation -slot_uri: skos:note -multivalued: true +title: Has Or Had File Location +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_file_location: + description: >- + The location of a file. + MIGRATED from html_snapshot_path (Rule 53). + range: FileLocation + slot_uri: skos:note + multivalued: true diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_parent.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_parent.yaml index f00dc647b3..c50310ac14 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_parent.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_parent.yaml @@ -1,7 +1,25 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_parent name: has_or_had_parent -description: >- - The parent entity of this entity. - MIGRATED from parent_chapter_id (Rule 53). -range: Any -slot_uri: schema:parent -multivalued: false +title: Has Or Had Parent +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_parent: + description: >- + The parent entity of this entity. + MIGRATED from parent_chapter_id (Rule 53). + range: Any + slot_uri: schema:parent + multivalued: false diff --git a/schemas/20251121/linkml/modules/slots/has_or_had_sequence_index.yaml b/schemas/20251121/linkml/modules/slots/has_or_had_sequence_index.yaml index c090c3eb9c..9984c98405 100644 --- a/schemas/20251121/linkml/modules/slots/has_or_had_sequence_index.yaml +++ b/schemas/20251121/linkml/modules/slots/has_or_had_sequence_index.yaml @@ -1,9 +1,27 @@ +id: https://nde.nl/ontology/hc/slot/has_or_had_sequence_index name: has_or_had_sequence_index -description: >- - The sequence index or order of an item (e.g. chapter number, page number). - MIGRATED from chapter_index (Rule 53). -range: integer -slot_uri: schema:position -exact_mappings: - - schema:position -multivalued: false +title: Has Or Had Sequence Index +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + has_or_had_sequence_index: + description: >- + The sequence index or order of an item (e.g. chapter number, page number). + MIGRATED from chapter_index (Rule 53). + range: integer + slot_uri: schema:position + exact_mappings: + - schema:position + multivalued: false diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_documented_by.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_documented_by.yaml index d1ab2f23c8..23fb08ec21 100644 --- a/schemas/20251121/linkml/modules/slots/is_or_was_documented_by.yaml +++ b/schemas/20251121/linkml/modules/slots/is_or_was_documented_by.yaml @@ -1,11 +1,29 @@ +id: https://nde.nl/ontology/hc/slot/is_or_was_documented_by name: is_or_was_documented_by -description: >- - Indicates that the entity is or was documented by another resource (e.g., a FinancialStatement documenting a Budget). -title: is or was documented by -slot_uri: schema:subjectOf -range: ReconstructedEntity -multivalued: true -exact_mappings: - - crm:P70i_is_documented_in -close_mappings: - - rico:isOrWasSubjectOf +title: Is Or Was Documented By +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + is_or_was_documented_by: + description: >- + Indicates that the entity is or was documented by another resource (e.g., a FinancialStatement documenting a Budget). + title: is or was documented by + slot_uri: schema:subjectOf + range: ReconstructedEntity + multivalued: true + exact_mappings: + - crm:P70i_is_documented_in + close_mappings: + - rico:isOrWasSubjectOf diff --git a/schemas/20251121/linkml/modules/slots/is_or_was_observed_by.yaml b/schemas/20251121/linkml/modules/slots/is_or_was_observed_by.yaml index 599863b35b..498b819a06 100644 --- a/schemas/20251121/linkml/modules/slots/is_or_was_observed_by.yaml +++ b/schemas/20251121/linkml/modules/slots/is_or_was_observed_by.yaml @@ -1,9 +1,27 @@ +id: https://nde.nl/ontology/hc/slot/is_or_was_observed_by name: is_or_was_observed_by -description: >- - The observation that documented this event or state. - MIGRATED from cessation_observed_in (Rule 53). -range: CustodianObservation -slot_uri: prov:wasGeneratedBy -exact_mappings: - - prov:wasGeneratedBy -multivalued: true +title: Is Or Was Observed By +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + rdfs: http://www.w3.org/2000/01/rdf-schema# + org: http://www.w3.org/ns/org# + xsd: http://www.w3.org/2001/XMLSchema# +imports: +- linkml:types +default_prefix: hc +slots: + is_or_was_observed_by: + description: >- + The observation that documented this event or state. + MIGRATED from cessation_observed_in (Rule 53). + range: CustodianObservation + slot_uri: prov:wasGeneratedBy + exact_mappings: + - prov:wasGeneratedBy + multivalued: true diff --git a/scripts/fix_linkml_metadata.py b/scripts/fix_linkml_metadata.py deleted file mode 100644 index acc248f7f6..0000000000 --- a/scripts/fix_linkml_metadata.py +++ /dev/null @@ -1,111 +0,0 @@ -import os -import re - -directory = "schemas/20251121/linkml/modules/classes/" - -prefixes_block = """prefixes: - linkml: https://w3id.org/linkml/ - schema: http://schema.org/ - skos: http://www.w3.org/2004/02/skos/core# - rico: https://www.ica.org/standards/RiC/ontology# - wd: http://www.wikidata.org/entity/ -""" - -imports_block = """imports: -- linkml:types -""" - -def split_camel_case(name): - return re.sub('([a-z0-9])([A-Z])', r'\1 \2', name) - -count = 0 - -for filename in os.listdir(directory): - if not filename.endswith(".yaml"): - continue - - filepath = os.path.join(directory, filename) - with open(filepath, 'r') as f: - content = f.read() - - if content.startswith("id:"): - continue # Already has metadata - - # Check if imports already exist in the file (even if unstructured) - has_imports = re.search(r"^imports:", content, re.MULTILINE) - - if not content.strip().startswith("classes:") and not has_imports: - # Some files might have comments at the top? - # If it doesn't start with classes: or id:, we should check. - # But my grep showed files starting with classes: - pass - - # Simple parsing - lines = content.splitlines() - class_name = None - description = None - - # Determine class name from filename first as fallback/confirmation - filename_class = filename.replace(".yaml", "") - - found_class_in_content = False - - for i, line in enumerate(lines): - if line.strip().startswith("classes:"): - # Look for class name in subsequent lines - for j in range(i+1, min(i+5, len(lines))): - # Matches " ClassName:" - match = re.match(r"^ ([a-zA-Z0-9_]+):", lines[j]) - if match: - class_name = match.group(1) - found_class_in_content = True - - # Look for description inside the class - for k in range(j+1, min(j+15, len(lines))): - # Matches " description: Value" - desc_match = re.match(r"^ description:\s+(.*)", lines[k]) - if desc_match: - description = desc_match.group(1).strip() - # Handle multi-line description if needed? - if description.startswith(">") or description.startswith("|"): - description = None - break - break - break - - if not class_name: - # Fallback to filename if parsing failed (e.g. if file is empty or weird) - class_name = filename_class - - # Ensure class name matches filename (convention) - if class_name != filename_class: - print(f"Warning: Class name '{class_name}' in content differs from filename '{filename_class}'. Using filename.") - class_name = filename_class - - title = split_camel_case(class_name) - if not description: - description = f"LinkML class definition for {title}" - else: - # Strip quotes if present - if (description.startswith('"') and description.endswith('"')) or (description.startswith("'") and description.endswith("'")): - description = description[1:-1] - - # Construct new content - new_header = f"id: https://nde.nl/ontology/hc/class/{class_name}\n" - new_header += f"name: {class_name}\n" - new_header += f"title: {title}\n" - new_header += f"description: {description}\n" - new_header += prefixes_block - - if not has_imports: - new_header += imports_block - - new_content = new_header + content - - with open(filepath, 'w') as f: - f.write(new_content) - - count += 1 - # print(f"Updated {filename}") - -print(f"Total files updated: {count}") diff --git a/scripts/fix_specific_dead_links.py b/scripts/fix_specific_dead_links.py new file mode 100644 index 0000000000..8498298523 --- /dev/null +++ b/scripts/fix_specific_dead_links.py @@ -0,0 +1,119 @@ +import os +import re + +SCHEMA_DIR = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes/" + +# Mappings for ExtractionMetadata.yaml +EXTRACTION_METADATA_MAP = { + "extraction_agent": "is_or_was_retrieved_by", + "extraction_method": "has_or_had_method", + "extraction_date": "retrieval_timestamp", + "cost_usd": "has_or_had_expense", + "source_file": "has_or_had_source", + "staff_id": "has_or_had_identifier", + "linkedin_url": "has_or_had_url", + "request_id": "has_or_had_identifier" # request_id also maps to identifier +} + +# General mapping for full_name +FULL_NAME_MAP = { + "full_name": "has_or_had_label" +} + +def fix_extraction_metadata(): + filepath = os.path.join(SCHEMA_DIR, "ExtractionMetadata.yaml") + if not os.path.exists(filepath): return + + print(f"Fixing {filepath}...") + with open(filepath, 'r') as f: + lines = f.readlines() + + new_lines = [] + + # Track which new slots we've already imported to avoid dupes + added_imports = set() + + for line in lines: + stripped = line.strip() + replaced = False + + # 1. Imports + if stripped.startswith("- ../slots/"): + slot_name = stripped.split("/")[-1] + if slot_name in EXTRACTION_METADATA_MAP: + new_slot = EXTRACTION_METADATA_MAP[slot_name] + if new_slot not in added_imports: + new_lines.append(line.replace(slot_name, new_slot)) + added_imports.add(new_slot) + replaced = True + + # 2. Slots list + elif stripped.startswith("- ") and stripped[2:] in EXTRACTION_METADATA_MAP: + slot_name = stripped[2:] + new_slot = EXTRACTION_METADATA_MAP[slot_name] + # Avoid duplicate slots in list if possible, but simple replacement is safer than deletion logic + # However, request_id and staff_id BOTH map to has_or_had_identifier. + # If we just replace, we get duplicates. + # LinkML allows duplicate slot entries (it dedupes), but cleaner to avoid. + # But simple replace is fine for now. + new_lines.append(line.replace(slot_name, new_slot)) + replaced = True + + # 3. Slot usage keys + elif stripped.endswith(":") and stripped[:-1] in EXTRACTION_METADATA_MAP: + slot_name = stripped[:-1] + new_slot = EXTRACTION_METADATA_MAP[slot_name] + new_lines.append(line.replace(slot_name, new_slot)) + replaced = True + + if not replaced: + new_lines.append(line) + + with open(filepath, 'w') as f: + f.writelines(new_lines) + +def fix_full_name(filename): + filepath = os.path.join(SCHEMA_DIR, filename) + if not os.path.exists(filepath): return + + print(f"Fixing {filepath}...") + with open(filepath, 'r') as f: + lines = f.readlines() + + new_lines = [] + for line in lines: + stripped = line.strip() + replaced = False + + # Imports + if stripped == "- ../slots/full_name": + new_lines.append(line.replace("full_name", "has_or_had_label")) + replaced = True + + # Slots list + elif stripped == "- full_name": + new_lines.append(line.replace("full_name", "has_or_had_label")) + replaced = True + + # Slot usage key + elif stripped == "full_name:": + new_lines.append(line.replace("full_name:", "has_or_had_label:")) + replaced = True + + if not replaced: + new_lines.append(line) + + with open(filepath, 'w') as f: + f.writelines(new_lines) + +def main(): + fix_extraction_metadata() + fix_full_name("FindingAid.yaml") + fix_full_name("OrganizationBranch.yaml") + fix_full_name("DigitalPlatformV2OrganizationStatus.yaml") # Also flagged + fix_full_name("LegalName.yaml") # Also flagged + fix_full_name("CustodianLegalStatus.yaml") # Also flagged + fix_full_name("ProfileData.yaml") # Also flagged + +if __name__ == "__main__": + main() diff --git a/scripts/trace_dead_links.py b/scripts/trace_dead_links.py index e38324c89f..706a9862e5 100644 --- a/scripts/trace_dead_links.py +++ b/scripts/trace_dead_links.py @@ -21,6 +21,15 @@ def get_archived_slot_names(): def find_references(archived_slots): references = {} # {slot_name: [file_paths]} + # Metadata keys that mimic slot names but are valid LinkML structure + # We ignore "Usage as key" for these + SAFE_METADATA_KEYS = { + "title", "description", "name", "id", "status", "notes", "comments", "examples", + "todos", "see_also", "range", "slot_usage", "required", "multivalued", + "inlined", "identifier", "value", "unit", "prefixes", "imports", "classes", + "slots", "attributes", "exact_mappings", "close_mappings", "related_mappings" + } + class_files = glob.glob(os.path.join(CLASSES_DIR, "*.yaml")) for cls_file in class_files: @@ -30,28 +39,30 @@ def find_references(archived_slots): for i, line in enumerate(lines): stripped = line.strip() - # Check for imports: "- ../slots/slotname" - # Check for slot usage: "- slotname" (in slots list) - # Check for slot_usage keys: "slotname:" - for slot in archived_slots: - # Import check - if f"../slots/{slot}" in stripped and not stripped.strip().startswith("#"): + # Import check: "- ../slots/slotname" + if f"../slots/{slot}" in stripped and not stripped.startswith("#"): if slot not in references: references[slot] = [] references[slot].append(f"{cls_file} (line {i+1}): Import") continue - # Loose usage check (can be false positive if slot name is common word like 'description') - # But we restrict to archived slots. - - # Check for "- slotname" + # Usage in slots list: "- slotname" + # Must be exact match to avoid partials if stripped == f"- {slot}": if slot not in references: references[slot] = [] references[slot].append(f"{cls_file} (line {i+1}): Usage in slots list") continue - # Check for "slotname:" + # Usage as key: "slotname:" if stripped.startswith(f"{slot}:"): + # Check if it's a safe metadata key + if slot in SAFE_METADATA_KEYS: + continue + + # Also, if we are inside a slot_usage block, "slotname:" is valid ONLY IF + # we are refining that slot. But if the slot is archived, we shouldn't be refining it! + # So "Usage as key" is actually relevant for slot_usage of archived slots. + if slot not in references: references[slot] = [] references[slot].append(f"{cls_file} (line {i+1}): Usage as key") continue