Person Enrichment Scripts: - enrich_person_comprehensive.py: Full-featured web search enrichment via Linkup with Rule 6/21/26/34/35 compliance (dual timestamps, no fabrication) - enrich_ppids_linkup.py: Batch PPID enrichment pipeline - extract_persons_with_provenance.py: Extract person data from LinkedIn HTML with XPath provenance tracking LinkML Slot Management: - update_slot_mappings.py: Update slots for RiC-O naming (Rule 39) and semantic URI requirements (Rule 38) - update_class_slot_references.py: Update class files referencing renamed slots - validate_slot_mappings.py: Validate slot definitions against ontology rules All scripts follow established project conventions for provenance and ontology alignment.
474 lines
23 KiB
Python
474 lines
23 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Validate slot mappings against actual ontology predicates.
|
|
|
|
This script checks each slot's mappings against the predicates actually
|
|
defined in the ontology files at data/ontology/.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
import yaml
|
|
|
|
# Allow-list of CURIE predicates accepted for slot_uri and *_mappings values.
# The original author recorded these as extracted from the ontology files under
# data/ontology/. Entries are grouped by vocabulary; the file name in
# parentheses is the ontology file the first run of each group was noted as
# verified against (where one was recorded). Runs marked "later additions" were
# appended in later batches and carry no such note.
VALID_PREDICATES = {
    # ---- Schema.org (schemaorg.owl) ----
    "schema:about", "schema:abstract", "schema:acceptedPaymentMethod", "schema:accessibilityFeature",
    "schema:accessibilityHazard", "schema:accessibilitySummary", "schema:accessMode",
    "schema:accessModeSufficient", "schema:acquiredFrom", "schema:additionalProperty",
    "schema:additionalType", "schema:address",
    "schema:addressLocality", "schema:addressRegion", "schema:affiliation", "schema:age",
    "schema:aggregateRating", "schema:alternateName",
    "schema:alternativeHeadline", "schema:alumniOf", "schema:amenityFeature",
    "schema:applicationDeadline", "schema:areaServed",
    "schema:archivedAt", "schema:attendee", "schema:audience", "schema:author",
    "schema:availabilityStarts", "schema:availabilityEnds",
    "schema:award", "schema:birthDate", "schema:birthPlace", "schema:businessFunction",
    "schema:collection", "schema:commentCount",
    "schema:conditionsOfAccess", "schema:contactPoint", "schema:containsPlace",
    "schema:contributor", "schema:creator",
    "schema:dateCreated", "schema:dateModified", "schema:datePublished", "schema:deathDate",
    "schema:deathPlace", "schema:description",
    "schema:documentation", "schema:duration", "schema:email", "schema:employee",
    "schema:encodingFormat",
    "schema:endDate", "schema:event", "schema:eventStatus", "schema:faxNumber",
    "schema:familyName",
    "schema:foundingDate", "schema:foundingLocation", "schema:funder", "schema:funding",
    "schema:geo",
    "schema:givenName", "schema:hasCourse", "schema:hasCourseInstance", "schema:hasCredential",
    "schema:hasOfferCatalog",
    "schema:hasPart", "schema:holdingArchive", "schema:identifier", "schema:image",
    "schema:inLanguage",
    "schema:includedInDataCatalog", "schema:isAccessibleForFree", "schema:isPartOf",
    "schema:isRelatedTo", "schema:issuedBy",
    "schema:itemListElement", "schema:knowsAbout", "schema:knowsLanguage", "schema:latitude",
    "schema:legalName", "schema:location",
    "schema:logo", "schema:longitude", "schema:mainEntityOfPage", "schema:makesOffer",
    "schema:maximumAttendeeCapacity", "schema:member", "schema:memberOf",
    "schema:name", "schema:numberOfEmployees", "schema:numberOfItems", "schema:offers",
    "schema:openingHours",
    "schema:parentOrganization", "schema:paymentAccepted", "schema:performer", "schema:photo",
    "schema:postalCode",
    "schema:potentialAction", "schema:price", "schema:priceRange", "schema:publicAccess",
    "schema:publishingPrinciples", "schema:ratingValue", "schema:recognizedBy", "schema:roleName",
    "schema:reservationRequired", "schema:review", "schema:sameAs", "schema:seller",
    "schema:serviceType",
    "schema:size", "schema:softwareApplication", "schema:sponsor", "schema:startDate",
    "schema:streetAddress", "schema:subjectOf",
    "schema:subtitleLanguage", "schema:telephone", "schema:text", "schema:url", "schema:value",
    "schema:version",
    "schema:videoFrameSize",
    # later additions
    "schema:addressCountry", "schema:audienceType", "schema:contentUrl", "schema:director",
    "schema:dissolutionDate", "schema:educationalLevel", "schema:editor", "schema:eligibleRegion",
    "schema:elevation", "schema:eventSchedule", "schema:expires", "schema:floorSize",
    "schema:gender", "schema:genre", "schema:homeLocation", "schema:jobTitle",
    "schema:locationCreated", "schema:organizer", "schema:owns", "schema:position",
    "schema:priceCurrency", "schema:propertyID", "schema:requiredFeatures", "schema:scheduledTime",
    "schema:servesCuisine", "schema:subOrganization", "schema:teaches", "schema:validFrom",
    "schema:valuePattern", "schema:warning", "schema:workExample", "schema:workFeatured",
    "schema:availableOnDevice", "schema:citation",
    "schema:agent", "schema:courseCode", "schema:department", "schema:educationalProgramMode",
    "schema:height", "schema:organization", "schema:participant", "schema:width",
    "schema:isbn", "schema:keywords", "schema:category", "schema:educationalUse",
    "schema:validThrough", "schema:maintainer", "schema:usageInfo", "schema:approximateValue",
    "schema:applicationContact", "schema:legalForm", "schema:hasOccupation",
    "schema:artMedium", "schema:legislationIdentifier", "schema:eligibilityToWorkRequirement",
    "schema:organizationRole", "schema:softwareVersion", "schema:mainEntity",

    # ---- Dublin Core Terms (dublin_core_elements.rdf and usage) ----
    "dcterms:abstract", "dcterms:accessRights", "dcterms:accrualPeriodicity", "dcterms:audience",
    "dcterms:conformsTo", "dcterms:contributor", "dcterms:coverage", "dcterms:creator",
    "dcterms:date", "dcterms:dateAccepted",
    "dcterms:dateSubmitted", "dcterms:description", "dcterms:extent", "dcterms:format",
    "dcterms:hasPart", "dcterms:hasVersion",
    "dcterms:identifier", "dcterms:isPartOf", "dcterms:isReferencedBy", "dcterms:isReplacedBy",
    "dcterms:issued", "dcterms:language", "dcterms:license", "dcterms:mediator", "dcterms:medium",
    "dcterms:modified", "dcterms:provenance", "dcterms:publisher", "dcterms:references",
    "dcterms:relation",
    "dcterms:replaces", "dcterms:rights", "dcterms:rightsHolder", "dcterms:source",
    "dcterms:spatial",
    "dcterms:subject", "dcterms:tableOfContents", "dcterms:temporal", "dcterms:title",
    "dcterms:type",
    "dcterms:valid",
    # later additions (note: both the dcterms: and dct: prefixes are accepted)
    "dcterms:bibliographicCitation", "dcterms:requires", "dcterms:created",
    "dct:type", "dct:identifier", "dct:language",

    # ---- RiC-O (RiC-O_1-1.rdf) ----
    "rico:accrualsStatus", "rico:accumulationDate", "rico:affectsOrAffected",
    "rico:authenticityNote",
    "rico:conditionsOfAccess", "rico:conditionsOfUse", "rico:containsOrContained", "rico:date",
    "rico:describesOrDescribed", "rico:generalDescription", "rico:hasAccumulationDate",
    "rico:hasBeginningDate",
    "rico:hasEndDate", "rico:hasOrHadAgentName", "rico:hasOrHadAllMembersWithContentType",
    "rico:hasOrHadAppellation", "rico:hasOrHadComponent", "rico:hasOrHadConstituent",
    "rico:hasOrHadController", "rico:hasOrHadCoordinates", "rico:hasOrHadHolder",
    "rico:hasOrHadIdentifier",
    "rico:hasOrHadLanguage", "rico:hasOrHadLegalStatus", "rico:hasOrHadLocation",
    "rico:hasOrHadMainSubject",
    "rico:hasOrHadManager", "rico:hasOrHadMember", "rico:hasOrHadName", "rico:hasOrHadOwner",
    "rico:hasOrHadPart", "rico:hasOrHadPhysicalLocation", "rico:hasOrHadPosition",
    "rico:hasOrHadSubdivision",
    "rico:hasOrHadSubject", "rico:hasOrHadSubordinate", "rico:hasOrHadType",
    "rico:hasRecordSetType",
    "rico:hasRecordState", "rico:history", "rico:identifier", "rico:includesOrIncluded",
    "rico:isOrWasAffectedBy", "rico:isOrWasComponentOf", "rico:isOrWasConstituentOf",
    "rico:isOrWasDescribedBy", "rico:isOrWasHolderOf", "rico:isOrWasIncludedIn",
    "rico:isOrWasLocationOf",
    "rico:isOrWasMemberOf", "rico:isOrWasPartOf", "rico:isOrWasSubdivisionOf",
    "rico:isOrWasSubjectOf",
    "rico:isOrWasSubordinateTo", "rico:isRelatedTo", "rico:isTriggeredByEvent", "rico:name",
    "rico:note",
    "rico:scopeAndContent", "rico:title", "rico:type",
    # later additions
    "rico:isDescribedBy", "rico:hasInstantiation", "rico:hasContentOfType",
    "rico:hasDateRange", "rico:hasOrHadAgent", "rico:hasOrHadActivityType",
    "rico:hasOrHadArrangement", "rico:hasAccessionNumber",
    "rico:isManagerOf",

    # ---- PROV-O (prov-o.ttl) ----
    "prov:actedOnBehalfOf", "prov:activity", "prov:agent", "prov:atLocation", "prov:atTime",
    "prov:endedAtTime", "prov:entity", "prov:generated", "prov:generatedAtTime", "prov:hadPlan",
    "prov:hadPrimarySource", "prov:hadReason", "prov:hadRole", "prov:influenced",
    "prov:invalidatedAtTime",
    "prov:qualifiedAssociation", "prov:qualifiedAttribution", "prov:qualifiedDerivation",
    "prov:qualifiedGeneration",
    "prov:qualifiedInfluence", "prov:startedAtTime", "prov:used", "prov:value",
    "prov:wasAssociatedWith",
    "prov:wasAttributedTo", "prov:wasDerivedFrom", "prov:wasGeneratedBy", "prov:wasInfluencedBy",
    "prov:wasInvalidatedBy", "prov:wasRevisionOf",
    # later additions
    "prov:informed", "prov:alternateOf", "prov:hadDerivation",

    # ---- SKOS (skos.rdf) ----
    "skos:altLabel", "skos:broader", "skos:broaderTransitive", "skos:broadMatch",
    "skos:closeMatch",
    "skos:definition", "skos:exactMatch", "skos:example", "skos:hiddenLabel", "skos:narrower",
    "skos:narrowerTransitive", "skos:narrowMatch", "skos:notation", "skos:note",
    "skos:prefLabel",
    "skos:related", "skos:relatedMatch", "skos:scopeNote",
    # later additions
    "skos:inScheme", "skos:topConceptOf", "skos:hasTopConcept", "skos:member",
    "skos:memberList", "skos:changeNote", "skos:editorialNote", "skos:historyNote",

    # ---- FOAF (foaf.ttl) ----
    "foaf:account", "foaf:accountName", "foaf:age", "foaf:based_near", "foaf:birthday",
    "foaf:depiction", "foaf:familyName",
    "foaf:firstName", "foaf:gender", "foaf:givenName", "foaf:homepage", "foaf:img",
    "foaf:interest",
    "foaf:isPrimaryTopicOf", "foaf:knows", "foaf:lastName", "foaf:logo", "foaf:made",
    "foaf:maker",
    "foaf:mbox", "foaf:member", "foaf:name", "foaf:nick", "foaf:page", "foaf:phone",
    "foaf:primaryTopic",
    "foaf:publications", "foaf:surname", "foaf:title", "foaf:topic", "foaf:weblog",
    "foaf:workplaceHomepage",

    # ---- ORG (org.rdf) ----
    "org:changedBy", "org:classification", "org:hasMembership", "org:hasSite",
    "org:hasSubOrganization",
    "org:hasUnit", "org:headOf", "org:identifier", "org:linkedTo", "org:member", "org:memberOf",
    "org:organization", "org:originalOrganization", "org:purpose", "org:reportsTo",
    "org:resultedFrom",
    "org:resultingOrganization", "org:role", "org:siteOf", "org:subOrganizationOf", "org:unitOf",
    # later additions
    "org:hasMember", "org:name", "org:OrganizationalUnit",
    "org:basedAt", "org:siteAddress",

    # ---- DCAT (dcat3.ttl) ----
    "dcat:accessService", "dcat:accessURL", "dcat:catalog", "dcat:contactPoint", "dcat:dataset",
    "dcat:distribution", "dcat:downloadURL", "dcat:endDate", "dcat:endpointDescription",
    "dcat:endpointURL",
    "dcat:hasCurrentVersion", "dcat:hasVersion", "dcat:inCatalog", "dcat:keyword",
    "dcat:landingPage",
    "dcat:mediaType", "dcat:qualifiedRelation", "dcat:startDate", "dcat:theme", "dcat:version",
    # later additions
    "dcat:servesDataset", "dcat:checksum",

    # ---- CIDOC-CRM (CIDOC_CRM_v7.1.3.rdf, common predicates) ----
    "crm:P1_is_identified_by", "crm:P2_has_type", "crm:P3_has_note", "crm:P4_has_time-span",
    "crm:P7_took_place_at", "crm:P12_occurred_in_the_presence_of", "crm:P14_carried_out_by",
    "crm:P14.1_in_the_role_of", "crm:P16_used_specific_object", "crm:P29_custody_received_by",
    "crm:P31i_was_modified_by", "crm:P43_has_dimension", "crm:P44_has_condition",
    "crm:P46_is_composed_of",
    "crm:P46i_forms_part_of", "crm:P48_has_preferred_identifier", "crm:P50_has_current_keeper",
    "crm:P52_has_current_owner", "crm:P81b_begin_of_the_end", "crm:P82a_begin_of_the_begin",
    "crm:P98i_was_born", "crm:P128_carries", "crm:P141_assigned",
    # later additions
    "crm:P11_had_participant", "crm:P12i_was_present_at", "crm:P23_transferred_title_from",
    "crm:P33_used_specific_technique", "crm:P62_depicts", "crm:P81a_end_of_the_begin",
    "crm:P82b_end_of_the_end", "crm:P1i_identifies", "crm:P48i_is_preferred_identifier_of",
    "crm:P147_curated", "crm:P147i_was_curated_by", "crm:P148_has_component",
    "crm:P45_consists_of", "crm:P126_employed", "crm:P140_assigned_attribute_to",
    "crm:P138_represents",
    "crm:P28_custody_surrendered_by", "crm:P30_transferred_custody_of",
    "crm:P30i_custody_transferred_through", "crm:P50i_is_current_keeper_of",
    "crm:P70_documents", "crm:P70i_is_documented_in",

    # ---- EDM (edm.owl) ----
    "edm:aggregatedCHO", "edm:begin", "edm:collectionName", "edm:end", "edm:happenedAt",
    "edm:hasMet",
    "edm:hasView", "edm:isNextInSequence", "edm:isRelatedTo", "edm:isShownAt", "edm:isShownBy",
    "edm:isSimilarTo", "edm:occurredAt", "edm:rights", "edm:wasPresentAt",
    # later additions
    "edm:provider", "edm:dataProvider", "edm:object", "edm:preview", "edm:country",

    # ---- ORE (ore.rdf) ----
    "ore:aggregates", "ore:describes", "ore:isAggregatedBy", "ore:proxyFor", "ore:proxyIn",

    # ---- GLEIF (gleif_base.ttl) ----
    "gleif-base:hasAbbreviation", "gleif-base:hasAbbreviationLocal",
    "gleif-base:hasAbbreviationTransliterated",
    "gleif-base:hasLegalName", "gleif-base:hasLegalNameLocal",
    "gleif-base:hasLegalNameTransliterated",
    # later additions (note: gleif-base:, gleif_base: and gleif: prefixes all occur)
    "gleif_base:hasCoverageArea", "gleif_base:hasLegalForm",
    "gleif_base:hasLegalJurisdiction", "gleif_base:isManagedBy",
    "gleif:hasLegalForm", "gleif:hasEntityStatus", "gleif:hasLegalAddress",

    # ---- GeoNames (geonames_ontology.rdf) ----
    "gn:alternateName", "gn:countryCode", "gn:featureClass", "gn:featureCode", "gn:geonamesID",
    "gn:lat", "gn:locatedIn", "gn:locationMap", "gn:long", "gn:name", "gn:nearby",
    "gn:officialName",
    "gn:parentCountry", "gn:parentFeature", "gn:population", "gn:postalCode", "gn:shortName",
    "gn:wikipediaArticle",
    # later additions (geonames: prefix)
    "geonames:featureClass", "geonames:featureCode",

    # ---- GeoSPARQL (commonly used) ----
    "geo:alt", "geo:asWKT", "geo:hasGeometry", "geo:lat", "geo:long",
    "geosparql:hasBoundingBox", "geosparql:hasGeometry", "geosparql:asWKT",
    # later additions
    "geosparql:hasSpatialResolution", "geosparql:hasCentroid", "geosparql:sfContains",

    # ---- WGS84 (commonly used) ----
    "wgs84:alt", "wgs84:lat", "wgs84:long",

    # ---- RDFS / RDF / OWL (standard) ----
    "rdfs:comment", "rdfs:label", "rdfs:seeAlso", "rdfs:member",
    "rdf:type", "rdf:value",
    "owl:sameAs", "owl:equivalentClass", "owl:equivalentProperty",

    # ---- PREMIS (premis3.owl) ----
    "premis:hasRightsStatement",
    # later additions
    "premis:hasRepresentation", "premis:fixity", "premis:hasRelatedStatementInformation",
    "premis:hasIdentifier", "premis:hasEvent", "premis:hasAgent",

    # ---- BIBFRAME (bibframe.rdf) ----
    "bf:acquisitionSource", "bf:arrangement", "bf:binding", "bf:classification", "bf:code",
    "bf:contribution",
    "bf:creationDate", "bf:custodialHistory", "bf:shelfMark",
    # later additions
    "bf:extent", "bf:editionStatement", "bf:illustrationNote",

    # ---- DBpedia (commonly used) ----
    "dbp:abbreviation", "dbp:architecturalStyle", "dbp:programCost",

    # ---- GoodRelations (commonly used) ----
    "gr:acceptedPaymentMethods", "gr:eligibleCustomerTypes", "gr:hasPriceSpecification",

    # ---- Web Annotation (OA) ----
    "oa:annotatedBy", "oa:hasBody", "oa:hasSelector", "oa:hasTarget", "oa:motivatedBy",

    # ---- Darwin Core (dwc) ----
    "dwc:associatedTaxa", "dwc:dateIdentified", "dwc:eventDate", "dwc:fieldNumber",
    "dwc:locality",
    "dwc:recordedBy", "dwc:scientificName", "dwc:verbatimLocality", "dwc:vernacularName",
    # later additions
    "dwc:habitat", "dwc:higherClassification", "dwc:identificationQualifier",
    "dwc:occurrenceID",
    "dwc:lifeStage", "dwc:sex", "dwc:preparations", "dwc:recordNumber",

    # ---- LOCN (ISA Core Location) ----
    "locn:address", "locn:geometry", "locn:postCode", "locn:postName",

    # ---- vCard ----
    "vcard:country-name", "vcard:email", "vcard:hasEmail", "vcard:hasTelephone",
    "vcard:locality",
    "vcard:organization-name", "vcard:postal-code", "vcard:region", "vcard:street-address",
    "vcard:tel",

    # ---- PiCo (Person in Context) ----
    "pico:hasAffiliation", "pico:observedName",
    # later additions
    "pico:hasObservation", "pico:hasName", "pico:observationDate",
    "pico:hasReligion",

    # ---- PNV (Person Name Vocabulary) ----
    "pnv:baseSurname", "pnv:givenName", "pnv:initials", "pnv:literalName",
    "pnv:prefix", "pnv:suffix", "pnv:patronym", "pnv:hasName", "pnv:surname",
    "pnv:nameSpecification", "pnv:nameComponent", "pnv:surnamePrefix",

    # ---- TOOI (Dutch government) ----
    "tooi:onderwerp",
    # later additions
    "tooi:heeft_informatieobject", "tooi:naam", "tooi:begindatum", "tooi:einddatum",
    "tooi:organisatievorm", "tooi:rechtsvorm",

    # ---- LCC (Language codes) ----
    "lcc-lr:hasTag",

    # ---- PAV (Provenance, Authoring and Versioning) ----
    "pav:version",
    "pav:createdBy", "pav:authoredBy", "pav:contributedBy", "pav:curatedBy",
    "pav:createdOn", "pav:authoredOn", "pav:lastUpdateOn",

    # ---- Hydra ----
    "hydra:entrypoint",

    # ---- Custom HC predicates (allowed for domain-specific concepts) ----
    "hc:acceptsOrAcceptedExternalWork", "hc:acceptsOrAcceptedVisitingScholar",
    "hc:hasAirChangesPerHour", "hc:hasAllDataRealFlag", "hc:hasSearchScore",
    "hc:isApproximate",

    # ---- LDP (Linked Data Platform) ----
    "ldp:contains", "ldp:member", "ldp:memberSubject", "ldp:hasMemberRelation",

    # ---- ODRL (Open Digital Rights Language) ----
    "odrl:hasPolicy", "odrl:permission", "odrl:prohibition", "odrl:duty",
    "odrl:action", "odrl:assignee", "odrl:assigner", "odrl:constraint",

    # ---- BIBO (Bibliographic Ontology) ----
    "bibo:doi", "bibo:isbn", "bibo:issn", "bibo:edition", "bibo:volume", "bibo:pages",
    "bibo:abstract", "bibo:authorList", "bibo:editor",
    "bibo:isbn13", "bibo:isbn10", "bibo:oclcnum", "bibo:lccn",

    # ---- SPDX (Software Package Data Exchange) ----
    "spdx:checksumValue", "spdx:algorithm", "spdx:checksum",

    # ---- ADMS (Asset Description Metadata Schema) ----
    "adms:status", "adms:identifier", "adms:sample", "adms:translation",

    # ---- FRAPO (Funding, Research Administration and Projects Ontology) ----
    "frapo:hasFunding", "frapo:hasFundingProgram", "frapo:hasGrant",

    # ---- ROV (Registered Organization Vocabulary) ----
    "rov:orgType", "rov:legalName", "rov:orgStatus", "rov:orgActivity",

    # ---- CPOV (Core Public Organisation Vocabulary) ----
    "cpov:purpose", "cpov:hasSubOrganization", "cpov:address",

    # ---- SOSA (Sensor, Observation, Sample, and Actuator) ----
    "sosa:hosts", "sosa:hasResult", "sosa:observes", "sosa:madeObservation",
    "sosa:madeBySensor", "sosa:hasFeatureOfInterest", "sosa:isHostedBy",

    # ---- RDA (Resource Description and Access) ----
    "rda:carrierType", "rda:contentType", "rda:mediaType", "rda:modeOfIssuance",

    # ---- VoID (Vocabulary of Interlinked Datasets) ----
    "void:sparqlEndpoint", "void:vocabulary", "void:dataDump", "void:exampleResource",
    "void:uriSpace", "void:linkPredicate", "void:triples", "void:entities",
}
|
|
|
|
|
|
def extract_predicates_from_slot(slot_file: Path) -> dict:
    """Collect the slot_uri and mapping lists of every slot in a YAML file.

    Args:
        slot_file: Path to a YAML file expected to hold a top-level ``slots``
            mapping of slot name -> slot definition.

    Returns:
        On success, a dict keyed by slot name. Each value is a dict with the
        keys ``slot_uri``, ``exact_mappings``, ``close_mappings``,
        ``related_mappings``, ``narrow_mappings`` and ``broad_mappings``.
        ``slot_uri`` is ``None`` when absent; absent or null mappings are
        returned as ``[]``.

        On failure, ``{"error": <message>}``: either the text of the exception
        raised while opening/parsing the file, or ``"No slots found"`` when the
        document has no usable ``slots`` mapping. This function never raises.
    """
    mapping_keys = (
        "exact_mappings",
        "close_mappings",
        "related_mappings",
        "narrow_mappings",
        "broad_mappings",
    )

    try:
        # Read as UTF-8 explicitly so results do not depend on the platform's
        # default locale encoding.
        with open(slot_file, 'r', encoding='utf-8') as f:
            content = yaml.safe_load(f)
    except Exception as e:
        # Deliberate best-effort boundary: callers check for the "error" key
        # instead of handling exceptions.
        return {"error": str(e)}

    # The YAML root may be empty, a list or a scalar, and ``slots:`` may be
    # present but null; none of these can be iterated as slot definitions.
    slots = content.get('slots') if isinstance(content, dict) else None
    if not isinstance(slots, dict):
        return {"error": "No slots found"}

    predicates = {}
    for slot_name, slot_def in slots.items():
        # A slot declared with an empty body ("my_slot:") loads as None.
        if not isinstance(slot_def, dict):
            slot_def = {}

        entry = {"slot_uri": slot_def.get('slot_uri')}
        for key in mapping_keys:
            # "key: null" yields None from .get(); normalise it to a list.
            entry[key] = slot_def.get(key) or []
        predicates[slot_name] = entry

    return predicates
|
|
|
|
|
|
def validate_predicate(predicate: str) -> tuple:
    """Check one predicate against the VALID_PREDICATES allow-list.

    Args:
        predicate: A value taken from a slot's ``slot_uri`` or from one of its
            mapping lists. ``None`` and non-string values are tolerated.

    Returns:
        A ``(is_valid, detail)`` tuple:

        * ``(True, None)`` - the predicate is listed in VALID_PREDICATES.
        * ``(True, "custom")`` - not listed, but starts with ``hc:``.
        * ``(False, "None")`` - the predicate is ``None``.
        * ``(False, "Unknown predicate: <value>")`` - anything else.
    """
    if predicate is None:
        return False, "None"

    # YAML can produce numbers, lists or mappings where a CURIE string was
    # intended. Report those as unknown instead of raising: unhashable values
    # would fail the set lookup and numbers have no startswith().
    if not isinstance(predicate, str):
        return False, f"Unknown predicate: {predicate}"

    if predicate in VALID_PREDICATES:
        return True, None

    # Check if it's a custom HC predicate (allowed)
    if predicate.startswith("hc:"):
        return True, "custom"

    return False, f"Unknown predicate: {predicate}"
|
|
|
|
|
|
def _as_predicate_list(value) -> list:
    """Return a raw mapping value as a list of predicates."""
    if not value:
        return []
    # "exact_mappings: schema:name" loads as a bare string; wrap it so it is
    # checked as one predicate rather than character by character.
    if isinstance(value, str):
        return [value]
    return list(value)


def main():
    """Validate every slot file in a directory and print a summary.

    Command-line options:
        --slots-dir   Directory scanned (non-recursively) for ``*.yaml`` files.
        --verbose/-v  Also print each predicate that passed validation.

    For each slot, the ``slot_uri`` (when present) and every entry of the five
    mapping lists is passed to ``validate_predicate``. Files for which
    ``extract_predicates_from_slot`` reports an error are skipped with a
    warning on stderr and do not affect the exit status.

    Returns:
        0 when no invalid predicate was found, 1 when at least one was found
        or the slots directory does not exist.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Validate slot mappings against ontology predicates")
    parser.add_argument("--slots-dir", default="schemas/20251121/linkml/modules/slots",
                        help="Path to slots directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show all predicates")
    args = parser.parse_args()

    slots_dir = Path(args.slots_dir)
    if not slots_dir.exists():
        print(f"Slots directory not found: {slots_dir}")
        return 1

    mapping_types = ("exact_mappings", "close_mappings", "related_mappings",
                     "narrow_mappings", "broad_mappings")

    total_valid = 0
    total_invalid = 0
    invalid_predicates = []

    for slot_file in sorted(slots_dir.glob("*.yaml")):
        predicates = extract_predicates_from_slot(slot_file)

        if "error" in predicates:
            # Make skipped files visible instead of dropping them silently.
            print(f"Skipping {slot_file.name}: {predicates['error']}", file=sys.stderr)
            continue

        for mappings in predicates.values():
            # Gather (field, predicate) pairs to check for this slot.
            candidates = []

            # A slot without a slot_uri has nothing to validate there; it must
            # not be counted as a valid predicate.
            if mappings["slot_uri"] is not None:
                candidates.append(("slot_uri", mappings["slot_uri"]))

            for mapping_type in mapping_types:
                for pred in _as_predicate_list(mappings.get(mapping_type)):
                    candidates.append((mapping_type, pred))

            for field, pred in candidates:
                valid, detail = validate_predicate(pred)
                if valid:
                    total_valid += 1
                    if args.verbose:
                        suffix = " (custom)" if detail == "custom" else ""
                        print(f"  OK {slot_file.name}: {field} = {pred}{suffix}")
                else:
                    total_invalid += 1
                    # Store the text form so de-duplication and sorting below
                    # cannot fail on unhashable or mixed-type values.
                    invalid_predicates.append((slot_file.name, field, str(pred)))

    print("Validation Results:")
    print(f"  Valid predicates: {total_valid}")
    print(f"  Invalid predicates: {total_invalid}")
    print()

    if invalid_predicates:
        print("Invalid predicates found:")
        for filename, mapping_type, pred in sorted(set(invalid_predicates)):
            print(f"  {filename}: {mapping_type} = {pred}")

    return 0 if total_invalid == 0 else 1
|
|
|
|
|
|
if __name__ == "__main__":
    # The bare exit() helper is injected by the site module and is missing
    # under "python -S" and in some frozen/embedded interpreters. Raising
    # SystemExit propagates main()'s return code without relying on it.
    raise SystemExit(main())
|