#!/usr/bin/env python3
"""
Validate slot mappings against actual ontology predicates.

This script checks each slot's mappings against the predicates actually
defined in the ontology files at data/ontology/.
"""

import argparse
import os
import re
import sys
from pathlib import Path
from collections import defaultdict

try:
    import yaml
except ImportError:  # PyYAML is third-party; main() reports its absence cleanly.
    yaml = None

# Known predicates from ontology files (extracted from data/ontology/)
VALID_PREDICATES = {
    # Schema.org (verified from schemaorg.owl)
    "schema:about", "schema:abstract", "schema:acceptedPaymentMethod",
    "schema:accessibilityFeature", "schema:accessibilityHazard",
    "schema:accessibilitySummary", "schema:accessMode",
    "schema:accessModeSufficient", "schema:acquiredFrom",
    "schema:additionalProperty", "schema:additionalType", "schema:address",
    "schema:addressLocality", "schema:addressRegion", "schema:affiliation",
    "schema:age", "schema:aggregateRating", "schema:alternateName",
    "schema:alternativeHeadline", "schema:alumniOf", "schema:amenityFeature",
    "schema:applicationDeadline", "schema:areaServed", "schema:archivedAt",
    "schema:attendee", "schema:audience", "schema:author",
    "schema:availabilityStarts", "schema:availabilityEnds", "schema:award",
    "schema:birthDate", "schema:birthPlace", "schema:businessFunction",
    "schema:collection", "schema:commentCount", "schema:conditionsOfAccess",
    "schema:contactPoint", "schema:containsPlace", "schema:contributor",
    "schema:creator", "schema:dateCreated", "schema:dateModified",
    "schema:datePublished", "schema:deathDate", "schema:deathPlace",
    "schema:description", "schema:documentation", "schema:duration",
    "schema:email", "schema:employee", "schema:encodingFormat",
    "schema:endDate", "schema:event", "schema:eventStatus",
    "schema:faxNumber", "schema:familyName", "schema:foundingDate",
    "schema:foundingLocation", "schema:funder", "schema:funding",
    "schema:geo", "schema:givenName", "schema:hasCourse",
    "schema:hasCourseInstance", "schema:hasCredential",
    "schema:hasOfferCatalog", "schema:hasPart", "schema:holdingArchive",
    "schema:identifier", "schema:image", "schema:inLanguage",
    "schema:includedInDataCatalog", "schema:isAccessibleForFree",
    "schema:isPartOf", "schema:isRelatedTo", "schema:issuedBy",
    "schema:itemListElement", "schema:knowsAbout", "schema:knowsLanguage",
    "schema:latitude", "schema:legalName", "schema:location", "schema:logo",
    "schema:longitude", "schema:mainEntityOfPage", "schema:makesOffer",
    "schema:maximumAttendeeCapacity", "schema:member", "schema:memberOf",
    "schema:name", "schema:numberOfEmployees", "schema:numberOfItems",
    "schema:offers", "schema:openingHours", "schema:parentOrganization",
    "schema:paymentAccepted", "schema:performer", "schema:photo",
    "schema:postalCode", "schema:potentialAction", "schema:price",
    "schema:priceRange", "schema:publicAccess", "schema:publishingPrinciples",
    "schema:ratingValue", "schema:recognizedBy", "schema:roleName",
    "schema:reservationRequired", "schema:review", "schema:sameAs",
    "schema:seller", "schema:serviceType", "schema:size",
    "schema:softwareApplication", "schema:sponsor", "schema:startDate",
    "schema:streetAddress", "schema:subjectOf", "schema:subtitleLanguage",
    "schema:telephone", "schema:text", "schema:url", "schema:value",
    "schema:version", "schema:videoFrameSize",
    # Dublin Core Terms (verified from dublin_core_elements.rdf and usage)
    "dcterms:abstract", "dcterms:accessRights", "dcterms:accrualPeriodicity",
    "dcterms:audience", "dcterms:conformsTo", "dcterms:contributor",
    "dcterms:coverage", "dcterms:creator", "dcterms:date",
    "dcterms:dateAccepted", "dcterms:dateSubmitted", "dcterms:description",
    "dcterms:extent", "dcterms:format", "dcterms:hasPart",
    "dcterms:hasVersion", "dcterms:identifier", "dcterms:isPartOf",
    "dcterms:isReferencedBy", "dcterms:isReplacedBy", "dcterms:issued",
    "dcterms:language", "dcterms:license", "dcterms:mediator",
    "dcterms:medium", "dcterms:modified", "dcterms:provenance",
    "dcterms:publisher", "dcterms:references", "dcterms:relation",
    "dcterms:replaces", "dcterms:rights", "dcterms:rightsHolder",
    "dcterms:source", "dcterms:spatial", "dcterms:subject",
    "dcterms:tableOfContents", "dcterms:temporal", "dcterms:title",
    "dcterms:type", "dcterms:valid",
    # RiC-O (verified from RiC-O_1-1.rdf)
    "rico:accrualsStatus", "rico:accumulationDate", "rico:affectsOrAffected",
    "rico:authenticityNote", "rico:conditionsOfAccess",
    "rico:conditionsOfUse", "rico:containsOrContained", "rico:date",
    "rico:describesOrDescribed", "rico:generalDescription",
    "rico:hasAccumulationDate", "rico:hasBeginningDate", "rico:hasEndDate",
    "rico:hasOrHadAgentName", "rico:hasOrHadAllMembersWithContentType",
    "rico:hasOrHadAppellation", "rico:hasOrHadComponent",
    "rico:hasOrHadConstituent", "rico:hasOrHadController",
    "rico:hasOrHadCoordinates", "rico:hasOrHadHolder",
    "rico:hasOrHadIdentifier", "rico:hasOrHadLanguage",
    "rico:hasOrHadLegalStatus", "rico:hasOrHadLocation",
    "rico:hasOrHadMainSubject", "rico:hasOrHadManager",
    "rico:hasOrHadMember", "rico:hasOrHadName", "rico:hasOrHadOwner",
    "rico:hasOrHadPart", "rico:hasOrHadPhysicalLocation",
    "rico:hasOrHadPosition", "rico:hasOrHadSubdivision",
    "rico:hasOrHadSubject", "rico:hasOrHadSubordinate", "rico:hasOrHadType",
    "rico:hasRecordSetType", "rico:hasRecordState", "rico:history",
    "rico:identifier", "rico:includesOrIncluded", "rico:isOrWasAffectedBy",
    "rico:isOrWasComponentOf", "rico:isOrWasConstituentOf",
    "rico:isOrWasDescribedBy", "rico:isOrWasHolderOf",
    "rico:isOrWasIncludedIn", "rico:isOrWasLocationOf",
    "rico:isOrWasMemberOf", "rico:isOrWasPartOf",
    "rico:isOrWasSubdivisionOf", "rico:isOrWasSubjectOf",
    "rico:isOrWasSubordinateTo", "rico:isRelatedTo",
    "rico:isTriggeredByEvent", "rico:name", "rico:note",
    "rico:scopeAndContent", "rico:title", "rico:type",
    # PROV-O (verified from prov-o.ttl)
    "prov:actedOnBehalfOf", "prov:activity", "prov:agent", "prov:atLocation",
    "prov:atTime", "prov:endedAtTime", "prov:entity", "prov:generated",
    "prov:generatedAtTime", "prov:hadPlan", "prov:hadPrimarySource",
    "prov:hadReason", "prov:hadRole", "prov:influenced",
    "prov:invalidatedAtTime", "prov:qualifiedAssociation",
    "prov:qualifiedAttribution", "prov:qualifiedDerivation",
    "prov:qualifiedGeneration", "prov:qualifiedInfluence",
    "prov:startedAtTime", "prov:used", "prov:value",
    "prov:wasAssociatedWith", "prov:wasAttributedTo", "prov:wasDerivedFrom",
    "prov:wasGeneratedBy", "prov:wasInfluencedBy", "prov:wasInvalidatedBy",
    "prov:wasRevisionOf",
    # SKOS (verified from skos.rdf)
    "skos:altLabel", "skos:broader", "skos:broaderTransitive",
    "skos:broadMatch", "skos:closeMatch", "skos:definition",
    "skos:exactMatch", "skos:example", "skos:hiddenLabel", "skos:narrower",
    "skos:narrowerTransitive", "skos:narrowMatch", "skos:notation",
    "skos:note", "skos:prefLabel", "skos:related", "skos:relatedMatch",
    "skos:scopeNote",
    # FOAF (verified from foaf.ttl)
    "foaf:account", "foaf:accountName", "foaf:age", "foaf:based_near",
    "foaf:birthday", "foaf:depiction", "foaf:familyName", "foaf:firstName",
    "foaf:gender", "foaf:givenName", "foaf:homepage", "foaf:img",
    "foaf:interest", "foaf:isPrimaryTopicOf", "foaf:knows", "foaf:lastName",
    "foaf:logo", "foaf:made", "foaf:maker", "foaf:mbox", "foaf:member",
    "foaf:name", "foaf:nick", "foaf:page", "foaf:phone",
    "foaf:primaryTopic", "foaf:publications", "foaf:surname", "foaf:title",
    "foaf:topic", "foaf:weblog", "foaf:workplaceHomepage",
    # ORG (verified from org.rdf)
    "org:changedBy", "org:classification", "org:hasMembership",
    "org:hasSite", "org:hasSubOrganization", "org:hasUnit", "org:headOf",
    "org:identifier", "org:linkedTo", "org:member", "org:memberOf",
    "org:organization", "org:originalOrganization", "org:purpose",
    "org:reportsTo", "org:resultedFrom", "org:resultingOrganization",
    "org:role", "org:siteOf", "org:subOrganizationOf", "org:unitOf",
    # DCAT (verified from dcat3.ttl)
    "dcat:accessService", "dcat:accessURL", "dcat:catalog",
    "dcat:contactPoint", "dcat:dataset", "dcat:distribution",
    "dcat:downloadURL", "dcat:endDate", "dcat:endpointDescription",
    "dcat:endpointURL", "dcat:hasCurrentVersion", "dcat:hasVersion",
    "dcat:inCatalog", "dcat:keyword", "dcat:landingPage", "dcat:mediaType",
    "dcat:qualifiedRelation", "dcat:startDate", "dcat:theme", "dcat:version",
    # CIDOC-CRM (verified from CIDOC_CRM_v7.1.3.rdf - using common predicates)
    "crm:P1_is_identified_by", "crm:P2_has_type", "crm:P3_has_note",
    "crm:P4_has_time-span", "crm:P7_took_place_at",
    "crm:P12_occurred_in_the_presence_of", "crm:P14_carried_out_by",
    "crm:P14.1_in_the_role_of", "crm:P16_used_specific_object",
    "crm:P29_custody_received_by", "crm:P31i_was_modified_by",
    "crm:P43_has_dimension", "crm:P44_has_condition",
    "crm:P46_is_composed_of", "crm:P46i_forms_part_of",
    "crm:P48_has_preferred_identifier", "crm:P50_has_current_keeper",
    "crm:P52_has_current_owner", "crm:P81b_begin_of_the_end",
    "crm:P82a_begin_of_the_begin", "crm:P98i_was_born", "crm:P128_carries",
    "crm:P141_assigned",
    # EDM (verified from edm.owl)
    "edm:aggregatedCHO", "edm:begin", "edm:collectionName", "edm:end",
    "edm:happenedAt", "edm:hasMet", "edm:hasView", "edm:isNextInSequence",
    "edm:isRelatedTo", "edm:isShownAt", "edm:isShownBy", "edm:isSimilarTo",
    "edm:occurredAt", "edm:rights", "edm:wasPresentAt",
    # ORE (verified from ore.rdf)
    "ore:aggregates", "ore:describes", "ore:isAggregatedBy", "ore:proxyFor",
    "ore:proxyIn",
    # GLEIF (verified from gleif_base.ttl)
    "gleif-base:hasAbbreviation", "gleif-base:hasAbbreviationLocal",
    "gleif-base:hasAbbreviationTransliterated", "gleif-base:hasLegalName",
    "gleif-base:hasLegalNameLocal", "gleif-base:hasLegalNameTransliterated",
    # GeoNames (verified from geonames_ontology.rdf)
    "gn:alternateName", "gn:countryCode", "gn:featureClass",
    "gn:featureCode", "gn:geonamesID", "gn:lat", "gn:locatedIn",
    "gn:locationMap", "gn:long", "gn:name", "gn:nearby", "gn:officialName",
    "gn:parentCountry", "gn:parentFeature", "gn:population", "gn:postalCode",
    "gn:shortName", "gn:wikipediaArticle",
    # GeoSPARQL (commonly used)
    "geo:alt", "geo:asWKT", "geo:hasGeometry", "geo:lat", "geo:long",
    "geosparql:hasBoundingBox", "geosparql:hasGeometry", "geosparql:asWKT",
    # WGS84 (commonly used)
    "wgs84:alt", "wgs84:lat", "wgs84:long",
    # RDFS (standard)
    "rdfs:comment", "rdfs:label", "rdfs:seeAlso",
    # RDF (standard)
    "rdf:type", "rdf:value",
    # PREMIS (verified from premis3.owl)
    "premis:hasRightsStatement", "premis:policy", "premis:storedAt",
    # BIBFRAME (verified from bibframe.rdf)
    "bf:acquisitionSource", "bf:arrangement", "bf:binding",
    "bf:classification", "bf:code", "bf:contribution", "bf:creationDate",
    "bf:custodialHistory", "bf:shelfMark", "bf:identifiedBy", "bf:title",
    "bf:titleOf",
    # DBpedia (commonly used)
    "dbp:abbreviation", "dbp:architecturalStyle", "dbp:programCost",
    # GoodRelations (commonly used)
    "gr:acceptedPaymentMethods", "gr:eligibleCustomerTypes",
    "gr:hasPriceSpecification",
    # Web Annotation (OA)
    "oa:annotatedBy", "oa:hasBody", "oa:hasSelector", "oa:hasTarget",
    "oa:motivatedBy",
    # Darwin Core (dwc)
    "dwc:associatedTaxa", "dwc:dateIdentified", "dwc:eventDate",
    "dwc:fieldNumber", "dwc:locality", "dwc:recordedBy",
    "dwc:scientificName", "dwc:verbatimLocality", "dwc:vernacularName",
    "dwc:individualCount", "dwc:typeStatus", "dwc:taxonRank",
    "dwc:taxonRemarks", "dwc:scientificNameAuthorship",
    # LOCN (ISA Core Location)
    "locn:address", "locn:geometry", "locn:postCode", "locn:postName",
    "locn:thoroughfare",
    # vCard
    "vcard:country-name", "vcard:email", "vcard:hasEmail",
    "vcard:hasTelephone", "vcard:locality", "vcard:organization-name",
    "vcard:postal-code", "vcard:region", "vcard:street-address", "vcard:tel",
    # PiCo (Person in Context)
    "pico:hasAffiliation", "pico:observedName",
    # TOOI (Dutch government)
    "tooi:onderwerp",
    # LCC (Language codes)
    "lcc-lr:hasTag",
    # PAV (Provenance)
    "pav:version",
    # Hydra
    "hydra:entrypoint",
    # Custom HC predicates (allowed for domain-specific concepts)
    "hc:acceptsOrAcceptedExternalWork", "hc:acceptsOrAcceptedVisitingScholar",
    "hc:hasAirChangesPerHour", "hc:hasAllDataRealFlag", "hc:hasSearchScore",
    "hc:isApproximate",
    # Additional Schema.org predicates
    "schema:addressCountry", "schema:audienceType", "schema:contentUrl",
    "schema:director", "schema:dissolutionDate", "schema:educationalLevel",
    "schema:editor", "schema:eligibleRegion", "schema:elevation",
    "schema:eventSchedule", "schema:expires", "schema:floorSize",
    "schema:gender", "schema:genre", "schema:homeLocation",
    "schema:jobTitle", "schema:locationCreated", "schema:organizer",
    "schema:owns", "schema:position", "schema:priceCurrency",
    "schema:propertyID", "schema:requiredFeatures", "schema:scheduledTime",
    "schema:servesCuisine", "schema:subOrganization", "schema:teaches",
    "schema:validFrom", "schema:valuePattern", "schema:warning",
    "schema:workExample", "schema:workFeatured", "schema:availableOnDevice",
    "schema:citation",
    # Schema.org predicates (verified from schemaorg.owl - batch 4)
    "schema:actionOption", "schema:actor", "schema:amount",
    "schema:applicationCategory", "schema:availableService",
    "schema:eligibleCustomerType", "schema:featureList",
    "schema:includesObject", "schema:knows", "schema:numberOfPages",
    "schema:openingHoursSpecification", "schema:ownershipFundingInfo",
    "schema:proficiencyLevel", "schema:pronouns", "schema:provider",
    "schema:publisher", "schema:relatedLink", "schema:result",
    "schema:spatialCoverage", "schema:superEvent", "schema:temporalCoverage",
    "schema:title", "schema:validUntil", "schema:wordCount",
    "schema:workPerformed",
    # LDP (Linked Data Platform)
    "ldp:contains", "ldp:member", "ldp:memberSubject",
    "ldp:hasMemberRelation",
    # RDFS
    "rdfs:member",
    # ODRL (Open Digital Rights Language)
    "odrl:hasPolicy", "odrl:permission", "odrl:prohibition", "odrl:duty",
    "odrl:action", "odrl:assignee", "odrl:assigner", "odrl:constraint",
    # DCAT additional
    "dcat:servesDataset", "dcat:checksum",
    # BIBO (Bibliographic Ontology)
    "bibo:doi", "bibo:isbn", "bibo:issn", "bibo:edition", "bibo:volume",
    "bibo:pages", "bibo:abstract", "bibo:authorList", "bibo:editor",
    # PREMIS additional
    "premis:hasRepresentation", "premis:fixity",
    "premis:hasRelatedStatementInformation", "premis:hasIdentifier",
    "premis:hasEvent", "premis:hasAgent",
    # SPDX (Software Package Data Exchange)
    "spdx:checksumValue", "spdx:algorithm", "spdx:checksum",
    # GeoNames additional (using geonames: prefix)
    "geonames:featureClass", "geonames:featureCode",
    # EDM additional
    "edm:provider", "edm:dataProvider", "edm:object", "edm:preview",
    "edm:country",
    # PAV (Provenance, Authoring and Versioning)
    "pav:createdBy", "pav:authoredBy", "pav:contributedBy", "pav:curatedBy",
    "pav:createdOn", "pav:authoredOn", "pav:lastUpdateOn", "pav:retrievedOn",
    "pav:retrievedFrom",
    # ADMS (Asset Description Metadata Schema)
    "adms:status", "adms:identifier", "adms:sample", "adms:translation",
    # PNV (Person Name Vocabulary)
    "pnv:baseSurname", "pnv:givenName", "pnv:initials", "pnv:literalName",
    "pnv:prefix", "pnv:suffix", "pnv:patronym", "pnv:hasName", "pnv:surname",
    # PiCo additional
    "pico:hasObservation", "pico:hasName", "pico:observationDate",
    # CIDOC-CRM additional
    "crm:P11_had_participant", "crm:P12i_was_present_at",
    "crm:P23_transferred_title_from", "crm:P33_used_specific_technique",
    "crm:P62_depicts", "crm:P81a_end_of_the_begin", "crm:P82b_end_of_the_end",
    "crm:P1i_identifies", "crm:P48i_is_preferred_identifier_of",
    "crm:P147_curated", "crm:P147i_was_curated_by", "crm:P148_has_component",
    # RiC-O additional
    "rico:isDescribedBy", "rico:hasInstantiation", "rico:hasContentOfType",
    "rico:hasDateRange", "rico:hasOrHadAgent", "rico:hasOrHadActivityType",
    "rico:hasOrHadArrangement", "rico:hasAccessionNumber",
    # BIBFRAME additional
    "bf:extent", "bf:editionStatement", "bf:illustrationNote",
    # FRAPO (Funding, Research Administration and Projects Ontology)
    "frapo:hasFunding", "frapo:hasFundingProgram", "frapo:hasGrant",
    # Darwin Core additional
    "dwc:habitat", "dwc:higherClassification", "dwc:identificationQualifier",
    "dwc:occurrenceID",
    # SKOS additional
    "skos:inScheme", "skos:topConceptOf", "skos:hasTopConcept",
    "skos:member", "skos:memberList", "skos:changeNote",
    "skos:editorialNote", "skos:historyNote", "skos:mappingRelation",
    # DCTerms additional
    "dcterms:bibliographicCitation", "dcterms:requires", "dct:type",
    "dct:identifier",
    # ORG additional
    # NOTE(review): org:OrganizationalUnit looks like a class name rather
    # than a predicate - confirm it is intended here.
    "org:hasMember", "org:name", "org:OrganizationalUnit",
    # ROV (Registered Organization Vocabulary)
    "rov:orgType", "rov:legalName", "rov:orgStatus", "rov:orgActivity",
    "rov:registration", "rov:hasRegisteredOrganization",
    # PROV-O additional
    "prov:informed", "prov:alternateOf", "prov:hadDerivation",
    "prov:wasInformedBy", "prov:qualifiedRevision", "prov:wasUsedBy",
    # CPOV (Core Public Organisation Vocabulary)
    "cpov:purpose", "cpov:hasSubOrganization", "cpov:address",
    # TOOI additional
    "tooi:heeft_informatieobject", "tooi:naam", "tooi:begindatum",
    "tooi:einddatum",
    # GLEIF additional
    "gleif_base:hasCoverageArea", "gleif_base:hasLegalForm",
    # Additional Schema.org predicates (batch 2)
    "schema:agent", "schema:courseCode", "schema:department",
    "schema:educationalProgramMode", "schema:height", "schema:organization",
    "schema:participant", "schema:width",
    # SOSA (Sensor, Observation, Sample, and Actuator)
    "sosa:hosts", "sosa:hasResult", "sosa:observes", "sosa:madeObservation",
    "sosa:madeBySensor", "sosa:hasFeatureOfInterest", "sosa:isHostedBy",
    # GeoSPARQL additional
    "geosparql:hasSpatialResolution", "geosparql:hasCentroid",
    "geosparql:sfContains",
    # RDA (Resource Description and Access)
    "rda:carrierType", "rda:contentType", "rda:mediaType",
    "rda:modeOfIssuance",
    # Dublin Core (additional dcterms)
    "dcterms:created",
    # OWL
    "owl:sameAs", "owl:equivalentClass", "owl:equivalentProperty",
    # Schema.org (batch 3 - more predicates)
    "schema:isbn", "schema:keywords", "schema:category",
    "schema:educationalUse", "schema:validThrough", "schema:maintainer",
    "schema:usageInfo", "schema:approximateValue",
    "schema:applicationContact", "schema:legalForm", "schema:hasOccupation",
    "schema:artMedium", "schema:legislationIdentifier",
    "schema:eligibilityToWorkRequirement", "schema:organizationRole",
    "schema:softwareVersion", "schema:mainEntity", "schema:name",
    # PNV additional
    "pnv:nameSpecification", "pnv:nameComponent", "pnv:surnamePrefix",
    # GLEIF additional (gleif_base prefix - underscore variant used in some
    # slot files)
    "gleif_base:hasLegalJurisdiction", "gleif_base:isManagedBy",
    "gleif_base:isRegisteredIn", "gleif_base:hasWebsite",
    "gleif_base:hasNameTranslatedEnglish", "gleif_base:hasNameLegalLocal",
    "gleif_base:hasAbbreviation", "gleif_base:hasAbbreviationLocal",
    "gleif_base:hasAbbreviationTransliterated", "gleif_base:hasLegalName",
    "gleif_base:hasLegalNameLocal", "gleif_base:hasLegalNameTransliterated",
    # CIDOC-CRM additional (batch 3)
    "crm:P45_consists_of", "crm:P126_employed",
    "crm:P140_assigned_attribute_to", "crm:P16_used_specific_object",
    "crm:P138_represents",
    # PiCo additional (batch 2)
    "pico:hasReligion",
    # Dublin Core (additional)
    "dct:language",
    # BIBO additional
    "bibo:isbn13", "bibo:isbn10", "bibo:oclcnum", "bibo:lccn",
    # Darwin Core additional
    "dwc:lifeStage", "dwc:sex", "dwc:preparations", "dwc:recordNumber",
    # VoID (Vocabulary of Interlinked Datasets)
    "void:sparqlEndpoint", "void:vocabulary", "void:dataDump",
    "void:exampleResource", "void:uriSpace", "void:linkPredicate",
    "void:triples", "void:entities",
    # GLEIF additional (gleif: prefix)
    "gleif:hasLegalForm", "gleif:hasEntityStatus", "gleif:hasLegalAddress",
    # CIDOC-CRM additional (batch 2)
    "crm:P28_custody_surrendered_by", "crm:P30_transferred_custody_of",
    "crm:P30i_custody_transferred_through", "crm:P50i_is_current_keeper_of",
    "crm:P70_documents", "crm:P70i_is_documented_in",
    # CIDOC-CRM additional (batch 3 - verified from CIDOC_CRM_v7.1.3.rdf)
    # NOTE(review): P16 is listed elsewhere in this set as
    # P16_used_specific_object, and E6_Destruction looks like a class name
    # rather than a predicate - confirm both entries against the ontology.
    "crm:P11i_participated_in", "crm:P54_has_current_permanent_location",
    "crm:P33i_was_used_by", "crm:P51_has_former_or_current_owner",
    "crm:P32_used_general_technique", "crm:P24i_changed_ownership_through",
    "crm:P24_transferred_title_of",
    "crm:P109i_is_current_or_former_curator_of",
    "crm:P109_has_current_or_former_curator", "crm:P22_transferred_title_to",
    "crm:P16_used_specific_technique", "crm:P89_falls_within",
    "crm:E6_Destruction",
    # ORG additional (batch 2)
    "org:basedAt", "org:siteAddress",
    # RiC-O additional (batch 2)
    "rico:isManagerOf", "rico:isOrWasParticipantIn", "rico:directlyIncludes",
    # TOOI additional (batch 2)
    "tooi:organisatievorm", "tooi:rechtsvorm", "tooi:verantwoordelijke",
    "tooi:organisatieIdentificatie",
    # Wikidata predicates (direct properties and entity references)
    "wdt:P31", "wdt:P279", "wdt:P361", "wdt:P17", "wdt:P131", "wdt:P625",
    "wdt:P18", "wdt:P856", "wdt:P373", "wdt:P910", "wdt:P1566", "wdt:P213",
    "wdt:P227", "wdt:P214", "wdt:P244", "wdt:P791", "wdt:P576", "wdt:P571",
    "wdt:P159", "wdt:P170", "wdt:P127", "wdt:P749", "wikidata:P576",
    "wikidata:P31", "wikidata:P571", "wikidata:P159", "wd:P31", "wd:P279",
    "wd:P361", "wd:P576",
    # UNESCO ICH (Intangible Cultural Heritage vocabulary - external reference)
    "ich:safeguardedBy", "ich:hasSafeguardingMeasure", "ich:hasBearer",
    "ich:hasDomain",
}

# Mapping fields of a slot definition that are checked in addition to slot_uri.
MAPPING_TYPES = (
    "exact_mappings",
    "close_mappings",
    "related_mappings",
    "narrow_mappings",
    "broad_mappings",
)


def _as_list(value) -> list:
    """Normalise a YAML mapping value (null, scalar or sequence) to a list."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare scalar such as `exact_mappings: schema:name` is a single entry;
    # iterating it directly would walk it character by character.
    return [value]


def extract_predicates_from_slot(slot_file: Path) -> dict:
    """Extract all predicates from a slot file.

    Args:
        slot_file: Path to a YAML file with a top-level ``slots`` mapping.

    Returns:
        A dict keyed by slot name. Each value is a dict holding ``slot_uri``
        (or None when absent) and one list per entry of ``MAPPING_TYPES``.
        On failure, ``{"error": "<message>"}`` is returned instead.
    """
    if yaml is None:
        return {"error": "PyYAML is not installed"}

    try:
        with open(slot_file, "r", encoding="utf-8") as f:
            content = yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: any read or parse failure is reported to the
        # caller as an error entry rather than aborting the whole run.
        return {"error": str(e)}

    if not isinstance(content, dict) or "slots" not in content:
        return {"error": "No slots found"}

    slots = content["slots"] or {}  # `slots:` with a null value means "none"
    if not isinstance(slots, dict):
        return {"error": "No slots found"}

    predicates = {}
    for slot_name, slot_def in slots.items():
        # A slot declared without a body (`my_slot:`) loads as None.
        if not isinstance(slot_def, dict):
            slot_def = {}
        entry = {"slot_uri": slot_def.get("slot_uri")}
        for mapping_type in MAPPING_TYPES:
            entry[mapping_type] = _as_list(slot_def.get(mapping_type))
        predicates[slot_name] = entry

    return predicates


def validate_predicate(predicate: str) -> tuple:
    """Validate a predicate against known valid predicates.

    Args:
        predicate: The CURIE to check; None and non-string values are
            accepted and reported as invalid.

    Returns:
        A ``(is_valid, detail)`` tuple: ``(True, None)`` for a known
        predicate, ``(True, "custom")`` for an unlisted ``hc:`` predicate,
        ``(False, "None")`` for None, and ``(False, <message>)`` otherwise.
    """
    if predicate is None:
        return False, "None"

    # YAML can yield numbers, dicts or lists here; those can never be a
    # predicate and must not reach the set lookup or str methods.
    if not isinstance(predicate, str):
        return False, f"Not a string: {predicate!r}"

    if predicate in VALID_PREDICATES:
        return True, None

    # Check if it's a custom HC predicate (allowed)
    if predicate.startswith("hc:"):
        return True, "custom"

    return False, f"Unknown predicate: {predicate}"


def main(argv=None):
    """Validate every ``*.yaml`` slot file in the slots directory.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]`` when None.

    Returns:
        0 when no invalid predicate was found; 1 when at least one invalid
        predicate was found, the slots directory is missing, or PyYAML is
        not installed.
    """
    parser = argparse.ArgumentParser(description="Validate slot mappings against ontology predicates")
    parser.add_argument("--slots-dir", default="schemas/20251121/linkml/modules/slots",
                        help="Path to slots directory")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show all predicates")
    args = parser.parse_args(argv)

    slots_dir = Path(args.slots_dir)
    if not slots_dir.is_dir():
        print(f"Slots directory not found: {slots_dir}")
        return 1

    if yaml is None:
        print("PyYAML is required to read slot files", file=sys.stderr)
        return 1

    total_valid = 0
    total_invalid = 0
    invalid_predicates = []
    skipped_files = []

    for slot_file in sorted(slots_dir.glob("*.yaml")):
        predicates = extract_predicates_from_slot(slot_file)

        # Slot entries are dicts, so a string under "error" is the failure
        # sentinel and cannot be confused with a slot that is named "error".
        failure = predicates.get("error")
        if isinstance(failure, str):
            skipped_files.append((slot_file.name, failure))
            continue

        for slot_name, mappings in predicates.items():
            checks = []

            # Check slot_uri. A missing slot_uri is neither valid nor
            # invalid, so it is left out of both totals.
            if mappings["slot_uri"] is not None:
                checks.append(("slot_uri", mappings["slot_uri"]))

            # Check all mapping types
            for mapping_type in MAPPING_TYPES:
                for pred in mappings.get(mapping_type) or []:
                    checks.append((mapping_type, pred))

            for mapping_type, pred in checks:
                valid, _detail = validate_predicate(pred)
                if valid:
                    total_valid += 1
                else:
                    invalid_predicates.append((slot_file.name, mapping_type, pred))
                    total_invalid += 1
                if args.verbose:
                    status = "ok" if valid else "INVALID"
                    print(f"[{status}] {slot_file.name}: {slot_name}.{mapping_type} = {pred}")

    print(f"Validation Results:")
    print(f" Valid predicates: {total_valid}")
    print(f" Invalid predicates: {total_invalid}")
    print()

    if invalid_predicates:
        print("Invalid predicates found:")
        # Compare by text so None or unhashable entries cannot break
        # de-duplication or sorting.
        unique = {tuple(str(part) for part in item) for item in invalid_predicates}
        for filename, mapping_type, pred in sorted(unique):
            print(f" {filename}: {mapping_type} = {pred}")

    if skipped_files:
        # Surface files that were not validated so a parse error cannot
        # pass unnoticed; the exit code is intentionally left unchanged.
        print(f"Skipped {len(skipped_files)} file(s):", file=sys.stderr)
        for filename, reason in skipped_files:
            print(f" {filename}: {reason}", file=sys.stderr)

    return 0 if total_invalid == 0 else 1


if __name__ == "__main__":
    sys.exit(main())