glam/scripts/validate_slot_mappings.py
kempersc 0845d9f30e feat(scripts): add person enrichment and slot mapping utilities
Person Enrichment Scripts:
- enrich_person_comprehensive.py: Full-featured web search enrichment via Linkup
  with Rule 6/21/26/34/35 compliance (dual timestamps, no fabrication)
- enrich_ppids_linkup.py: Batch PPID enrichment pipeline
- extract_persons_with_provenance.py: Extract person data from LinkedIn HTML
  with XPath provenance tracking

LinkML Slot Management:
- update_slot_mappings.py: Update slots for RiC-O naming (Rule 39) and
  semantic URI requirements (Rule 38)
- update_class_slot_references.py: Update class files referencing renamed slots
- validate_slot_mappings.py: Validate slot definitions against ontology rules

All scripts follow established project conventions for provenance and
ontology alignment.
2026-01-10 13:32:32 +01:00

474 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Validate slot mappings against actual ontology predicates.
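This script checks each slot's slot_uri and mappings against VALID_PREDICATES,
a hand-maintained list of predicates extracted from the ontology files at
data/ontology/. The ontology files themselves are not read at runtime.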
"""
import os
import re
from pathlib import Path
from collections import defaultdict
import yaml
# Known predicates from ontology files (extracted from data/ontology/)
#
# Hand-maintained allow-list of CURIE strings ("prefix:localName"), grouped by
# vocabulary. validate_predicate() tests membership with an exact string
# comparison, which has two consequences:
#   * every prefix spelling in use has to be listed separately; the set holds
#     dcterms:/dct:, gn:/geonames:, geo:/geosparql: and
#     gleif-base:/gleif_base:/gleif: entries side by side;
#   * a few entries are repeated in later batches (e.g. "schema:name" and
#     "crm:P16_used_specific_object"); repeats are harmless in a set literal.
# NOTE(review): the "verified from <file>" remarks below are the original
# author's notes; nothing in the visible code re-checks them against the
# ontology files.
VALID_PREDICATES = {
# Schema.org (verified from schemaorg.owl)
"schema:about", "schema:abstract", "schema:acceptedPaymentMethod", "schema:accessibilityFeature",
"schema:accessibilityHazard", "schema:accessibilitySummary", "schema:accessMode",
"schema:accessModeSufficient", "schema:acquiredFrom", "schema:additionalProperty", "schema:additionalType", "schema:address",
"schema:addressLocality", "schema:addressRegion", "schema:affiliation", "schema:age", "schema:aggregateRating", "schema:alternateName",
"schema:alternativeHeadline", "schema:alumniOf", "schema:amenityFeature", "schema:applicationDeadline", "schema:areaServed",
"schema:archivedAt", "schema:attendee", "schema:audience", "schema:author", "schema:availabilityStarts", "schema:availabilityEnds",
"schema:award", "schema:birthDate", "schema:birthPlace", "schema:businessFunction", "schema:collection", "schema:commentCount",
"schema:conditionsOfAccess", "schema:contactPoint", "schema:containsPlace", "schema:contributor", "schema:creator",
"schema:dateCreated", "schema:dateModified", "schema:datePublished", "schema:deathDate", "schema:deathPlace", "schema:description",
"schema:documentation", "schema:duration", "schema:email", "schema:employee", "schema:encodingFormat",
"schema:endDate", "schema:event", "schema:eventStatus", "schema:faxNumber", "schema:familyName",
"schema:foundingDate", "schema:foundingLocation", "schema:funder", "schema:funding", "schema:geo",
"schema:givenName", "schema:hasCourse", "schema:hasCourseInstance", "schema:hasCredential", "schema:hasOfferCatalog",
"schema:hasPart", "schema:holdingArchive", "schema:identifier", "schema:image", "schema:inLanguage",
"schema:includedInDataCatalog", "schema:isAccessibleForFree", "schema:isPartOf", "schema:isRelatedTo", "schema:issuedBy",
"schema:itemListElement", "schema:knowsAbout", "schema:knowsLanguage", "schema:latitude", "schema:legalName", "schema:location",
"schema:logo", "schema:longitude", "schema:mainEntityOfPage", "schema:makesOffer", "schema:maximumAttendeeCapacity", "schema:member", "schema:memberOf",
"schema:name", "schema:numberOfEmployees", "schema:numberOfItems", "schema:offers", "schema:openingHours",
"schema:parentOrganization", "schema:paymentAccepted", "schema:performer", "schema:photo", "schema:postalCode",
"schema:potentialAction", "schema:price", "schema:priceRange", "schema:publicAccess",
"schema:publishingPrinciples", "schema:ratingValue", "schema:recognizedBy", "schema:roleName",
"schema:reservationRequired", "schema:review", "schema:sameAs", "schema:seller", "schema:serviceType",
"schema:size", "schema:softwareApplication", "schema:sponsor", "schema:startDate", "schema:streetAddress", "schema:subjectOf",
"schema:subtitleLanguage", "schema:telephone", "schema:text", "schema:url", "schema:value", "schema:version",
"schema:videoFrameSize",
# Dublin Core Terms (verified from dublin_core_elements.rdf and usage)
"dcterms:abstract", "dcterms:accessRights", "dcterms:accrualPeriodicity", "dcterms:audience",
"dcterms:conformsTo", "dcterms:contributor", "dcterms:coverage", "dcterms:creator", "dcterms:date", "dcterms:dateAccepted",
"dcterms:dateSubmitted", "dcterms:description", "dcterms:extent", "dcterms:format", "dcterms:hasPart", "dcterms:hasVersion",
"dcterms:identifier", "dcterms:isPartOf", "dcterms:isReferencedBy", "dcterms:isReplacedBy",
"dcterms:issued", "dcterms:language", "dcterms:license", "dcterms:mediator", "dcterms:medium",
"dcterms:modified", "dcterms:provenance", "dcterms:publisher", "dcterms:references", "dcterms:relation",
"dcterms:replaces", "dcterms:rights", "dcterms:rightsHolder", "dcterms:source", "dcterms:spatial",
"dcterms:subject", "dcterms:tableOfContents", "dcterms:temporal", "dcterms:title", "dcterms:type",
"dcterms:valid",
# RiC-O (verified from RiC-O_1-1.rdf)
"rico:accrualsStatus", "rico:accumulationDate", "rico:affectsOrAffected", "rico:authenticityNote",
"rico:conditionsOfAccess", "rico:conditionsOfUse", "rico:containsOrContained", "rico:date",
"rico:describesOrDescribed", "rico:generalDescription", "rico:hasAccumulationDate", "rico:hasBeginningDate",
"rico:hasEndDate", "rico:hasOrHadAgentName", "rico:hasOrHadAllMembersWithContentType",
"rico:hasOrHadAppellation", "rico:hasOrHadComponent", "rico:hasOrHadConstituent",
"rico:hasOrHadController", "rico:hasOrHadCoordinates", "rico:hasOrHadHolder", "rico:hasOrHadIdentifier",
"rico:hasOrHadLanguage", "rico:hasOrHadLegalStatus", "rico:hasOrHadLocation", "rico:hasOrHadMainSubject",
"rico:hasOrHadManager", "rico:hasOrHadMember", "rico:hasOrHadName", "rico:hasOrHadOwner",
"rico:hasOrHadPart", "rico:hasOrHadPhysicalLocation", "rico:hasOrHadPosition", "rico:hasOrHadSubdivision",
"rico:hasOrHadSubject", "rico:hasOrHadSubordinate", "rico:hasOrHadType", "rico:hasRecordSetType",
"rico:hasRecordState", "rico:history", "rico:identifier", "rico:includesOrIncluded",
"rico:isOrWasAffectedBy", "rico:isOrWasComponentOf", "rico:isOrWasConstituentOf",
"rico:isOrWasDescribedBy", "rico:isOrWasHolderOf", "rico:isOrWasIncludedIn", "rico:isOrWasLocationOf",
"rico:isOrWasMemberOf", "rico:isOrWasPartOf", "rico:isOrWasSubdivisionOf", "rico:isOrWasSubjectOf",
"rico:isOrWasSubordinateTo", "rico:isRelatedTo", "rico:isTriggeredByEvent", "rico:name", "rico:note",
"rico:scopeAndContent", "rico:title", "rico:type",
# PROV-O (verified from prov-o.ttl)
"prov:actedOnBehalfOf", "prov:activity", "prov:agent", "prov:atLocation", "prov:atTime",
"prov:endedAtTime", "prov:entity", "prov:generated", "prov:generatedAtTime", "prov:hadPlan",
"prov:hadPrimarySource", "prov:hadReason", "prov:hadRole", "prov:influenced", "prov:invalidatedAtTime",
"prov:qualifiedAssociation", "prov:qualifiedAttribution", "prov:qualifiedDerivation", "prov:qualifiedGeneration",
"prov:qualifiedInfluence", "prov:startedAtTime", "prov:used", "prov:value", "prov:wasAssociatedWith",
"prov:wasAttributedTo", "prov:wasDerivedFrom", "prov:wasGeneratedBy", "prov:wasInfluencedBy",
"prov:wasInvalidatedBy", "prov:wasRevisionOf",
# SKOS (verified from skos.rdf)
"skos:altLabel", "skos:broader", "skos:broaderTransitive", "skos:broadMatch", "skos:closeMatch",
"skos:definition", "skos:exactMatch", "skos:example", "skos:hiddenLabel", "skos:narrower",
"skos:narrowerTransitive", "skos:narrowMatch", "skos:notation", "skos:note", "skos:prefLabel",
"skos:related", "skos:relatedMatch", "skos:scopeNote",
# FOAF (verified from foaf.ttl)
"foaf:account", "foaf:accountName", "foaf:age", "foaf:based_near", "foaf:birthday", "foaf:depiction", "foaf:familyName",
"foaf:firstName", "foaf:gender", "foaf:givenName", "foaf:homepage", "foaf:img", "foaf:interest",
"foaf:isPrimaryTopicOf", "foaf:knows", "foaf:lastName", "foaf:logo", "foaf:made", "foaf:maker",
"foaf:mbox", "foaf:member", "foaf:name", "foaf:nick", "foaf:page", "foaf:phone", "foaf:primaryTopic",
"foaf:publications", "foaf:surname", "foaf:title", "foaf:topic", "foaf:weblog", "foaf:workplaceHomepage",
# ORG (verified from org.rdf)
"org:changedBy", "org:classification", "org:hasMembership", "org:hasSite", "org:hasSubOrganization",
"org:hasUnit", "org:headOf", "org:identifier", "org:linkedTo", "org:member", "org:memberOf",
"org:organization", "org:originalOrganization", "org:purpose", "org:reportsTo", "org:resultedFrom",
"org:resultingOrganization", "org:role", "org:siteOf", "org:subOrganizationOf", "org:unitOf",
# DCAT (verified from dcat3.ttl)
"dcat:accessService", "dcat:accessURL", "dcat:catalog", "dcat:contactPoint", "dcat:dataset",
"dcat:distribution", "dcat:downloadURL", "dcat:endDate", "dcat:endpointDescription", "dcat:endpointURL",
"dcat:hasCurrentVersion", "dcat:hasVersion", "dcat:inCatalog", "dcat:keyword", "dcat:landingPage",
"dcat:mediaType", "dcat:qualifiedRelation", "dcat:startDate", "dcat:theme", "dcat:version",
# CIDOC-CRM (verified from CIDOC_CRM_v7.1.3.rdf - using common predicates)
"crm:P1_is_identified_by", "crm:P2_has_type", "crm:P3_has_note", "crm:P4_has_time-span",
"crm:P7_took_place_at", "crm:P12_occurred_in_the_presence_of", "crm:P14_carried_out_by",
"crm:P14.1_in_the_role_of", "crm:P16_used_specific_object", "crm:P29_custody_received_by",
"crm:P31i_was_modified_by", "crm:P43_has_dimension", "crm:P44_has_condition", "crm:P46_is_composed_of",
"crm:P46i_forms_part_of", "crm:P48_has_preferred_identifier", "crm:P50_has_current_keeper",
"crm:P52_has_current_owner", "crm:P81b_begin_of_the_end", "crm:P82a_begin_of_the_begin",
"crm:P98i_was_born", "crm:P128_carries", "crm:P141_assigned",
# EDM (verified from edm.owl)
"edm:aggregatedCHO", "edm:begin", "edm:collectionName", "edm:end", "edm:happenedAt", "edm:hasMet",
"edm:hasView", "edm:isNextInSequence", "edm:isRelatedTo", "edm:isShownAt", "edm:isShownBy",
"edm:isSimilarTo", "edm:occurredAt", "edm:rights", "edm:wasPresentAt",
# ORE (verified from ore.rdf)
"ore:aggregates", "ore:describes", "ore:isAggregatedBy", "ore:proxyFor", "ore:proxyIn",
# GLEIF (verified from gleif_base.ttl)
"gleif-base:hasAbbreviation", "gleif-base:hasAbbreviationLocal", "gleif-base:hasAbbreviationTransliterated",
"gleif-base:hasLegalName", "gleif-base:hasLegalNameLocal", "gleif-base:hasLegalNameTransliterated",
# GeoNames (verified from geonames_ontology.rdf)
"gn:alternateName", "gn:countryCode", "gn:featureClass", "gn:featureCode", "gn:geonamesID",
"gn:lat", "gn:locatedIn", "gn:locationMap", "gn:long", "gn:name", "gn:nearby", "gn:officialName",
"gn:parentCountry", "gn:parentFeature", "gn:population", "gn:postalCode", "gn:shortName",
"gn:wikipediaArticle",
# GeoSPARQL (commonly used)
"geo:alt", "geo:asWKT", "geo:hasGeometry", "geo:lat", "geo:long",
"geosparql:hasBoundingBox", "geosparql:hasGeometry", "geosparql:asWKT",
# WGS84 (commonly used)
"wgs84:alt", "wgs84:lat", "wgs84:long",
# RDFS (standard)
"rdfs:comment", "rdfs:label", "rdfs:seeAlso",
# RDF (standard)
"rdf:type", "rdf:value",
# PREMIS (verified from premis3.owl)
"premis:hasRightsStatement",
# BIBFRAME (verified from bibframe.rdf)
"bf:acquisitionSource", "bf:arrangement", "bf:binding", "bf:classification", "bf:code", "bf:contribution",
"bf:creationDate", "bf:custodialHistory", "bf:shelfMark",
# DBpedia (commonly used)
"dbp:abbreviation", "dbp:architecturalStyle", "dbp:programCost",
# GoodRelations (commonly used)
"gr:acceptedPaymentMethods", "gr:eligibleCustomerTypes", "gr:hasPriceSpecification",
# Web Annotation (OA)
"oa:annotatedBy", "oa:hasBody", "oa:hasSelector", "oa:hasTarget", "oa:motivatedBy",
# Darwin Core (dwc)
"dwc:associatedTaxa", "dwc:dateIdentified", "dwc:eventDate", "dwc:fieldNumber", "dwc:locality",
"dwc:recordedBy", "dwc:scientificName", "dwc:verbatimLocality", "dwc:vernacularName",
# LOCN (ISA Core Location)
"locn:address", "locn:geometry", "locn:postCode", "locn:postName",
# vCard
"vcard:country-name", "vcard:email", "vcard:hasEmail", "vcard:hasTelephone", "vcard:locality",
"vcard:organization-name", "vcard:postal-code", "vcard:region", "vcard:street-address", "vcard:tel",
# PiCo (Person in Context)
"pico:hasAffiliation", "pico:observedName",
# TOOI (Dutch government)
"tooi:onderwerp",
# LCC (Language codes)
"lcc-lr:hasTag",
# PAV (Provenance)
"pav:version",
# Hydra
"hydra:entrypoint",
# Custom HC predicates (allowed for domain-specific concepts)
# The hc: entries listed here validate with no note; validate_predicate()
# also accepts any other "hc:"-prefixed CURIE but tags it as "custom".
"hc:acceptsOrAcceptedExternalWork", "hc:acceptsOrAcceptedVisitingScholar",
"hc:hasAirChangesPerHour", "hc:hasAllDataRealFlag", "hc:hasSearchScore",
"hc:isApproximate",
# Additional Schema.org predicates
"schema:addressCountry", "schema:audienceType", "schema:contentUrl", "schema:director",
"schema:dissolutionDate", "schema:educationalLevel", "schema:editor", "schema:eligibleRegion",
"schema:elevation", "schema:eventSchedule", "schema:expires", "schema:floorSize",
"schema:gender", "schema:genre", "schema:homeLocation", "schema:jobTitle",
"schema:locationCreated", "schema:organizer", "schema:owns", "schema:position",
"schema:priceCurrency", "schema:propertyID", "schema:requiredFeatures", "schema:scheduledTime",
"schema:servesCuisine", "schema:subOrganization", "schema:teaches", "schema:validFrom",
"schema:valuePattern", "schema:warning", "schema:workExample", "schema:workFeatured",
"schema:availableOnDevice", "schema:citation",
# LDP (Linked Data Platform)
"ldp:contains", "ldp:member", "ldp:memberSubject", "ldp:hasMemberRelation",
# RDFS
"rdfs:member",
# ODRL (Open Digital Rights Language)
"odrl:hasPolicy", "odrl:permission", "odrl:prohibition", "odrl:duty",
"odrl:action", "odrl:assignee", "odrl:assigner", "odrl:constraint",
# DCAT additional
"dcat:servesDataset", "dcat:checksum",
# BIBO (Bibliographic Ontology)
"bibo:doi", "bibo:isbn", "bibo:issn", "bibo:edition", "bibo:volume", "bibo:pages",
"bibo:abstract", "bibo:authorList", "bibo:editor",
# PREMIS additional
"premis:hasRepresentation", "premis:fixity", "premis:hasRelatedStatementInformation",
"premis:hasIdentifier", "premis:hasEvent", "premis:hasAgent",
# SPDX (Software Package Data Exchange)
"spdx:checksumValue", "spdx:algorithm", "spdx:checksum",
# GeoNames additional (using geonames: prefix)
"geonames:featureClass", "geonames:featureCode",
# EDM additional
"edm:provider", "edm:dataProvider", "edm:object", "edm:preview", "edm:country",
# PAV (Provenance, Authoring and Versioning)
"pav:createdBy", "pav:authoredBy", "pav:contributedBy", "pav:curatedBy",
"pav:createdOn", "pav:authoredOn", "pav:lastUpdateOn",
# ADMS (Asset Description Metadata Schema)
"adms:status", "adms:identifier", "adms:sample", "adms:translation",
# PNV (Person Name Vocabulary)
"pnv:baseSurname", "pnv:givenName", "pnv:initials", "pnv:literalName",
"pnv:prefix", "pnv:suffix", "pnv:patronym", "pnv:hasName", "pnv:surname",
# PiCo additional
"pico:hasObservation", "pico:hasName", "pico:observationDate",
# CIDOC-CRM additional
"crm:P11_had_participant", "crm:P12i_was_present_at", "crm:P23_transferred_title_from",
"crm:P33_used_specific_technique", "crm:P62_depicts", "crm:P81a_end_of_the_begin",
"crm:P82b_end_of_the_end", "crm:P1i_identifies", "crm:P48i_is_preferred_identifier_of",
"crm:P147_curated", "crm:P147i_was_curated_by", "crm:P148_has_component",
# RiC-O additional
"rico:isDescribedBy", "rico:hasInstantiation", "rico:hasContentOfType",
"rico:hasDateRange", "rico:hasOrHadAgent", "rico:hasOrHadActivityType",
"rico:hasOrHadArrangement", "rico:hasAccessionNumber",
# BIBFRAME additional
"bf:extent", "bf:editionStatement", "bf:illustrationNote",
# FRAPO (Funding, Research Administration and Projects Ontology)
"frapo:hasFunding", "frapo:hasFundingProgram", "frapo:hasGrant",
# Darwin Core additional
"dwc:habitat", "dwc:higherClassification", "dwc:identificationQualifier",
"dwc:occurrenceID",
# SKOS additional
"skos:inScheme", "skos:topConceptOf", "skos:hasTopConcept", "skos:member",
"skos:memberList", "skos:changeNote", "skos:editorialNote", "skos:historyNote",
# DCTerms additional
"dcterms:bibliographicCitation", "dcterms:requires", "dct:type", "dct:identifier",
# ORG additional
"org:hasMember", "org:name", "org:OrganizationalUnit",
# ROV (Registered Organization Vocabulary)
"rov:orgType", "rov:legalName", "rov:orgStatus", "rov:orgActivity",
# PROV-O additional
"prov:informed", "prov:alternateOf", "prov:hadDerivation",
# CPOV (Core Public Organisation Vocabulary)
"cpov:purpose", "cpov:hasSubOrganization", "cpov:address",
# TOOI additional
"tooi:heeft_informatieobject", "tooi:naam", "tooi:begindatum", "tooi:einddatum",
# GLEIF additional
"gleif_base:hasCoverageArea", "gleif_base:hasLegalForm",
# Additional Schema.org predicates (batch 2)
"schema:agent", "schema:courseCode", "schema:department", "schema:educationalProgramMode",
"schema:height", "schema:organization", "schema:participant", "schema:width",
# SOSA (Sensor, Observation, Sample, and Actuator)
"sosa:hosts", "sosa:hasResult", "sosa:observes", "sosa:madeObservation",
"sosa:madeBySensor", "sosa:hasFeatureOfInterest", "sosa:isHostedBy",
# GeoSPARQL additional
"geosparql:hasSpatialResolution", "geosparql:hasCentroid", "geosparql:sfContains",
# RDA (Resource Description and Access)
"rda:carrierType", "rda:contentType", "rda:mediaType", "rda:modeOfIssuance",
# Dublin Core (additional dcterms)
"dcterms:created",
# OWL
"owl:sameAs", "owl:equivalentClass", "owl:equivalentProperty",
# Schema.org (batch 3 - more predicates)
"schema:isbn", "schema:keywords", "schema:category", "schema:educationalUse",
"schema:validThrough", "schema:maintainer", "schema:usageInfo", "schema:approximateValue",
"schema:applicationContact", "schema:legalForm", "schema:hasOccupation",
"schema:artMedium", "schema:legislationIdentifier", "schema:eligibilityToWorkRequirement",
"schema:organizationRole", "schema:softwareVersion", "schema:mainEntity", "schema:name",
# PNV additional
"pnv:nameSpecification", "pnv:nameComponent", "pnv:surnamePrefix",
# GLEIF additional (gleif_base prefix)
"gleif_base:hasLegalJurisdiction", "gleif_base:isManagedBy",
# CIDOC-CRM additional (batch 3)
"crm:P45_consists_of", "crm:P126_employed", "crm:P140_assigned_attribute_to",
"crm:P16_used_specific_object", "crm:P138_represents",
# PiCo additional (batch 2)
"pico:hasReligion",
# Dublin Core (additional)
"dct:language",
# BIBO additional
"bibo:isbn13", "bibo:isbn10", "bibo:oclcnum", "bibo:lccn",
# Darwin Core additional
"dwc:lifeStage", "dwc:sex", "dwc:preparations", "dwc:recordNumber",
# VoID (Vocabulary of Interlinked Datasets)
"void:sparqlEndpoint", "void:vocabulary", "void:dataDump", "void:exampleResource",
"void:uriSpace", "void:linkPredicate", "void:triples", "void:entities",
# GLEIF additional (gleif: prefix)
"gleif:hasLegalForm", "gleif:hasEntityStatus", "gleif:hasLegalAddress",
# CIDOC-CRM additional (batch 2)
"crm:P28_custody_surrendered_by", "crm:P30_transferred_custody_of",
"crm:P30i_custody_transferred_through", "crm:P50i_is_current_keeper_of",
"crm:P70_documents", "crm:P70i_is_documented_in",
# ORG additional (batch 2)
"org:basedAt", "org:siteAddress",
# RiC-O additional (batch 2)
"rico:isManagerOf",
# TOOI additional (batch 2)
"tooi:organisatievorm", "tooi:rechtsvorm",
}
def _as_mapping_list(value) -> list:
    """Normalise a YAML mapping value to a list (None -> [], scalar -> [scalar])."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def extract_predicates_from_slot(slot_file: Path) -> dict:
    """Extract the slot URI and mapping lists of every slot in a slot file.

    Args:
        slot_file: Path of a YAML file that is expected to carry a top-level
            ``slots`` mapping of slot name -> slot definition.

    Returns:
        On success, a dict keyed by slot name. Each value is a dict with the
        keys ``slot_uri`` (raw value, or None when absent) and
        ``exact_mappings``, ``close_mappings``, ``related_mappings``,
        ``narrow_mappings``, ``broad_mappings`` (each always a list).

        On failure, ``{"error": "<message>"}``: the file could not be read or
        parsed, or it holds no usable ``slots`` mapping. No exception is
        raised for a bad file, so one broken file cannot abort a whole scan.
    """
    mapping_keys = ("exact_mappings", "close_mappings", "related_mappings",
                    "narrow_mappings", "broad_mappings")

    try:
        # Read with an explicit encoding so the result does not depend on the
        # platform's locale default.
        with open(slot_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as e:
        return {"error": str(e)}

    try:
        content = yaml.safe_load(text)
    except Exception as e:
        # Deliberately broad: besides yaml.YAMLError, PyYAML's constructors
        # can raise plain ValueError (e.g. for an out-of-range timestamp).
        return {"error": str(e)}

    # A YAML document may also be a list or a scalar; only a mapping can
    # carry a 'slots' key.
    if not isinstance(content, dict) or 'slots' not in content:
        return {"error": "No slots found"}

    slots = content['slots']
    # 'slots:' with no value parses to None.
    if not isinstance(slots, dict):
        return {"error": "No slots found"}

    predicates = {}
    for slot_name, slot_def in slots.items():
        # A slot declared with an empty body parses to None; treat it as a
        # slot without slot_uri and without mappings.
        if not isinstance(slot_def, dict):
            slot_def = {}
        entry = {"slot_uri": slot_def.get('slot_uri')}
        for key in mapping_keys:
            entry[key] = _as_mapping_list(slot_def.get(key))
        predicates[slot_name] = entry
    return predicates
def validate_predicate(predicate: str) -> tuple:
    """Validate a predicate against the VALID_PREDICATES allow-list.

    Args:
        predicate: A CURIE string such as ``"schema:name"``. Values loaded
            from YAML may also be None or a non-string; both are handled.

    Returns:
        A ``(is_valid, note)`` tuple:

        * ``(False, "None")`` when ``predicate`` is None;
        * ``(True, None)`` when it is listed in VALID_PREDICATES;
        * ``(True, "custom")`` when it is not listed but starts with ``hc:``;
        * ``(False, "Unknown predicate: ...")`` otherwise, including any
          non-string value.
    """
    if predicate is None:
        return False, "None"
    # YAML can yield ints, lists or dicts here. A list or dict is unhashable
    # and would make the set lookup raise TypeError, and an int has no
    # .startswith(), so reject non-strings before touching either.
    if not isinstance(predicate, str):
        return False, f"Unknown predicate: {predicate}"
    if predicate in VALID_PREDICATES:
        return True, None
    # Check if it's a custom HC predicate (allowed)
    if predicate.startswith("hc:"):
        return True, "custom"
    return False, f"Unknown predicate: {predicate}"
def main(argv=None):
    """Validate every ``*.yaml`` slot file in a directory and print a summary.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default), argparse reads ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 when no invalid predicate was found, 1 when at
        least one was found or the slots directory does not exist.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Validate slot mappings against ontology predicates")
    parser.add_argument("--slots-dir", default="schemas/20251121/linkml/modules/slots",
                        help="Path to slots directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show all predicates")
    args = parser.parse_args(argv)

    slots_dir = Path(args.slots_dir)
    # is_dir() rather than exists(): a regular file at this path would glob
    # to nothing and be reported as a clean run.
    if not slots_dir.is_dir():
        print(f"Slots directory not found: {slots_dir}")
        return 1

    mapping_types = ("exact_mappings", "close_mappings", "related_mappings",
                     "narrow_mappings", "broad_mappings")
    total_valid = 0
    total_invalid = 0
    # Stored as tuples of strings so they can always be de-duplicated and
    # sorted, even when a YAML value was None or a non-string.
    invalid_predicates = set()

    for slot_file in sorted(slots_dir.glob("*.yaml")):
        predicates = extract_predicates_from_slot(slot_file)
        # The failure marker is {"error": "<message>"}. Real slot entries are
        # dicts, so a slot that is merely *named* "error" is not mistaken
        # for a failure.
        if isinstance(predicates.get("error"), str):
            # Report instead of skipping silently, so an unreadable or
            # unparsable file does not pass unnoticed.
            print(f"Skipping {slot_file.name}: {predicates['error']}", file=sys.stderr)
            continue

        for slot_name, mappings in predicates.items():
            candidates = []
            # A slot without slot_uri is not an error, but it is not a valid
            # predicate either: leave it out of both counters.
            if mappings["slot_uri"] is not None:
                candidates.append(("slot_uri", mappings["slot_uri"]))
            for mapping_type in mapping_types:
                values = mappings.get(mapping_type) or []
                # A single mapping written as a bare scalar must be checked
                # as one predicate, not iterated character by character.
                if not isinstance(values, (list, tuple)):
                    values = [values]
                candidates.extend((mapping_type, pred) for pred in values)

            for field, pred in candidates:
                valid, note = validate_predicate(pred)
                if valid:
                    total_valid += 1
                else:
                    total_invalid += 1
                    invalid_predicates.add((slot_file.name, field, str(pred)))
                if args.verbose:
                    if not valid:
                        status = "INVALID"
                    elif note:
                        status = note
                    else:
                        status = "ok"
                    print(f"  [{status}] {slot_file.name}: {slot_name}.{field} = {pred}")

    print("Validation Results:")
    print(f" Valid predicates: {total_valid}")
    print(f" Invalid predicates: {total_invalid}")
    print()
    if invalid_predicates:
        print("Invalid predicates found:")
        for filename, mapping_type, pred in sorted(invalid_predicates):
            print(f" {filename}: {mapping_type} = {pred}")
    return 0 if total_invalid == 0 else 1
if __name__ == "__main__":
    # The bare exit() helper is installed by the `site` module for
    # interactive use and is missing under `python -S`; SystemExit is
    # always available and carries main()'s return value as the exit status.
    raise SystemExit(main())