glam/scripts/verify_ontology_terms.py
kempersc 7cf10084b4 Implement scripts for schema modifications and ontology verification
- Added `fix_dual_class_link.py` to remove dual class link references from specified YAML files.
- Created `fix_specific_ghosts.py` to apply specific replacements in YAML files based on defined mappings.
- Introduced `migrate_staff_count.py` to migrate staff count references to a new structure in specified YAML files.
- Developed `migrate_type_slots.py` to replace type-related slots with new identifiers across YAML files.
- Implemented `scan_ghost_references.py` to identify and report ghost references to archived slots and classes in YAML files.
- Added `verify_ontology_terms.py` to verify the presence of ontology terms in specified ontology files against schema definitions.
2026-01-29 17:10:25 +01:00

152 lines
5.1 KiB
Python

import os
import re
import yaml
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Directory that is walked for schema YAML files.
SCHEMA_DIR = "schemas/20251121/linkml"

# Directory that holds the ontology files named in PREFIX_TO_FILE.
ONTOLOGY_DIR = "data/ontology"

# CURIE prefix -> ontology filename inside ONTOLOGY_DIR.
# Based on ONTOLOGY_CATALOG.md and a directory listing, then corrected to
# match the files that actually exist.
PREFIX_TO_FILE = {
    "crm": "CIDOC_CRM_v7.1.3.rdf",
    "crmgeo": "CRMgeo_v1_2.rdfs",
    "rico": "RiC-O_1-1.rdf",
    "edm": "edm.owl",
    "bf": "bibframe.rdf",
    "premis": "premis3.owl",
    "pico": "pico.ttl",
    "foaf": "foaf.ttl",
    "org": "org.rdf",
    "cpov": "core-public-organisation-ap.ttl",
    "regorg": "regorg.ttl",
    "tooi": "tooiont.ttl",
    "gleif-base": "gleif_base.ttl",
    "gleif-L1": "gleif_l1.ttl",
    "gleif-L2": "gleif_l2.ttl",
    "gleif-elf": "gleif_legal_form.ttl",
    "gleif-ra": "gleif_ra.ttl",
    "ebg": "ebg-ontology.ttl",
    "fibo-fnd": "fibo.rdf",
    "gn": "geonames_ontology.rdf",
    "geo": "geo.ttl",
    "wgs84_pos": "wgs84_pos.rdf",
    "lcc-cr": "lcc-cr.rdf",
    "lcc-3166-1": "lcc-3166-1.rdf",
    "lcc-3166-2": "lcc-3166-2.rdf",
    "lcc-lr": "lcc-lr.rdf",
    "lcc-639-1": "lcc-639-1.rdf",
    "sosa": "sosa.ttl",
    "ssn": "ssn.ttl",
    "skos": "skos.rdf",
    "dc": "dublin_core_elements.rdf",
    # Changed from dublin_core_elements.rdf to dcterms.rdf
    "dcterms": "dcterms.rdf",
    "dcat": "dcat3.ttl",
    "schema": "schemaorg.owl",
    "vcard": "vcard.ttl",
    "dqv": "dqv.ttl",
    "adms": "adms.ttl",
    # Prefer PROV-O
    "prov": "prov-o.ttl",
    "pav": "pav.rdf",
    "time": "time.ttl",
    "ore": "ore.rdf",
    "era": "era_ontology.ttl",
    "doap": "doap.rdf",
    "dbo": "dbpedia_ontology.owl",
    "hydra": "hydra_cg.jsonld",
    "oasis": "oasis.owl",
    "wod": "wod_thing.ttl",
}
# Cache for ontology content, keyed by the filename that was requested.
# Misses are stored as None so a missing or unreadable ontology file is
# probed on disk only once instead of once per checked term.
ontology_content = {}

# Extensions that are commonly confused with one another; when the requested
# file is absent the same stem is retried with the partner extension.
_EXTENSION_FALLBACKS = {".ttl": ".rdf", ".rdf": ".ttl"}


def get_ontology_content(filename):
    """Return the text of an ontology file from ONTOLOGY_DIR, or None.

    The file is read as UTF-8 with undecodable bytes dropped.  If
    ``filename`` does not exist and ends in ``.ttl`` or ``.rdf``, the same
    stem with the other extension is tried.  Results, including failures,
    are memoised in ``ontology_content`` under the requested ``filename``.

    Args:
        filename: Ontology filename relative to ONTOLOGY_DIR.

    Returns:
        The file content as a string, or None when no matching file exists
        or it cannot be read.
    """
    if filename in ontology_content:
        return ontology_content[filename]

    path = os.path.join(ONTOLOGY_DIR, filename)
    if not os.path.exists(path):
        # Swap only the extension; a plain str.replace would also rewrite
        # ".ttl"/".rdf" occurring in a directory name or in the stem.
        stem, ext = os.path.splitext(path)
        fallback_ext = _EXTENSION_FALLBACKS.get(ext)
        fallback_path = stem + fallback_ext if fallback_ext else None
        if fallback_path is None or not os.path.exists(fallback_path):
            ontology_content[filename] = None
            return None
        path = fallback_path

    try:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Best effort: an unreadable file is treated like a missing one.
        content = None

    ontology_content[filename] = content
    return content
def check_term_in_ontology(prefix, term, filename):
    """Heuristically decide whether ``term`` appears in an ontology file.

    No RDF parsing is done: the raw file text is searched for a handful of
    substrings in which a term typically shows up in Turtle or RDF/XML.
    The match is intentionally loose, so a hit is only weak evidence that
    the term is actually defined.

    Args:
        prefix: CURIE prefix of the term (e.g. ``"skos"``).
        term: Local name of the term (e.g. ``"Concept"``).
        filename: Ontology filename, as passed to ``get_ontology_content``.

    Returns:
        True if any candidate substring occurs in the file text; False if
        none does or the ontology content is unavailable.
    """
    text = get_ontology_content(filename)
    if text is None:
        # Ontology file missing
        return False

    candidates = [
        # "prefix:term" (common in Turtle)
        f"{prefix}:{term}",
        # Bare term defined with a local base or ID - risky, but often
        # needed for RDF/XML.
        f'ID="{term}"',
        f'about="{term}"',
        f'/{term}"',
        f'#{term}"',
        f' {term} ',  # very loose
        # ":term" (Turtle/N3 local name)
        f":{term}",
        # Term as a subject in RDF/XML (these contain the shorter
        # ID=/about=/# patterns above as substrings).
        f'rdf:ID="{term}"',
        f'rdf:about="{term}"',
        f'rdf:about="#{term}"',
    ]
    if prefix == 'schema':
        # schema.org uses http://schema.org/Term
        candidates.append(f"schema.org/{term}")

    return any(candidate in text for candidate in candidates)
# Matches CURIEs of the form "prefix:Term".  The prefix class includes
# uppercase letters and "_" so that configured prefixes such as "gleif-L1",
# "gleif-L2" and "wgs84_pos" can be recognised.  The term part stops at the
# first character outside [A-Za-z0-9_], so a hyphenated local name is
# truncated at the hyphen.
CURIE_PATTERN = re.compile(
    r'(?<!http:)(?<!https:)\b([A-Za-z0-9_-]+):([a-zA-Z0-9_]+)\b'
)


def verify_files():
    """Report schema CURIEs whose term is not found in its ontology file.

    Walks SCHEMA_DIR (skipping ``archive`` directories), reads every
    ``.yaml`` file as UTF-8 text, extracts ``prefix:Term`` pairs with
    CURIE_PATTERN and, for each prefix listed in PREFIX_TO_FILE, asks
    ``check_term_in_ontology`` whether the term occurs in the mapped
    ontology file.  One line is printed to stdout for every occurrence
    that is not found.  Files that cannot be read or decoded are skipped
    with a note on stderr.

    Returns:
        None.
    """
    import sys  # local import: only needed for the skip diagnostics

    for root, dirs, files in os.walk(SCHEMA_DIR):
        # Prune in place so os.walk does not descend into archive folders.
        if "archive" in dirs:
            dirs.remove("archive")
        # Also skip if the current root is itself inside an archive directory.
        if "/archive" in root or os.path.sep + "archive" in root:
            continue

        for file in files:
            if not file.endswith(".yaml"):
                continue

            filepath = os.path.join(root, file)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as exc:
                # Best effort: keep scanning the remaining files, but do not
                # swallow KeyboardInterrupt/SystemExit as a bare except would.
                print(f"Skipping unreadable file {filepath}: {exc}", file=sys.stderr)
                continue

            for prefix, term in CURIE_PATTERN.findall(content):
                filename = PREFIX_TO_FILE.get(prefix)
                if filename is None:
                    # Prefix has no ontology file configured; nothing to check.
                    continue
                if not check_term_in_ontology(prefix, term, filename):
                    print(f"File: {filepath} | Message: Term not found in {filename} ontology file. | Term: {prefix}:{term}")


if __name__ == "__main__":
    verify_files()