- Added `fix_dual_class_link.py` to remove dual class link references from specified YAML files. - Created `fix_specific_ghosts.py` to apply specific replacements in YAML files based on defined mappings. - Introduced `migrate_staff_count.py` to migrate staff count references to a new structure in specified YAML files. - Developed `migrate_type_slots.py` to replace type-related slots with new identifiers across YAML files. - Implemented `scan_ghost_references.py` to identify and report ghost references to archived slots and classes in YAML files. - Added `verify_ontology_terms.py` to verify the presence of ontology terms in specified ontology files against schema definitions.
152 lines
5.1 KiB
Python
152 lines
5.1 KiB
Python
|
|
import os
|
|
import re
|
|
import yaml
|
|
|
|
# --- Configuration ---------------------------------------------------------

# Root of the schema tree that is walked for *.yaml files.
SCHEMA_DIR = "schemas/20251121/linkml"
# Directory that holds the ontology files named in PREFIX_TO_FILE.
ONTOLOGY_DIR = "data/ontology"

# CURIE prefix -> ontology file name (relative to ONTOLOGY_DIR).
# Originally derived from ONTOLOGY_CATALOG.md and a directory listing, then
# corrected against the files that actually exist.
PREFIX_TO_FILE = {
    "crm": "CIDOC_CRM_v7.1.3.rdf",
    "crmgeo": "CRMgeo_v1_2.rdfs",
    "rico": "RiC-O_1-1.rdf",
    "edm": "edm.owl",
    "bf": "bibframe.rdf",
    "premis": "premis3.owl",
    "pico": "pico.ttl",
    "foaf": "foaf.ttl",
    "org": "org.rdf",
    "cpov": "core-public-organisation-ap.ttl",
    "regorg": "regorg.ttl",
    "tooi": "tooiont.ttl",
    "gleif-base": "gleif_base.ttl",
    "gleif-L1": "gleif_l1.ttl",
    "gleif-L2": "gleif_l2.ttl",
    "gleif-elf": "gleif_legal_form.ttl",
    "gleif-ra": "gleif_ra.ttl",
    "ebg": "ebg-ontology.ttl",
    "fibo-fnd": "fibo.rdf",
    "gn": "geonames_ontology.rdf",
    "geo": "geo.ttl",
    "wgs84_pos": "wgs84_pos.rdf",
    "lcc-cr": "lcc-cr.rdf",
    "lcc-3166-1": "lcc-3166-1.rdf",
    "lcc-3166-2": "lcc-3166-2.rdf",
    "lcc-lr": "lcc-lr.rdf",
    "lcc-639-1": "lcc-639-1.rdf",
    "sosa": "sosa.ttl",
    "ssn": "ssn.ttl",
    "skos": "skos.rdf",
    "dc": "dublin_core_elements.rdf",
    # dcterms has its own file; it previously pointed at dublin_core_elements.rdf.
    "dcterms": "dcterms.rdf",
    "dcat": "dcat3.ttl",
    "schema": "schemaorg.owl",
    "vcard": "vcard.ttl",
    "dqv": "dqv.ttl",
    "adms": "adms.ttl",
    # PROV-O is the preferred file for the prov prefix.
    "prov": "prov-o.ttl",
    "pav": "pav.rdf",
    "time": "time.ttl",
    "ore": "ore.rdf",
    "era": "era_ontology.ttl",
    "doap": "doap.rdf",
    "dbo": "dbpedia_ontology.owl",
    "hydra": "hydra_cg.jsonld",
    "oasis": "oasis.owl",
    "wod": "wod_thing.ttl",
}

# In-memory cache of ontology file text, keyed by the requested file name.
ontology_content = {}
|
|
|
|
def get_ontology_content(filename):
    """Return the text of an ontology file under ``ONTOLOGY_DIR``.

    The result is memoised in the module-level ``ontology_content`` dict,
    keyed by the *requested* ``filename``. Failures are memoised too (as
    ``None``) so that a missing ontology is not looked up on disk again for
    every term that references it.

    If the requested file does not exist, the sibling serialisation is tried:
    a ``.ttl`` name falls back to ``.rdf`` and vice versa.

    Args:
        filename: File name relative to ``ONTOLOGY_DIR``.

    Returns:
        The file content decoded as UTF-8 (undecodable bytes are dropped), or
        ``None`` when neither the file nor its fallback exists or the file
        cannot be read.
    """
    if filename in ontology_content:
        return ontology_content[filename]

    path = os.path.join(ONTOLOGY_DIR, filename)
    if not os.path.exists(path):
        # Swap only the final extension; a plain str.replace() would also
        # rewrite ".ttl"/".rdf" appearing earlier in the path.
        stem, ext = os.path.splitext(path)
        fallback_ext = {".ttl": ".rdf", ".rdf": ".ttl"}.get(ext)
        fallback = stem + fallback_ext if fallback_ext else None
        if fallback is None or not os.path.exists(fallback):
            ontology_content[filename] = None
            return None
        path = fallback

    try:
        # errors='ignore' keeps the scan best-effort on files with stray bytes.
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Unreadable (permissions, directory, I/O error): treat as missing.
        content = None

    ontology_content[filename] = content
    return content
|
|
|
|
def _term_in_content(prefix, term, content):
    """Heuristically decide whether *term* is mentioned as an identifier in *content*."""
    # Cheap rejection: every pattern below contains the term verbatim.
    if not term or term not in content:
        return False

    name = re.escape(term)
    # The term must not continue into a longer local name, otherwise
    # "Person" would be accepted because "PersonName" is defined.
    end = r'(?![\w-])'

    patterns = (
        # Turtle / N3 / JSON-LD compact names: "prefix:term" and ":term".
        rf':{name}{end}',
        # RDF/XML: ID="term", about="term" (also matches rdf:ID= / rdf:about=).
        rf'(?:ID|about)="{name}"',
        # IRI whose last segment or fragment is the term, closed by a quote
        # (XML attribute / JSON string) or by ">" (Turtle / N-Triples IRI).
        rf'[/#]{name}[">]',
    )
    if any(re.search(pattern, content) for pattern in patterns):
        return True

    # Very loose: the bare term surrounded by single spaces.
    if f' {term} ' in content:
        return True

    # schema.org spells terms as http://schema.org/Term.
    if prefix == 'schema' and re.search(rf'schema\.org/{name}{end}', content):
        return True

    return False


def check_term_in_ontology(prefix, term, filename):
    """Check whether ``prefix:term`` appears to be defined in an ontology file.

    This is a textual heuristic, not an RDF parse: the ontology text is
    searched for the common spellings of a term in Turtle, RDF/XML and
    JSON-LD. Matches are anchored at the end of the term so that a longer
    name sharing the same beginning does not count.

    Args:
        prefix: CURIE prefix (only used for the schema.org special case).
        term: Local name to look for.
        filename: Ontology file name, resolved by ``get_ontology_content``.

    Returns:
        ``True`` if one of the heuristics matches; ``False`` if none matches
        or the ontology file could not be loaded.
    """
    content = get_ontology_content(filename)
    if content is None:
        return False  # Ontology file missing or unreadable
    return _term_in_content(prefix, term, content)
|
|
|
|
def verify_files():
    """Report CURIEs in the schema YAML files that are missing from their ontology.

    Walks ``SCHEMA_DIR`` (skipping ``archive`` directories), reads every
    ``*.yaml`` file as text, extracts everything that looks like
    ``prefix:Term`` and, for prefixes listed in ``PREFIX_TO_FILE``, asks
    ``check_term_in_ontology`` whether the term occurs in the mapped ontology
    file. One line is printed to stdout per occurrence that is not found.
    Files that cannot be read or decoded are skipped.
    """
    # The prefix class must cover every key of PREFIX_TO_FILE, including
    # uppercase ("gleif-L1") and underscores ("wgs84_pos"). URLs such as
    # "http://..." never match because "/" cannot start the term part.
    curie_pattern = re.compile(r'\b([A-Za-z0-9_-]+):([a-zA-Z0-9_]+)\b')

    # (prefix, term) -> bool, so each distinct CURIE is searched for only once.
    verdicts = {}

    for root, dirs, files in os.walk(SCHEMA_DIR):
        # Prune in place so os.walk does not descend into archive directories.
        if "archive" in dirs:
            dirs.remove("archive")

        # Also skip if the current root is itself inside an archive directory.
        if "/archive" in root or os.path.sep + "archive" in root:
            continue

        for file in files:
            if not file.endswith(".yaml"):
                continue

            filepath = os.path.join(root, file)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError):
                # Best effort: unreadable or non-UTF-8 files are ignored.
                continue

            for prefix, term in curie_pattern.findall(content):
                filename = PREFIX_TO_FILE.get(prefix)
                if filename is None:
                    continue

                key = (prefix, term)
                if key not in verdicts:
                    verdicts[key] = check_term_in_ontology(prefix, term, filename)

                if not verdicts[key]:
                    print(f"File: {filepath} | Message: Term not found in {filename} ontology file. | Term: {prefix}:{term}")
|
|
|
|
# Run the verification only when executed as a script, not when imported.
if __name__ == "__main__":
    verify_files()
|