import os
import re

import yaml

# Configuration
SCHEMA_DIR = "schemas/20251121/linkml"
ONTOLOGY_DIR = "data/ontology"

# Mapping from prefix to filename (based on ONTOLOGY_CATALOG.md and ls output)
# Corrected based on actual file existence
PREFIX_TO_FILE = {
    "crm": "CIDOC_CRM_v7.1.3.rdf",
    "crmgeo": "CRMgeo_v1_2.rdfs",
    "rico": "RiC-O_1-1.rdf",
    "edm": "edm.owl",
    "bf": "bibframe.rdf",
    "premis": "premis3.owl",
    "pico": "pico.ttl",
    "foaf": "foaf.ttl",
    "org": "org.rdf",
    "cpov": "core-public-organisation-ap.ttl",
    "regorg": "regorg.ttl",
    "tooi": "tooiont.ttl",
    "gleif-base": "gleif_base.ttl",
    "gleif-L1": "gleif_l1.ttl",
    "gleif-L2": "gleif_l2.ttl",
    "gleif-elf": "gleif_legal_form.ttl",
    "gleif-ra": "gleif_ra.ttl",
    "ebg": "ebg-ontology.ttl",
    "fibo-fnd": "fibo.rdf",
    "gn": "geonames_ontology.rdf",
    "geo": "geo.ttl",
    "wgs84_pos": "wgs84_pos.rdf",
    "lcc-cr": "lcc-cr.rdf",
    "lcc-3166-1": "lcc-3166-1.rdf",
    "lcc-3166-2": "lcc-3166-2.rdf",
    "lcc-lr": "lcc-lr.rdf",
    "lcc-639-1": "lcc-639-1.rdf",
    "sosa": "sosa.ttl",
    "ssn": "ssn.ttl",
    "skos": "skos.rdf",
    "dc": "dublin_core_elements.rdf",
    "dcterms": "dcterms.rdf",  # Changed from dublin_core_elements.rdf to dcterms.rdf
    "dcat": "dcat3.ttl",
    "schema": "schemaorg.owl",
    "vcard": "vcard.ttl",
    "dqv": "dqv.ttl",
    "adms": "adms.ttl",
    "prov": "prov-o.ttl",  # Prefer PROV-O
    "pav": "pav.rdf",
    "time": "time.ttl",
    "ore": "ore.rdf",
    "era": "era_ontology.ttl",
    "doap": "doap.rdf",
    "dbo": "dbpedia_ontology.owl",
    "hydra": "hydra_cg.jsonld",
    "oasis": "oasis.owl",
    "wod": "wod_thing.ttl",
}

# Cache for ontology content, keyed by the filename that was requested
ontology_content = {}

# Extensions that are commonly confused with each other; when the requested
# file is missing, the sibling file with the other extension is tried instead.
_EXTENSION_FALLBACKS = {".ttl": ".rdf", ".rdf": ".ttl"}

# Regex suffix asserting that a matched name is not merely the beginning of a
# longer identifier (e.g. "Person" inside "PersonName").
_NAME_END = r"(?![\w-])"


def get_ontology_content(filename):
    """Return the text of an ontology file, reading it at most once.

    Args:
        filename: File name resolved against ``ONTOLOGY_DIR``.

    Returns:
        The file content as a string, or ``None`` when neither the file nor
        its ``.ttl``/``.rdf`` sibling exists, or when it cannot be read.
    """
    if filename in ontology_content:
        return ontology_content[filename]

    path = os.path.join(ONTOLOGY_DIR, filename)
    if not os.path.exists(path):
        # Swap only the extension. A plain str.replace() on the whole path
        # would also rewrite ".ttl"/".rdf" occurring in directory names.
        stem, extension = os.path.splitext(path)
        alternate = _EXTENSION_FALLBACKS.get(extension)
        if alternate is None or not os.path.exists(stem + alternate):
            return None
        path = stem + alternate

    try:
        # Undecodable bytes are dropped on purpose: the content is only used
        # for substring searches.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return None

    ontology_content[filename] = content
    return content


def check_term_in_ontology(prefix, term, filename):
    """Heuristically decide whether ``term`` is mentioned in an ontology file.

    The file is searched as plain text; no RDF parsing is performed.

    Args:
        prefix: CURIE prefix of the term. Only consulted for the
            schema.org special case.
        term: Local name to look for.
        filename: Ontology file name, loaded via ``get_ontology_content``.

    Returns:
        ``True`` if one of the textual patterns below is found, ``False``
        otherwise, including when the file is unavailable or ``term`` is
        empty.
    """
    content = get_ontology_content(filename)
    if content is None:
        return False  # Ontology file missing

    if not term:
        # An empty term would trivially match any ":" in the file.
        return False

    # Exact-delimited forms. These also cover rdf:ID="term",
    # rdf:about="term" and rdf:about="#term", which contain them.
    literal_markers = (
        f'ID="{term}"',
        f'about="{term}"',
        f'/{term}"',
        f'#{term}"',
        f" {term} ",  # Very loose
    )
    if any(marker in content for marker in literal_markers):
        return True

    escaped = re.escape(term)

    # ":term" covers both "prefix:term" and local-name notation. The
    # trailing boundary keeps "Person" from matching ":PersonName".
    if re.search(rf":{escaped}{_NAME_END}", content):
        return True

    # Special case for schema.org which uses http://schema.org/Term
    if prefix == "schema" and re.search(
        rf"schema\.org/{escaped}{_NAME_END}", content
    ):
        return True

    return False