# glam/schemas/20251121/linkml/scripts/fix_schema_structure.py
#
# 125 lines
# 4.4 KiB
# Python

import copy
import glob
import os
import re

import yaml
# Directories scanned for ``*.yaml`` module files. Both are relative paths, so
# they resolve against the current working directory of the process.
SLOTS_DIR = "schemas/20251121/linkml/modules/slots"
CLASSES_DIR = "schemas/20251121/linkml/modules/classes"

# Prefix -> namespace IRI map. fix_file_structure uses it as the default
# ``prefixes`` block and to fill in any of these prefixes a file is missing.
STANDARD_PREFIXES = {
    "linkml": "https://w3id.org/linkml/",
    "hc": "https://nde.nl/ontology/hc/",
    "schema": "http://schema.org/",
    "dcterms": "http://purl.org/dc/terms/",
    "prov": "http://www.w3.org/ns/prov#",
    "crm": "http://www.cidoc-crm.org/cidoc-crm/",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "org": "http://www.w3.org/ns/org#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
}
def fix_file_structure(filepath, is_class=False):
    """Normalise one LinkML module YAML file in place.

    The file is parsed with ``yaml.safe_load``, adjusted in memory, and
    written back only when at least one adjustment was made:

    1. A flat definition (no top-level ``slots``/``classes`` key, but a
       ``name``) is wrapped in a schema-level mapping carrying ``id``,
       ``name``, ``title``, ``prefixes``, ``default_prefix`` and ``imports``.
    2. Entries of ``STANDARD_PREFIXES`` missing from ``prefixes`` are added.
    3. ``linkml:types`` is added to ``imports`` when absent.
    4. Every entity without ``custodian_types`` in its ``annotations`` gets
       the default ``custodian_types`` / ``custodian_types_rationale`` pair.
    5. An entity whose ``slot_uri`` looks like a CURIE and that has neither
       ``exact_mappings`` nor ``close_mappings`` gets
       ``exact_mappings = [slot_uri]``.

    Because the document is re-serialised with ``yaml.dump``, YAML comments
    and the original formatting of a modified file are not preserved.

    Args:
        filepath: Path of the YAML file to inspect and possibly rewrite.
        is_class: When true the entities are expected under ``classes``;
            otherwise under ``slots``.

    Returns:
        None. Progress, skips and read errors are reported via ``print``;
        read/parse failures do not raise so a batch run can continue.
    """
    print(f"Processing {filepath}...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = yaml.safe_load(f)
    except (OSError, ValueError, yaml.YAMLError) as e:
        # ValueError covers UnicodeDecodeError and malformed scalars (e.g.
        # impossible dates) that PyYAML reports outside its own hierarchy.
        print(f"Error reading {filepath}: {e}")
        return

    if not content:
        return
    if not isinstance(content, dict):
        print(f"Skipping {filepath}: top-level YAML node is not a mapping.")
        return

    modified = False

    # 1. Fix Nesting
    main_key = 'classes' if is_class else 'slots'
    if main_key not in content:
        if 'name' not in content:
            print(f"Skipping {filepath}: Cannot determine entity name for nesting.")
            return
        name = content['name']
        kind = 'class' if is_class else 'slot'
        old_prefixes = content.get('prefixes')
        old_imports = content.get('imports')
        # The flat definition is kept untouched as the inner entity (including
        # any schema-level keys it carried). The top-level copies are deep
        # copies: sharing the same dict/list object between both levels would
        # make yaml.dump emit anchors/aliases, and sharing STANDARD_PREFIXES
        # itself would expose the module constant to in-place mutation.
        content = {
            'id': content.get('id', f"https://nde.nl/ontology/hc/{kind}/{name}"),
            'name': name,
            'title': content.get('title', name),
            'prefixes': copy.deepcopy(
                STANDARD_PREFIXES if old_prefixes is None else old_prefixes
            ),
            'default_prefix': 'hc',
            'imports': (
                ['linkml:types'] if old_imports is None
                else copy.deepcopy(old_imports)
            ),
            main_key: {name: content},
        }
        modified = True

    # 2. Fix Prefixes (a missing key and an explicit null are treated alike)
    prefixes = content.get('prefixes')
    if prefixes is None:
        content['prefixes'] = dict(STANDARD_PREFIXES)
        modified = True
    elif isinstance(prefixes, dict):
        for prefix, namespace in STANDARD_PREFIXES.items():
            if prefix not in prefixes:
                prefixes[prefix] = namespace
                modified = True
    else:
        print(f"Warning: {filepath}: 'prefixes' is not a mapping; leaving it untouched.")

    # 3. Fix Imports
    imports = content.get('imports')
    if imports is None:
        content['imports'] = ['linkml:types']
        modified = True
    elif isinstance(imports, list):
        if 'linkml:types' not in imports:
            imports.append('linkml:types')
            modified = True
    elif isinstance(imports, str):
        # A single scalar import: promote it to a list before extending.
        if imports != 'linkml:types':
            content['imports'] = [imports, 'linkml:types']
            modified = True
    else:
        print(f"Warning: {filepath}: 'imports' is not a list; leaving it untouched.")

    # 4./5. Fix Annotations and Mappings (in the inner entities)
    entities = content[main_key]
    if not isinstance(entities, dict):
        print(f"Warning: {filepath}: '{main_key}' is empty or not a mapping; no entities to fix.")
        entities = {}

    for entity_name, entity_def in entities.items():
        if not isinstance(entity_def, dict):
            print(f"Warning: {filepath}: '{entity_name}' has no mapping body; skipping it.")
            continue

        anns = entity_def.get('annotations')
        if anns is None:
            anns = {}
            entity_def['annotations'] = anns
            modified = True
        if isinstance(anns, dict) and 'custodian_types' not in anns:
            anns['custodian_types'] = ['*']
            anns['custodian_types_rationale'] = 'Universal utility concept'
            modified = True

        # Only a CURIE-looking slot_uri (contains ':' and is not an http(s)
        # IRI) is mirrored into exact_mappings, and only when the entity has
        # no exact/close mappings yet.
        uri = entity_def.get('slot_uri')
        if (
            isinstance(uri, str)
            and 'exact_mappings' not in entity_def
            and 'close_mappings' not in entity_def
            and ':' in uri
            and not uri.startswith('http')
        ):
            entity_def['exact_mappings'] = [uri]
            modified = True

    if modified:
        with open(filepath, 'w', encoding='utf-8') as f:
            # allow_unicode keeps non-ASCII text readable instead of escaped.
            yaml.dump(content, f, sort_keys=False, width=1000, allow_unicode=True)
        print(f"Fixed {filepath}")
def run():
    """Run fix_file_structure over every ``*.yaml`` slot file, then every class file."""
    # (directory, is_class) pairs, visited in this order. Every file is passed
    # through the generic fixer, which itself checks for the 'slots'/'classes'
    # key to decide whether nesting is needed. Class files are less likely to
    # be flat, but are checked as well.
    targets = (
        (SLOTS_DIR, False),
        (CLASSES_DIR, True),
    )
    for directory, treat_as_class in targets:
        pattern = os.path.join(directory, "*.yaml")
        for path in glob.glob(pattern):
            fix_file_structure(path, is_class=treat_as_class)


if __name__ == "__main__":
    run()