glam/schemas/20251121/linkml/scripts/audit_compliance.py

149 lines
4.5 KiB
Python

import os
import yaml
import glob
# Directories scanned for module files; both are relative paths, so they are
# resolved against the current working directory when the script runs.
CLASSES_DIR = "modules/classes"
SLOTS_DIR = "modules/slots"
# Default annotation values written into a class/slot definition when the
# corresponding annotation is missing.
DEFAULT_SPECIFICITY_SCORE = 0.1
DEFAULT_SPECIFICITY_RATIONALE = "Generic utility class/slot created during migration"
# "*" is the custodian-type value used as the catch-all default here.
DEFAULT_CUSTODIAN_TYPES = ["*"]
DEFAULT_CUSTODIAN_RATIONALE = "Universal utility concept"
def _locate_entity(content, is_class):
    """Return the class/slot definition mapping inside *content*, or None.

    Two file layouts are recognised:

    * nested -- the definition sits under a top-level ``classes:`` or
      ``slots:`` mapping; only the first entry is used because these modules
      are assumed to hold one entity per file;
    * flat (slots only) -- the top-level keys (``id``, ``name``,
      ``slot_uri``, ...) are the slot definition itself.

    None is returned when no mapping can be found: a missing ``classes:``
    key, an empty or null section, or an entity body that is not a mapping.
    """
    section_key = 'classes' if is_class else 'slots'
    if section_key in content:
        section = content[section_key]
        # "classes:" with nothing (or a non-mapping) under it has no entity.
        if not isinstance(section, dict) or not section:
            return None
        entity = next(iter(section.values()))
    elif is_class:
        # Classes are only supported in the nested layout.
        return None
    else:
        # Flat slot file: the document root is the slot definition.
        entity = content
    return entity if isinstance(entity, dict) else None


def fix_yaml_file(filepath, is_class=True):
    """Add missing compliance annotations to one class or slot module file.

    The file is parsed with ``yaml.safe_load``; missing annotations are
    filled in with the module-level defaults and the file is rewritten only
    if something changed.  A missing ``class_uri`` / ``slot_uri`` is reported
    with a warning but not fixed.

    NOTE: the file is re-serialised by PyYAML, so comments and the original
    formatting of a modified file are not preserved.

    Args:
        filepath: Path of the YAML file to check.
        is_class: True to treat the file as a class module, False for a slot
            module (which may use the nested or the flat layout).

    Returns:
        True if the file was modified and written back, False otherwise
        (nothing to fix, unreadable/unwritable file, or no entity found).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = yaml.safe_load(f)
    # ValueError covers UnicodeDecodeError and scalar-construction failures
    # (e.g. an impossible date) raised while loading.
    except (OSError, ValueError, yaml.YAMLError) as e:
        print(f"Error reading {filepath}: {e}")
        return False
    if not content:
        return False

    # A document whose root is a list or scalar cannot hold a definition.
    entity_dict = _locate_entity(content, is_class) if isinstance(content, dict) else None
    if entity_dict is None:
        print(f"Could not locate entity definition in {filepath}")
        return False

    modified = False

    annotations = entity_dict.get('annotations')
    if annotations is None:
        # Covers both a missing key and an explicit null ("annotations:").
        annotations = entity_dict['annotations'] = {}
        modified = True
    elif not isinstance(annotations, dict):
        print(f"Could not fix {filepath}: 'annotations' is not a mapping")
        return False

    # Rule 37: Specificity Score (Classes only)
    if is_class and 'specificity_score' not in annotations:
        annotations['specificity_score'] = DEFAULT_SPECIFICITY_SCORE
        annotations['specificity_rationale'] = DEFAULT_SPECIFICITY_RATIONALE
        modified = True

    # Rule 13: Custodian Types (Classes and Slots)
    if 'custodian_types' not in annotations:
        # Copy so the loaded document never aliases the module-level default.
        annotations['custodian_types'] = list(DEFAULT_CUSTODIAN_TYPES)
        annotations['custodian_types_rationale'] = DEFAULT_CUSTODIAN_RATIONALE
        modified = True

    # Rule 50 (class_uri) / Rule 38 (slot_uri): cannot be auto-fixed without
    # domain knowledge, so a missing URI is only flagged.
    uri_key, kind = ('class_uri', 'Class') if is_class else ('slot_uri', 'Slot')
    if uri_key not in entity_dict:
        print(f"WARNING: {kind} {filepath} missing {uri_key}")

    if not modified:
        return False

    # Serialise before opening for write so a dump failure cannot leave a
    # truncated file behind; allow_unicode keeps non-ASCII text readable.
    text = yaml.dump(content, sort_keys=False, width=1000, allow_unicode=True)
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
    except OSError as e:
        print(f"Error writing {filepath}: {e}")
        return False
    print(f"Fixed {filepath}")
    return True
def run():
    """Run the fixer over every class module, then over every slot module.

    Each pass prints a banner, globs ``*.yaml`` directly inside the
    corresponding directory and hands every match to ``fix_yaml_file``.
    """
    passes = (
        ("Scanning Classes...", CLASSES_DIR, True),
        ("Scanning Slots...", SLOTS_DIR, False),
    )
    for banner, directory, treat_as_class in passes:
        print(banner)
        pattern = os.path.join(directory, "*.yaml")
        for yaml_path in glob.glob(pattern):
            fix_yaml_file(yaml_path, is_class=treat_as_class)


# Only run the scan when executed as a script, not when imported.
if __name__ == "__main__":
    run()