#!/usr/bin/env python3
"""
Extract classes from custodian_source.yaml to individual files in modules/classes/

This script:
1. Parses the monolithic custodian_source.yaml
2. Creates individual class files in modules/classes/
3. Each class file includes proper headers and imports

Per Rule 38 in AGENTS.md - modular schema files
"""

import os
from datetime import datetime
from pathlib import Path

# Paths
SCHEMA_DIR = Path("/Users/kempersc/apps/glam/schemas/20251121/linkml")
SOURCE_FILE = SCHEMA_DIR / "custodian_source.yaml"
CLASSES_DIR = SCHEMA_DIR / "modules" / "classes"
ENUMS_DIR = SCHEMA_DIR / "modules" / "enums"

# Text used whenever a class carries no usable description.
NO_DESCRIPTION = 'No description available'

# Enum names recognised by default when scanning a class for enum ranges.
KNOWN_ENUMS = frozenset({
    'DataTierEnum',
    'EnrichmentStatusEnum',
    'InstitutionTypeCodeEnum',
    'IdentifierSchemeEnum',
    'LocationResolutionMethodEnum',
    'GoogleMapsStatusEnum',
    'WikidataListTypeEnum',
})

# Template for class file.
# {enum_imports} / {class_imports} are either empty or start with a newline,
# so they are placed directly after the last fixed import entry; this keeps
# the generated ``imports:`` list free of stray blank lines.
CLASS_TEMPLATE = """# {class_name} - {description}
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: {date}

id: https://nde.nl/ontology/hc/classes/{class_name}
name: {class_name}
title: {class_name}

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  xsd: http://www.w3.org/2001/XMLSchema#

imports:
  - linkml:types{enum_imports}{class_imports}

default_range: string

classes:
  {class_name}:
{class_content}
"""


def get_class_description(class_def):
    """Extract description from class definition.

    Returns the value stored under ``description`` when ``class_def`` is a
    dict and that value is neither ``None`` nor an empty string; otherwise
    returns the fallback text ``'No description available'``.
    """
    if isinstance(class_def, dict):
        description = class_def.get('description')
        if description is not None and description != '':
            return description
    return NO_DESCRIPTION


def _collect_ranges(node, candidates):
    """Return every string ``range`` value in ``node`` that is in ``candidates``.

    ``node`` is walked through nested dicts and lists; anything else is
    ignored. Non-string ``range`` values are never matched directly but are
    still descended into.
    """
    found = set()
    pending = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            for key, value in current.items():
                if key == 'range' and isinstance(value, str) and value in candidates:
                    found.add(value)
                else:
                    pending.append(value)
        elif isinstance(current, list):
            pending.extend(current)
    return found


def find_enum_references(class_def, known_enums=None):
    """Find enums referenced by this class.

    Args:
        class_def: Parsed class definition (normally a dict; other values
            yield an empty result).
        known_enums: Optional iterable of enum names to look for. When
            omitted, the built-in ``KNOWN_ENUMS`` list is used.

    Returns:
        A set with the names of all matching enums that appear as a
        ``range`` value anywhere inside ``class_def``.
    """
    candidates = KNOWN_ENUMS if known_enums is None else frozenset(known_enums)
    return _collect_ranges(class_def, candidates)


def find_class_references(class_def, all_classes):
    """Find other classes referenced by this class.

    A class counts as referenced when its name (a member of ``all_classes``)
    appears as a ``range`` value anywhere inside ``class_def``, or as the
    class's own top-level ``is_a`` parent or one of its top-level ``mixins``.
    Only the top level is inspected for ``is_a``/``mixins`` because nested
    occurrences (e.g. under ``attributes``) describe slots, not classes.

    Args:
        class_def: Parsed class definition.
        all_classes: Container of all class names known in the schema.

    Returns:
        A set of referenced class names (may include the class itself).
    """
    refs = _collect_ranges(class_def, all_classes)

    if isinstance(class_def, dict):
        ancestors = [class_def.get('is_a')]
        mixins = class_def.get('mixins')
        if isinstance(mixins, list):
            ancestors.extend(mixins)
        else:
            ancestors.append(mixins)
        refs.update(
            name for name in ancestors
            if isinstance(name, str) and name in all_classes
        )

    return refs


def _format_value(value, current_indent):
    """Render ``value`` as YAML-like text for ``format_class_content``.

    Nested dict entries and list items are emitted at ``current_indent``
    spaces; scalars are returned without any leading indentation so the
    caller can place them after ``key: `` or ``- ``.
    """
    pad = ' ' * current_indent

    if isinstance(value, dict):
        if not value:
            return '{}'
        rendered_entries = []
        for key, child in value.items():
            rendered = _format_value(child, current_indent + 2)
            if isinstance(child, (dict, list)) and child:
                rendered_entries.append(f"{pad}{key}:")
                rendered_entries.append(rendered)
            else:
                rendered_entries.append(f"{pad}{key}: {rendered}")
        return '\n'.join(rendered_entries)

    if isinstance(value, list):
        if not value:
            return '[]'
        rendered_items = []
        for item in value:
            rendered = _format_value(item, current_indent + 2)
            if isinstance(item, (dict, list)) and item:
                # The nested block is already indented two columns deeper
                # than the dash, so only its first line has to be re-homed
                # behind "- "; the remaining lines keep their indentation.
                first, *rest = rendered.split('\n')
                rendered_items.append(f"{pad}- {first.lstrip()}")
                rendered_items.extend(rest)
            else:
                rendered_items.append(f"{pad}- {rendered}")
        return '\n'.join(rendered_items)

    # bool must be tested before the generic fallback so it renders as
    # lowercase true/false rather than Python's True/False.
    if isinstance(value, bool):
        return str(value).lower()
    if value is None:
        return 'null'
    if isinstance(value, str) and '\n' in value:
        # Multiline strings become a literal block scalar.
        inner = ' ' * (current_indent + 2)
        return '|\n' + inner + f"\n{inner}".join(value.strip().split('\n'))
    return str(value) if value != '' else '""'


def format_class_content(class_def, indent=4):
    """Format class definition with proper indentation.

    This is a small hand-written formatter: strings are emitted unquoted,
    so it is not a general-purpose YAML serializer. ``main`` does not use
    it and relies on ``yaml.dump`` instead.

    Args:
        class_def: Dict holding the body of a class definition.
        indent: Number of spaces placed before each top-level key.

    Returns:
        The formatted text, without a trailing newline.
    """
    pad = ' ' * indent
    lines = []
    for key, value in class_def.items():
        rendered = _format_value(value, indent + 2)
        if isinstance(value, (dict, list)) and value:
            lines.append(f"{pad}{key}:")
            lines.append(rendered)
        else:
            lines.append(f"{pad}{key}: {rendered}")
    return '\n'.join(lines)


def _summarize_description(class_def, limit=60):
    """Return a one-line description of at most ``limit`` characters."""
    # Collapse all whitespace (including newlines) first so the text is safe
    # to place inside a single-line "#" comment, then truncate.
    text = ' '.join(str(get_class_description(class_def)).split())
    if len(text) > limit:
        text = text[:limit - 3] + "..."
    return text


def _format_import_lines(prefix, names):
    """Return sorted import entries, each preceded by a newline ('' if none)."""
    return ''.join(f"\n  - {prefix}{name}" for name in sorted(names))


def _indent_block(text, width=4):
    """Indent every non-blank line of ``text`` by ``width`` spaces."""
    pad = ' ' * width
    return '\n'.join(
        pad + line if line.strip() else ''
        for line in text.split('\n')
    )


def _render_class_file(class_name, class_def, class_body_yaml, all_class_names, date):
    """Build the full text of the module file for one class."""
    enum_refs = find_enum_references(class_def)
    class_refs = find_class_references(class_def, all_class_names)
    class_refs.discard(class_name)  # Remove self-reference

    # NOTE(review): these paths are written exactly as before. Whether they
    # resolve from inside modules/classes/ depends on how the LinkML loader
    # resolves relative imports - confirm against the loader in use.
    return CLASS_TEMPLATE.format(
        class_name=class_name,
        description=_summarize_description(class_def),
        date=date,
        enum_imports=_format_import_lines("./modules/enums/", enum_refs),
        class_imports=_format_import_lines("./modules/classes/", class_refs),
        class_content=_indent_block(class_body_yaml).rstrip(),
    )


def main():
    """Split every class of SOURCE_FILE into its own file under CLASSES_DIR.

    Classes that already have a ``<ClassName>.yaml`` file in CLASSES_DIR are
    skipped; all others are written. Progress and a summary are printed.
    """
    # Imported here because only this function needs PyYAML; the helper
    # functions above stay importable without it.
    import yaml

    print(f"Loading {SOURCE_FILE}...")
    with open(SOURCE_FILE, 'r', encoding='utf-8') as f:
        schema = yaml.safe_load(f) or {}

    classes = schema.get('classes') or {}
    print(f"Found {len(classes)} classes")

    # Track all class names for cross-reference detection
    all_class_names = set(classes)

    # Make sure the target directory exists before globbing/writing.
    CLASSES_DIR.mkdir(parents=True, exist_ok=True)

    # Check which classes already exist
    existing_classes = {path.stem for path in CLASSES_DIR.glob("*.yaml")}
    print(f"Found {len(existing_classes)} existing class files")

    today = datetime.now().strftime("%Y-%m-%d")

    # Extract each class
    created = 0
    skipped = 0

    for class_name, class_def in classes.items():
        if class_name in existing_classes:
            print(f"  SKIP: {class_name} (already exists)")
            skipped += 1
            continue

        # Serialize only the class body; an empty or missing body is left
        # blank so the class key is emitted with no content.
        if isinstance(class_def, dict) and class_def:
            class_body_yaml = yaml.dump(
                class_def,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )
        else:
            class_body_yaml = ''

        content = _render_class_file(
            class_name, class_def, class_body_yaml, all_class_names, today
        )

        # allow_unicode=True can emit non-ASCII text, so write UTF-8 explicitly.
        output_file = CLASSES_DIR / f"{class_name}.yaml"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"  CREATE: {class_name}")
        created += 1

    print("\nSummary:")
    print(f"  Created: {created}")
    print(f"  Skipped: {skipped}")
    print(f"  Total classes: {len(classes)}")


if __name__ == "__main__":
    main()