#!/usr/bin/env python3
"""
Extract classes from custodian_source.yaml to individual files in modules/classes/

This script:
1. Parses the monolithic custodian_source.yaml
2. Creates one class file per class in modules/classes/
3. Gives each class file the proper header and imports

Per Rule 38 in AGENTS.md - modular schema files
"""
|
|
|
|
import yaml
|
|
import os
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Filesystem layout of the LinkML schema tree this script reads from and
# writes to. Everything is derived from SCHEMA_DIR.
SCHEMA_DIR = Path("/Users/kempersc/apps/glam/schemas/20251121/linkml")
# Monolithic schema that gets split up.
SOURCE_FILE = SCHEMA_DIR.joinpath("custodian_source.yaml")
# Destination directory for the per-class files.
CLASSES_DIR = SCHEMA_DIR.joinpath("modules", "classes")
# Directory of the enum modules.
ENUMS_DIR = SCHEMA_DIR.joinpath("modules", "enums")
|
|
|
|
# Template for a single extracted class file (a stand-alone LinkML schema).
#
# Placeholders filled in by str.format():
#   class_name    - name of the class; also used for the schema id/name/title
#   description   - one-line summary placed in the header comment
#   date          - extraction date
#   enum_imports  - extra entries for the `imports:` list (may be empty);
#                   each entry must be indented like "  - linkml:types"
#   class_imports - same as enum_imports, for referenced classes
#   class_content - body of the class, already indented to sit underneath
#                   the two-space-indented "{class_name}:" key
#
# The nested keys are indented by two spaces so that the result is a valid
# YAML mapping (prefixes / imports / classes each own their children).
CLASS_TEMPLATE = """# {class_name} - {description}
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: {date}

id: https://nde.nl/ontology/hc/classes/{class_name}
name: {class_name}
title: {class_name}

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  xsd: http://www.w3.org/2001/XMLSchema#

imports:
  - linkml:types
{enum_imports}
{class_imports}
default_range: string

classes:
  {class_name}:
{class_content}
"""
|
|
|
|
def get_class_description(class_def):
    """Return the description of a class definition as a non-empty string.

    Args:
        class_def: The parsed YAML value of one class. Usually a dict, but
            may be anything (e.g. None for a class declared without a body).

    Returns:
        The class's 'description' value. Falls back to
        'No description available' when class_def is not a dict, or when the
        description is missing, None, or blank. A non-string description is
        converted with str() so callers can always use string methods on it.
    """
    fallback = 'No description available'
    if not isinstance(class_def, dict):
        return fallback

    description = class_def.get('description')
    if description is None:
        # Covers both a missing key and an explicit `description:` / null.
        return fallback
    if not isinstance(description, str):
        description = str(description)
    return description if description.strip() else fallback
|
|
|
|
def find_enum_references(class_def, known_enums=None):
    """Find the enums used as a `range` anywhere inside a class definition.

    Args:
        class_def: Parsed YAML value of one class (arbitrarily nested
            dicts/lists/scalars). Non-container values yield no references.
        known_enums: Optional iterable of enum names to look for. When
            omitted, the built-in list of enum names below is used.

    Returns:
        A set with the name of every known enum that appears as the string
        value of a 'range' key at any nesting depth.
    """
    if known_enums is None:
        known_enums = (
            'DataTierEnum', 'EnrichmentStatusEnum', 'InstitutionTypeCodeEnum',
            'IdentifierSchemeEnum', 'LocationResolutionMethodEnum',
            'GoogleMapsStatusEnum', 'WikidataListTypeEnum',
        )
    # Set membership is O(1); built once per call rather than per lookup.
    enum_names = frozenset(known_enums)
    found = set()

    def visit(node):
        if isinstance(node, dict):
            for key, value in node.items():
                # The isinstance guard keeps unhashable values (lists/dicts)
                # away from the set lookup; they are recursed into instead.
                if key == 'range' and isinstance(value, str) and value in enum_names:
                    found.add(value)
                else:
                    visit(value)
        elif isinstance(node, list):
            for item in node:
                visit(item)

    visit(class_def)
    return found
|
|
|
|
def find_class_references(class_def, all_classes):
    """Find the other classes that a class definition depends on.

    A class is considered referenced when its name appears
      * as the string value of a 'range' key,
      * as the string value of an 'is_a' key (parent class), or
      * inside a 'mixins' value (string or list of strings),
    at any nesting depth. Only names contained in all_classes are reported,
    so slot-level 'is_a' values that name slots are ignored unless they
    happen to coincide with a class name.

    Args:
        class_def: Parsed YAML value of one class (nested dicts/lists).
        all_classes: Container of every known class name; used for
            membership tests only.

    Returns:
        A set of referenced class names. The class's own name is included if
        it references itself; callers that need to exclude it must do so.
    """
    refs = set()

    def visit(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key in ('range', 'is_a') and isinstance(value, str):
                    if value in all_classes:
                        refs.add(value)
                elif key == 'mixins' and isinstance(value, (str, list)):
                    names = [value] if isinstance(value, str) else value
                    refs.update(
                        name for name in names
                        if isinstance(name, str) and name in all_classes
                    )
                else:
                    visit(value)
        elif isinstance(node, list):
            for item in node:
                visit(item)

    visit(class_def)
    return refs
|
|
|
|
def format_class_content(class_def, indent=4):
    """Render a class definition as indented, block-style YAML text.

    This is a small hand-written emitter, not a general YAML serializer:
    plain strings are written verbatim (no quoting or escaping), and a list
    nested directly inside another list is written using Python's str().

    Formatting rules:
      * mapping keys are indented by `indent` spaces, each nesting level
        adds two more;
      * empty containers are written inline as `{}` / `[]`;
      * list items are written as `- item`; a mapping item keeps its nested
        structure, with its first key on the `- ` line;
      * booleans become `true`/`false`, None becomes `null`, the empty
        string becomes `""`;
      * multi-line strings become `|` block scalars (surrounding whitespace
        is stripped first).

    Args:
        class_def: Mapping of class keys to values (nested dicts, lists and
            scalars).
        indent: Number of spaces in front of the top-level keys.

    Returns:
        The rendered text without a trailing newline; an empty string when
        class_def is empty or not a dict.
    """
    if not isinstance(class_def, dict) or not class_def:
        return ''

    def format_scalar(value, current_indent):
        # bool must be tested explicitly so True/False are not written in
        # their Python spelling.
        if isinstance(value, bool):
            return str(value).lower()
        if value is None:
            return "null"
        if isinstance(value, str) and '\n' in value:
            pad = ' ' * (current_indent + 2)
            return "|\n" + pad + f"\n{pad}".join(value.strip().split('\n'))
        return str(value) if value != '' else '""'

    def format_value(value, current_indent):
        pad = ' ' * current_indent

        if isinstance(value, dict):
            if not value:
                return "{}"
            out = []
            for k, v in value.items():
                rendered = format_value(v, current_indent + 2)
                if isinstance(v, (dict, list)) and v:
                    # Non-empty container: key on its own line, body below.
                    out.append(f"{pad}{k}:")
                    out.append(rendered)
                else:
                    # Scalars and empty containers stay on the key's line.
                    out.append(f"{pad}{k}: {rendered}")
            return '\n'.join(out)

        if isinstance(value, list):
            if not value:
                return "[]"
            out = []
            for item in value:
                if isinstance(item, dict) and item:
                    # Render the mapping two columns deeper, then swap the
                    # first line's indentation for the "- " marker. The
                    # remaining lines already line up with the first key,
                    # so nested structure is preserved.
                    block = format_value(item, current_indent + 2)
                    out.append(f"{pad}- {block[current_indent + 2:]}")
                elif isinstance(item, list):
                    out.append(f"{pad}- {item}")
                else:
                    out.append(f"{pad}- {format_value(item, current_indent + 2)}")
            return '\n'.join(out)

        return format_scalar(value, current_indent)

    return format_value(class_def, indent)
|
|
|
|
def main():
    """Split every class of SOURCE_FILE into its own file under CLASSES_DIR.

    For each entry under the schema's top-level `classes` key that does not
    already have a `<ClassName>.yaml` file in CLASSES_DIR, this:
      1. collects the enums and classes it references,
      2. dumps the class body with yaml.dump,
      3. fills CLASS_TEMPLATE and writes the result to CLASSES_DIR.
    Existing files are never overwritten. Progress and a summary are printed
    to stdout.
    """
    print(f"Loading {SOURCE_FILE}...")
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding.
    with open(SOURCE_FILE, 'r', encoding='utf-8') as f:
        # An empty file loads as None; treat it as an empty schema.
        schema = yaml.safe_load(f) or {}

    # `classes:` with no value also loads as None.
    classes = schema.get('classes') or {}
    print(f"Found {len(classes)} classes")

    # Track all class names for cross-reference detection
    all_class_names = set(classes.keys())

    # Check which classes already exist (creating the directory first so a
    # fresh checkout does not fail when the first file is written).
    CLASSES_DIR.mkdir(parents=True, exist_ok=True)
    existing_classes = {path.stem for path in CLASSES_DIR.glob("*.yaml")}
    print(f"Found {len(existing_classes)} existing class files")

    # One date for the whole run keeps all headers consistent.
    extraction_date = datetime.now().strftime("%Y-%m-%d")

    # Extract each class
    created = 0
    skipped = 0

    for class_name, class_def in classes.items():
        # Check if already exists
        if class_name in existing_classes:
            print(f" SKIP: {class_name} (already exists)")
            skipped += 1
            continue

        # Find dependencies
        enum_refs = find_enum_references(class_def)
        class_refs = find_class_references(class_def, all_class_names)
        class_refs.discard(class_name)  # Remove self-reference

        # Build imports. Each entry is indented by two spaces so it lines up
        # with the "  - linkml:types" entry in CLASS_TEMPLATE; the leading
        # newline only adds a blank line between import groups.
        # NOTE(review): these paths are relative to the schema root, but the
        # files are written into modules/classes/ - confirm that the import
        # resolution in use expects this form.
        enum_imports = ""
        if enum_refs:
            enum_imports = "\n".join(f"  - ./modules/enums/{e}" for e in sorted(enum_refs))
            enum_imports = f"\n{enum_imports}"

        class_imports = ""
        if class_refs:
            class_imports = "\n".join(f"  - ./modules/classes/{c}" for c in sorted(class_refs))
            class_imports = f"\n{class_imports}"

        # Get description (guaranteed to be a string before string methods
        # are used on it), shortened for the one-line header comment.
        description = get_class_description(class_def)
        if not isinstance(description, str):
            description = 'No description available'
        if len(description) > 60:
            description = description[:57] + "..."

        # Format class content using yaml.dump for accuracy
        class_yaml = yaml.dump({class_name: class_def}, default_flow_style=False, allow_unicode=True, sort_keys=False)
        # Remove the class name line since template includes it
        class_lines = class_yaml.split('\n')
        # yaml.dump already indents the body by 2; two more spaces give the
        # 4-space indentation needed underneath "  {class_name}:".
        class_content = '\n'.join('  ' + line if line.strip() else '' for line in class_lines[1:])

        # Create file content
        content = CLASS_TEMPLATE.format(
            class_name=class_name,
            description=description.replace('\n', ' ').strip(),
            date=extraction_date,
            enum_imports=enum_imports,
            class_imports=class_imports,
            class_content=class_content.rstrip(),
        )

        # Write file (UTF-8 because the dump above keeps non-ASCII text as-is)
        output_file = CLASSES_DIR / f"{class_name}.yaml"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f" CREATE: {class_name}")
        created += 1

    print("\nSummary:")
    print(f" Created: {created}")
    print(f" Skipped: {skipped}")
    print(f" Total classes: {len(classes)}")


if __name__ == "__main__":
    main()
|