glam/scripts/extract_custodian_source_classes.py
2026-01-08 15:56:28 +01:00

230 lines
7.7 KiB
Python

#!/usr/bin/env python3
"""
Extract classes from custodian_source.yaml to individual files in modules/classes/
This script:
1. Parses the monolithic custodian_source.yaml
2. Creates individual class files in modules/classes/
3. Each class file includes proper headers and imports
Per Rule 38 in AGENTS.md - modular schema files
"""
import yaml
import os
from pathlib import Path
from datetime import datetime
# Paths
# NOTE(review): SCHEMA_DIR is an absolute path inside one user's home
# directory, so the script only runs unmodified where that directory exists.
SCHEMA_DIR = Path("/Users/kempersc/apps/glam/schemas/20251121/linkml")
# Monolithic schema whose ``classes`` mapping main() splits into separate files.
SOURCE_FILE = SCHEMA_DIR / "custodian_source.yaml"
# Output directory: main() writes one ``<ClassName>.yaml`` file here per class.
CLASSES_DIR = SCHEMA_DIR / "modules" / "classes"
# Directory of the enum modules; not referenced by the code visible in this file.
ENUMS_DIR = SCHEMA_DIR / "modules" / "enums"
# Template for class file
# main() fills it with str.format(); the placeholders are class_name,
# description, date, enum_imports, class_imports and class_content.
# The first three lines of the literal are YAML comments that form the file header.
# NOTE(review): the whitespace inside this literal is significant YAML. As
# written here the entries under ``prefixes:`` and the ``{class_name}:`` key
# under ``classes:`` start at column 0, which makes them top-level keys rather
# than children -- confirm the intended nesting before relying on the output.
CLASS_TEMPLATE = """# {class_name} - {description}
# Extracted from custodian_source.yaml per Rule 38 (modular schema files)
# Extraction date: {date}
id: https://nde.nl/ontology/hc/classes/{class_name}
name: {class_name}
title: {class_name}
prefixes:
linkml: https://w3id.org/linkml/
hc: https://nde.nl/ontology/hc/
schema: http://schema.org/
prov: http://www.w3.org/ns/prov#
xsd: http://www.w3.org/2001/XMLSchema#
imports:
- linkml:types
{enum_imports}
{class_imports}
default_range: string
classes:
{class_name}:
{class_content}
"""
def get_class_description(class_def):
    """Return the ``description`` entry of a class definition.

    Anything that is not a dict, and any dict without a ``description`` key,
    yields the placeholder text instead. A key that is present is returned
    as stored, whatever its value.
    """
    placeholder = 'No description available'
    if not isinstance(class_def, dict):
        return placeholder
    return class_def.get('description', placeholder)
def find_enum_references(class_def, known_enums=None):
    """Find enums referenced by this class.

    Walks the (arbitrarily nested) dict/list structure of *class_def* and
    collects every string stored under a ``range`` key that names a known enum.

    Args:
        class_def: Parsed class definition; any non-container value simply
            yields an empty result.
        known_enums: Optional container of enum names to look for. When
            omitted, the enum names this script has always recognised are used.

    Returns:
        A set with the names of the referenced enums.
    """
    if known_enums is None:
        known_enums = (
            'DataTierEnum', 'EnrichmentStatusEnum', 'InstitutionTypeCodeEnum',
            'IdentifierSchemeEnum', 'LocationResolutionMethodEnum',
            'GoogleMapsStatusEnum', 'WikidataListTypeEnum',
        )
    enums = set()

    def search(node):
        if isinstance(node, dict):
            for key, value in node.items():
                # Only strings can name an enum; the isinstance guard also keeps
                # unhashable values away from set/frozenset containers.
                if key == 'range' and isinstance(value, str) and value in known_enums:
                    enums.add(value)
                else:
                    search(value)
        elif isinstance(node, list):
            for item in node:
                search(item)

    search(class_def)
    return enums
def find_class_references(class_def, all_classes):
    """Collect the names of other classes that this class points at.

    A reference is any string stored under a ``range`` key, at any nesting
    depth of dicts and lists, that is also a member of *all_classes*.
    Returns the matching names as a set.
    """
    matches = set()
    # Explicit work list instead of recursion; the visiting order does not
    # matter because the result is a set.
    pending = [class_def]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            for name, child in node.items():
                is_known_range = (
                    name == 'range'
                    and isinstance(child, str)
                    and child in all_classes
                )
                if is_known_range:
                    matches.add(child)
                else:
                    pending.append(child)
        elif isinstance(node, list):
            pending.extend(node)
    return matches
def format_class_content(class_def, indent=4):
    """Format class definition with proper indentation.

    Renders *class_def* as block-style YAML text. Top-level keys start at
    *indent* spaces and every nesting level adds two more.

    Compared with a naive key/value dump this takes care of:
      * empty containers, emitted inline as ``[]`` / ``{}``;
      * mappings and lists nested inside list items, which keep their own
        relative indentation;
      * scalars inside lists, formatted like any other scalar (``true``,
        ``null``, block scalars for multi-line text);
      * single-line strings that YAML would misread (containing ``: `` or
        `` #``, starting with an indicator character, reserved words such as
        ``yes``/``null``, numeric-looking text), which are double-quoted.
        This covers the common ambiguous cases, not the full YAML grammar.

    Args:
        class_def: Mapping to render. Anything that is not a non-empty dict
            produces an empty string.
        indent: Number of spaces in front of the top-level keys.

    Returns:
        The rendered lines joined with newlines, without a trailing newline.
    """
    import json  # local import: only needed to quote ambiguous strings

    reserved_words = {'~', 'null', 'true', 'false', 'yes', 'no', 'on', 'off'}

    def pad(width):
        return ' ' * width

    def is_block(value):
        # Non-empty containers are written as an indented block below their
        # key; everything else fits on the key's own line.
        return isinstance(value, (dict, list)) and bool(value)

    def needs_quotes(text):
        if text != text.strip():
            return True
        if text[0] in "!&*[]{}|>@`%#'\",":
            return True
        if text[0] in "-?:" and (len(text) == 1 or text[1] == ' '):
            return True
        if ': ' in text or ' #' in text or text.endswith(':'):
            return True
        if text.lower() in reserved_words:
            return True
        try:
            float(text)
        except ValueError:
            return False
        # Numeric-looking text must be quoted to stay a string.
        return True

    def format_scalar(value, level):
        if isinstance(value, dict):
            return "{}"
        if isinstance(value, list):
            return "[]"
        if isinstance(value, bool):
            return str(value).lower()
        if value is None:
            return "null"
        if not isinstance(value, str):
            return str(value)
        if '\n' in value:
            # Literal block scalar; its body sits two columns beyond *level*.
            body_pad = pad(level + 2)
            body = "\n".join(body_pad + line for line in value.strip().split('\n'))
            return "|\n" + body
        if value == '' or needs_quotes(value):
            # A JSON string literal is also a valid YAML double-quoted scalar.
            return json.dumps(value, ensure_ascii=False)
        return value

    def emit_mapping(mapping, level):
        lines = []
        for key, value in mapping.items():
            if is_block(value):
                lines.append(f"{pad(level)}{key}:")
                lines.extend(emit_block(value, level + 2))
            else:
                lines.append(f"{pad(level)}{key}: {format_scalar(value, level + 2)}")
        return lines

    def emit_sequence(items, level):
        lines = []
        for item in items:
            if is_block(item):
                nested = emit_block(item, level + 2)
                # The first nested line shares the row with the dash; the rest
                # are already aligned two columns to the right of it.
                lines.append(f"{pad(level)}- {nested[0].lstrip()}")
                lines.extend(nested[1:])
            else:
                lines.append(f"{pad(level)}- {format_scalar(item, level)}")
        return lines

    def emit_block(value, level):
        if isinstance(value, dict):
            return emit_mapping(value, level)
        return emit_sequence(value, level)

    if not isinstance(class_def, dict) or not class_def:
        return ''
    return '\n'.join(emit_mapping(class_def, indent))
def _short_description(class_def, limit=60):
    """Return a one-line summary of the class description for the file header."""
    description = get_class_description(class_def)
    # ``description: null`` (or any non-string) would otherwise crash on .replace().
    if not isinstance(description, str):
        description = 'No description available'
    if len(description) > limit:
        description = description[:limit - 3] + "..."
    # The summary is placed on a ``#`` comment line, so it must stay on one line.
    return description.replace('\n', ' ').strip()


def _import_block(directory, names):
    """Return the extra ``imports:`` entries for *names*, or '' when there are none."""
    if not names:
        return ""
    # NOTE(review): the generated files live in modules/classes/ themselves, so
    # confirm that the import resolver treats './modules/...' as relative to the
    # schema root and not to the importing file. The leading whitespace of each
    # entry must also match the ``- linkml:types`` line in CLASS_TEMPLATE.
    entries = "\n".join(f" - ./modules/{directory}/{name}" for name in sorted(names))
    return f"\n{entries}"


def _class_body(class_name, class_def):
    """Dump *class_def* as YAML indented to sit below its class-name key."""
    dumped = yaml.dump(
        {class_name: class_def},
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    # The first line is the ``<class_name>:`` key, which CLASS_TEMPLATE already provides.
    body_lines = dumped.split('\n')[1:]
    # Indent properly (4 spaces for class content); blank lines stay empty.
    indented = '\n'.join('    ' + line if line.strip() else '' for line in body_lines)
    return indented.rstrip()


def main():
    """Split the monolithic schema into one file per class.

    Loads SOURCE_FILE, then for every entry of its ``classes`` mapping that
    has no ``<ClassName>.yaml`` in CLASSES_DIR yet, renders CLASS_TEMPLATE and
    writes the result there. Existing files are never overwritten. Progress
    and a final summary are printed to stdout.
    """
    print(f"Loading {SOURCE_FILE}...")
    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(SOURCE_FILE, 'r', encoding='utf-8') as f:
        schema = yaml.safe_load(f)
    # An empty file and a bare ``classes:`` key both load as None.
    classes = (schema or {}).get('classes') or {}
    print(f"Found {len(classes)} classes")
    # Track all class names for cross-reference detection
    all_class_names = set(classes)
    # Check which classes already exist (creating the output directory if needed)
    CLASSES_DIR.mkdir(parents=True, exist_ok=True)
    existing_classes = {path.stem for path in CLASSES_DIR.glob("*.yaml")}
    print(f"Found {len(existing_classes)} existing class files")
    # Extract each class
    created = 0
    skipped = 0
    for class_name, class_def in classes.items():
        if class_name in existing_classes:
            print(f" SKIP: {class_name} (already exists)")
            skipped += 1
            continue
        # Find dependencies
        enum_refs = find_enum_references(class_def)
        class_refs = find_class_references(class_def, all_class_names)
        class_refs.discard(class_name)  # Remove self-reference
        content = CLASS_TEMPLATE.format(
            class_name=class_name,
            description=_short_description(class_def),
            date=datetime.now().strftime("%Y-%m-%d"),
            enum_imports=_import_block("enums", enum_refs),
            class_imports=_import_block("classes", class_refs),
            class_content=_class_body(class_name, class_def),
        )
        output_file = CLASSES_DIR / f"{class_name}.yaml"
        # yaml.dump(allow_unicode=True) can emit non-ASCII text, so write UTF-8 explicitly.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f" CREATE: {class_name}")
        created += 1
    print("\nSummary:")
    print(f" Created: {created}")
    print(f" Skipped: {skipped}")
    print(f" Total classes: {len(classes)}")


if __name__ == "__main__":
    main()