glam/scripts/generate_mermaid_with_instances.py
kempersc 48a2b26f59 feat: Add script to generate Mermaid ER diagrams with instance data from LinkML schemas
- Implemented `generate_mermaid_with_instances.py` to create ER diagrams that include all classes, relationships, enum values, and instance data.
- Loaded instance data from YAML files and enriched enum definitions with meaningful annotations.
- Configured output paths for generated diagrams in both frontend and schema directories.
- Added support for excluding technical classes and limiting the number of displayed enum and instance values for readability.
2025-12-01 16:58:03 +01:00

291 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Generate Mermaid ER diagrams with instance data from LinkML schemas.

This script extends the standard Mermaid generation to include:

1. All classes and their relationships
2. Enum values (from the LinkML schema)
3. Instance data (from instances/enums/*.yaml) as annotations

The instance data provides semantically meaningful "allowed values" for
CustodianType classes like MuseumType, LibraryType, HeritageSocietyType, etc.

Usage (all paths are relative to the current working directory):
    python3 scripts/generate_mermaid_with_instances.py

Output:
    frontend/public/data/heritage_custodian_ontology.mmd
    schemas/20251121/uml/mermaid/complete_schema_with_instances_YYYYMMDD_HHMMSS.mmd
"""
import sys
import yaml
from pathlib import Path
from datetime import datetime
from linkml_runtime.utils.schemaview import SchemaView
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Relative locations of the schema, the instance YAML files and the outputs.
SCHEMA_PATH = "schemas/20251121/linkml/01_custodian_name_modular.yaml"
INSTANCES_DIR = "schemas/20251121/linkml/instances/enums"
OUTPUT_DIR = "schemas/20251121/uml/mermaid"
FRONTEND_OUTPUT = "frontend/public/data/heritage_custodian_ontology.mmd"

# Classes left out of the diagrams (technical artifacts with no semantic
# significance). "Container" is the LinkML tree_root used for validation only.
EXCLUDED_CLASSES = {"Container"}

# Readability caps: how many values are listed per enum entity before the
# remainder is summarised, for schema enum values and instance values.
MAX_ENUM_VALUES_IN_DIAGRAM = 10
MAX_INSTANCE_VALUES = 15

# Enum name -> name of the instance file inside INSTANCES_DIR.
ENUM_INSTANCE_FILES = dict(
    CustodianPrimaryTypeEnum="custodian_primary_type.yaml",
    AppellationTypeEnum="appellation_type.yaml",
    OrganizationalChangeEventTypeEnum="organizational_change_event_type.yaml",
    StaffRoleTypeEnum="staff_role_type.yaml",
    OrganizationalUnitTypeEnum="organizational_unit_type.yaml",
    LegalStatusEnum="legal_status_type.yaml",
    PlaceSpecificityEnum="place_specificity.yaml",
    EncompassingBodyTypeEnum="encompassing_body_type.yaml",
    AuxiliaryDigitalPlatformTypeEnum="auxiliary_digital_platform_type.yaml",
    AgentTypeEnum="agent_type.yaml",
    EntityTypeEnum="entity_type.yaml",
    SourceDocumentTypeEnum="source_document_type.yaml",
    ReconstructionActivityTypeEnum="reconstruction_activity_type.yaml",
    WebPortalTypeEnum="web_portal_type.yaml",
    SocialMediaPlatformTypeEnum="social_media_platform_type.yaml",
    RecordsLifecycleStageEnum="records_lifecycle_stage.yaml",
    ArchiveProcessingStatusEnum="archive_processing_status.yaml",
    StorageTypeEnum="storage_type.yaml",
    DigitalPresenceTypeEnum="digital_presence_type.yaml",
    FeatureTypeEnum="feature_type.yaml",
    ProjectStatusEnum="project_status.yaml",
    FinancialStatementTypeEnum="financial_statement_type.yaml",
    StorageConditionStatusEnum="storage_condition_status.yaml",
    AuxiliaryPlaceTypeEnum="auxiliary_place_type.yaml",
    GiftShopTypeEnum="gift_shop_type.yaml",
    FundingRequirementTypeEnum="funding_requirement_type.yaml",
    OrganizationBranchTypeEnum="organization_branch_type.yaml",
)
def load_instance_data(instances_dir: Path) -> dict:
    """Read the instance YAML file of every enum listed in ENUM_INSTANCE_FILES.

    Files that do not exist are skipped silently, as are files without an
    'instances' key. Any error while reading or interpreting a file is
    reported on stderr as a warning and that file is skipped.

    Args:
        instances_dir: Directory that holds the instance YAML files.

    Returns:
        Mapping of enum name to a dict with 'name', 'description' and
        'values'; each entry of 'values' is a dict with the keys 'value',
        'code', 'label' (English skos:prefLabel) and 'wikidata' (entity id).
    """
    loaded = {}

    for enum_name, filename in ENUM_INSTANCE_FILES.items():
        source = instances_dir / filename
        if not source.exists():
            continue

        try:
            with open(source, 'r', encoding='utf-8') as handle:
                document = yaml.safe_load(handle)

            if not document or 'instances' not in document:
                continue

            entries = []
            for record in document['instances']:
                # Labels and Wikidata info are only usable when given as mappings.
                labels = record.get('skos:prefLabel', {})
                wikidata = record.get('wikidata', {})
                entries.append({
                    'value': record.get('value', ''),
                    'code': record.get('code', ''),
                    'label': labels.get('en', '') if isinstance(labels, dict) else '',
                    'wikidata': wikidata.get('entity', '') if isinstance(wikidata, dict) else '',
                })

            loaded[enum_name] = {
                'name': document.get('name', enum_name),
                'description': document.get('description', ''),
                'values': entries,
            }
            print(f" ✓ Loaded {len(entries)} instances from {filename}", file=sys.stderr)
        except Exception as exc:
            # Best effort: one unreadable file must not stop the others.
            print(f" ⚠ Warning: Could not load {filename}: {exc}", file=sys.stderr)

    return loaded
def generate_mermaid_with_instances(sv: SchemaView, instance_data: dict, include_enums: bool = True) -> str:
    """
    Generate Mermaid ER diagram with instance data annotations.

    The result is a Markdown-fenced ``erDiagram`` with three sections: one
    entity per schema class (listing its induced slots), one entity per enum
    that has permissible values, and the relationships between them.

    Args:
        sv: Schema view that supplies the classes, enums and induced slots.
        instance_data: Mapping of enum name to loaded instance info. Only the
            'values' list is read here; each entry needs a 'value' key and
            may carry a 'code'.
        include_enums: When False, enum entities and class-to-enum
            relationships are left out.

    Returns:
        The diagram text, with lines joined by newlines.
    """
    lines = [
        "```mermaid",
        "erDiagram",
        "",
        " %% Heritage Custodian Ontology - Complete Schema with Instance Data",
        f" %% Generated: {datetime.now().isoformat()}",
        f" %% Schema: {sv.schema.name}",
        "",
    ]

    # Lists keep the schema order for output; sets give O(1) range checks in
    # the relationship loop below.
    all_classes = [c for c in sv.all_classes() if c not in EXCLUDED_CLASSES]
    all_enums = list(sv.all_enums()) if include_enums else []
    class_names = set(all_classes)
    enum_names = set(all_enums)

    # Class entities: one block per class listing every induced slot.
    for class_name in all_classes:
        lines.append(f"{class_name} {{")
        for slot_name in sv.class_slots(class_name):
            slot = sv.induced_slot(slot_name, class_name)
            if not slot:
                continue
            slot_range = slot.range if slot.range else "string"
            # Slots pointing at an excluded class are dropped from the entity.
            if slot_range in EXCLUDED_CLASSES:
                continue
            # Format: type attribute_name; "PK" is used to flag required slots.
            multivalued_marker = "List" if slot.multivalued else ""
            required_marker = " PK" if slot.required else ""
            lines.append(f" {slot_range}{multivalued_marker} {slot_name}{required_marker}")
        lines.append("}")

    # Enum entities, enriched with instance data where it was loaded.
    if include_enums and all_enums:
        lines.append("")
        lines.append(" %% Enumerations with Instance Data")
        for enum_name in all_enums:
            enum_def = sv.get_enum(enum_name)
            if not (enum_def and enum_def.permissible_values):
                continue
            lines.append(f"{enum_name} {{")
            lines.append(" string enum_type PK")

            if enum_name in instance_data:
                # Instance values; the code is appended to the name because
                # it cannot be attached as a comment inside an entity.
                limit = MAX_INSTANCE_VALUES
                values = instance_data[enum_name]['values']
                shown = [
                    f"{val_info['value']}_{val_info['code']}"
                    if val_info.get('code', '')
                    else val_info['value']
                    for val_info in values[:limit]
                ]
            else:
                # Fall back to the permissible values declared in the schema.
                limit = MAX_ENUM_VALUES_IN_DIAGRAM
                values = list(enum_def.permissible_values.keys())
                shown = values[:limit]

            lines.extend(f" string {name}" for name in shown)
            if len(values) > limit:
                remaining = len(values) - limit
                lines.append(f" string _and_{remaining}_more")
            lines.append("}")

    lines.append("")

    # Relationships: inheritance first, then one edge per class/enum-ranged slot.
    for class_name in all_classes:
        cls = sv.get_class(class_name)
        if cls.is_a and cls.is_a not in EXCLUDED_CLASSES:
            lines.append(f'{class_name} ||--|| {cls.is_a} : "inherits"')

        for slot_name in sv.class_slots(class_name):
            slot = sv.induced_slot(slot_name, class_name)
            if not (slot and slot.range):
                continue
            if slot.range in class_names:
                # Cardinality reflects multivalued (many) and required (at least one).
                if slot.multivalued:
                    cardinality = "||--}|" if slot.required else "||--}o"
                else:
                    cardinality = "||--||" if slot.required else "||--|o"
                lines.append(f'{class_name} {cardinality} {slot.range} : "{slot_name}"')
            elif include_enums and slot.range in enum_names:
                cardinality = "||--}o" if slot.multivalued else "||--|o"
                lines.append(f'{class_name} {cardinality} {slot.range} : "{slot_name}"')

    lines.append("")
    lines.append("```")
    lines.append("")
    return '\n'.join(lines)
def main():
    """Main entry point.

    Loads the schema at SCHEMA_PATH and the instance files in INSTANCES_DIR,
    generates the Mermaid diagram and writes it twice: to a timestamped file
    in OUTPUT_DIR and to FRONTEND_OUTPUT (overwritten on every run). Progress
    messages go to stderr. All paths are relative to the working directory.
    """
    banner = "=" * 60
    print(banner, file=sys.stderr)
    print("Mermaid ER Diagram Generator with Instance Data", file=sys.stderr)
    print(banner, file=sys.stderr)

    # Load schema
    print(f"\nLoading schema: {SCHEMA_PATH}", file=sys.stderr)
    sv = SchemaView(SCHEMA_PATH)
    print(f"✓ Loaded schema: {sv.schema.name}", file=sys.stderr)
    print(f" Classes: {len(list(sv.all_classes()))}", file=sys.stderr)
    print(f" Enums: {len(list(sv.all_enums()))}", file=sys.stderr)

    # Load instance data
    instances_dir = Path(INSTANCES_DIR)
    print(f"\nLoading instance data from: {instances_dir}", file=sys.stderr)
    instance_data = load_instance_data(instances_dir)
    print(f"✓ Loaded {len(instance_data)} enum instance files", file=sys.stderr)

    # Generate Mermaid
    print("\nGenerating Mermaid ER diagram...", file=sys.stderr)
    mermaid = generate_mermaid_with_instances(sv, instance_data)

    # Generate timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Ensure output directories exist
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    frontend_path = Path(FRONTEND_OUTPUT)
    frontend_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to schemas directory (timestamped). The encoding is explicit so
    # that non-ASCII text in the diagram does not depend on the platform's
    # default encoding; the instance files are read as UTF-8 as well.
    schema_output = output_dir / f"complete_schema_with_instances_{timestamp}.mmd"
    schema_output.write_text(mermaid, encoding='utf-8')
    print(f"\n✓ Generated: {schema_output}", file=sys.stderr)
    # Report the encoded size: len(mermaid) would count characters, not bytes.
    print(f" Size: {len(mermaid.encode('utf-8'))} bytes", file=sys.stderr)

    # Write to frontend directory (overwrite)
    frontend_path.write_text(mermaid, encoding='utf-8')
    print(f"✓ Updated frontend: {frontend_path}", file=sys.stderr)

    print("\n" + banner, file=sys.stderr)
    print("Done! The UML diagram now includes instance data.", file=sys.stderr)
    print(banner, file=sys.stderr)
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()