- Implemented `generate_mermaid_with_instances.py` to create ER diagrams that include all classes, relationships, enum values, and instance data. - Loaded instance data from YAML files and enriched enum definitions with meaningful annotations. - Configured output paths for generated diagrams in both frontend and schema directories. - Added support for excluding technical classes and limiting the number of displayed enum and instance values for readability.
291 lines
12 KiB
Python
291 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate Mermaid ER diagrams with instance data from LinkML schemas.
|
|
|
|
This script extends the standard Mermaid generation to include:
|
|
1. All classes and their relationships
|
|
2. Enum values (from LinkML schema)
|
|
3. Instance data (from instances/enums/*.yaml) as annotations
|
|
|
|
The instance data provides semantically meaningful "allowed values" for
|
|
CustodianType classes like MuseumType, LibraryType, HeritageSocietyType, etc.
|
|
|
|
Usage:
|
|
python3 scripts/generate_mermaid_with_instances.py
|
|
|
|
Output:
|
|
frontend/public/data/heritage_custodian_ontology.mmd
|
|
schemas/20251121/uml/mermaid/complete_schema_with_instances_YYYYMMDD_HHMMSS.mmd
|
|
"""
|
|
import sys
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from linkml_runtime.utils.schemaview import SchemaView
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Inputs: the LinkML schema entry point and the directory of enum instance files.
SCHEMA_PATH = "schemas/20251121/linkml/01_custodian_name_modular.yaml"
INSTANCES_DIR = "schemas/20251121/linkml/instances/enums"

# Outputs: directory for timestamped diagrams, and the single file that is
# overwritten for the frontend.
OUTPUT_DIR = "schemas/20251121/uml/mermaid"
FRONTEND_OUTPUT = "frontend/public/data/heritage_custodian_ontology.mmd"

# Technical classes left out of the diagram because they carry no semantic
# meaning. "Container" is the LinkML tree_root: validation only, not ontology.
EXCLUDED_CLASSES = {"Container"}

# Readability caps: how many schema enum values / instance values are listed
# per enum entity before the remainder is summarised.
MAX_ENUM_VALUES_IN_DIAGRAM = 10
MAX_INSTANCE_VALUES = 15

# Enum name -> instance file name (looked up inside the instances directory).
ENUM_INSTANCE_FILES = {
    # Custodians, names and organisational structure
    "CustodianPrimaryTypeEnum": "custodian_primary_type.yaml",
    "AppellationTypeEnum": "appellation_type.yaml",
    "OrganizationalChangeEventTypeEnum": "organizational_change_event_type.yaml",
    "StaffRoleTypeEnum": "staff_role_type.yaml",
    "OrganizationalUnitTypeEnum": "organizational_unit_type.yaml",
    "LegalStatusEnum": "legal_status_type.yaml",
    "PlaceSpecificityEnum": "place_specificity.yaml",
    "EncompassingBodyTypeEnum": "encompassing_body_type.yaml",
    "AuxiliaryDigitalPlatformTypeEnum": "auxiliary_digital_platform_type.yaml",
    # Agents, entities and provenance
    "AgentTypeEnum": "agent_type.yaml",
    "EntityTypeEnum": "entity_type.yaml",
    "SourceDocumentTypeEnum": "source_document_type.yaml",
    "ReconstructionActivityTypeEnum": "reconstruction_activity_type.yaml",
    # Digital presence
    "WebPortalTypeEnum": "web_portal_type.yaml",
    "SocialMediaPlatformTypeEnum": "social_media_platform_type.yaml",
    # Records, storage and archives
    "RecordsLifecycleStageEnum": "records_lifecycle_stage.yaml",
    "ArchiveProcessingStatusEnum": "archive_processing_status.yaml",
    "StorageTypeEnum": "storage_type.yaml",
    "DigitalPresenceTypeEnum": "digital_presence_type.yaml",
    "FeatureTypeEnum": "feature_type.yaml",
    # Projects, finance and facilities
    "ProjectStatusEnum": "project_status.yaml",
    "FinancialStatementTypeEnum": "financial_statement_type.yaml",
    "StorageConditionStatusEnum": "storage_condition_status.yaml",
    "AuxiliaryPlaceTypeEnum": "auxiliary_place_type.yaml",
    "GiftShopTypeEnum": "gift_shop_type.yaml",
    "FundingRequirementTypeEnum": "funding_requirement_type.yaml",
    "OrganizationBranchTypeEnum": "organization_branch_type.yaml",
}
|
|
|
|
|
|
def _summarize_instance(instance: dict) -> dict:
    """Reduce one raw instance record to the four fields used in the diagram."""

    def _or_empty(raw):
        # An explicit YAML null must behave like a missing key; otherwise the
        # literal text "None" would later end up in the diagram.
        return '' if raw is None else raw

    pref_label = instance.get('skos:prefLabel')
    wikidata = instance.get('wikidata')
    return {
        'value': _or_empty(instance.get('value')),
        'code': _or_empty(instance.get('code')),
        # Only the English label is kept, and only when labels are a mapping.
        'label': _or_empty(pref_label.get('en')) if isinstance(pref_label, dict) else '',
        'wikidata': _or_empty(wikidata.get('entity')) if isinstance(wikidata, dict) else '',
    }


def load_instance_data(instances_dir: Path, enum_files=None) -> dict:
    """Load enum instance data from the YAML files in a directory.

    Loading is best-effort: a missing file is skipped silently, and a file
    that cannot be read or parsed is reported on stderr and skipped, so one
    bad file never prevents the others from loading.

    Args:
        instances_dir: Directory containing the instance files (``Path`` or
            ``str``).
        enum_files: Optional mapping of enum name to file name. Defaults to
            the module-level ``ENUM_INSTANCE_FILES``.

    Returns:
        A dict keyed by enum name. Each entry has ``name``, ``description``
        and ``values``; ``values`` is a list of dicts with the keys
        ``value``, ``code``, ``label`` and ``wikidata``.
    """
    if enum_files is None:
        enum_files = ENUM_INSTANCE_FILES
    instances_dir = Path(instances_dir)
    instance_data = {}

    for enum_name, filename in enum_files.items():
        filepath = instances_dir / filename

        # Keep the guarded region to the read/parse step so that only I/O and
        # syntax problems are downgraded to warnings.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except FileNotFoundError:
            # Not every enum is required to have an instance file.
            continue
        except (OSError, ValueError, yaml.YAMLError) as e:
            print(f" ⚠ Warning: Could not load {filename}: {e}", file=sys.stderr)
            continue

        if not isinstance(data, dict) or 'instances' not in data:
            continue

        raw_instances = data['instances']
        if not isinstance(raw_instances, list):
            print(f" ⚠ Warning: Could not load {filename}: 'instances' is not a list", file=sys.stderr)
            continue

        # A stray scalar in the list should cost that one entry, not the file.
        values = [_summarize_instance(item) for item in raw_instances if isinstance(item, dict)]
        skipped = len(raw_instances) - len(values)
        if skipped:
            print(f" ⚠ Warning: Skipped {skipped} malformed instance(s) in {filename}", file=sys.stderr)

        instance_data[enum_name] = {
            'name': data.get('name', enum_name),
            'description': data.get('description', ''),
            'values': values,
        }
        print(f" ✓ Loaded {len(values)} instances from {filename}", file=sys.stderr)

    return instance_data
|
|
|
|
|
|
def _induced_class_slots(sv, class_name):
    """Return ``(slot_name, induced_slot)`` pairs for a class, dropping slots that do not resolve."""
    pairs = []
    for slot_name in sv.class_slots(class_name):
        slot = sv.induced_slot(slot_name, class_name)
        if slot:
            pairs.append((slot_name, slot))
    return pairs


def _class_entity_lines(class_name, slots):
    """Build the entity block (header, one line per slot, closing brace) for a class."""
    block = [f"{class_name} {{"]
    for slot_name, slot in slots:
        slot_range = slot.range if slot.range else "string"
        if slot_range in EXCLUDED_CLASSES:
            continue
        # Line format: "<type>[List] <name>[ PK]"; required slots get the PK marker.
        multivalued_marker = "List" if slot.multivalued else ""
        required_marker = " PK" if slot.required else ""
        block.append(f" {slot_range}{multivalued_marker} {slot_name}{required_marker}")
    block.append("}")
    return block


def _enum_value_names(enum_name, enum_def, instance_data):
    """Return ``(names, display_limit)`` for an enum, preferring instance data over schema values."""
    usable = []
    for val_info in (instance_data.get(enum_name) or {}).get('values') or []:
        value = val_info.get('value')
        if value is None or value == '':
            # A record without a value would yield an attribute line with no name.
            continue
        code = val_info.get('code')
        # The code is folded into the name because entity bodies cannot carry comments.
        usable.append(f"{value}_{code}" if code else f"{value}")
    if usable:
        return usable, MAX_INSTANCE_VALUES
    # No usable instance data: fall back to the schema's permissible values.
    return list(enum_def.permissible_values.keys()), MAX_ENUM_VALUES_IN_DIAGRAM


def _enum_entity_lines(enum_name, names, limit):
    """Build the entity block for an enum, summarising values beyond ``limit``."""
    block = [f"{enum_name} {{", " string enum_type PK"]
    block.extend(f" string {name}" for name in names[:limit])
    if len(names) > limit:
        block.append(f" string _and_{len(names) - limit}_more")
    block.append("}")
    return block


def _relationship_lines(sv, class_name, slots, class_names, enum_names):
    """Build the inheritance and slot-range relationship lines for one class."""
    block = []
    cls = sv.get_class(class_name)
    if cls.is_a and cls.is_a not in EXCLUDED_CLASSES:
        block.append(f'{class_name} ||--|| {cls.is_a} : "inherits"')

    for slot_name, slot in slots:
        if not slot.range:
            continue
        if slot.range in class_names:
            if slot.multivalued:
                cardinality = "||--}|" if slot.required else "||--}o"
            else:
                cardinality = "||--||" if slot.required else "||--|o"
        elif slot.range in enum_names:
            cardinality = "||--}o" if slot.multivalued else "||--|o"
        else:
            # Ranges that are neither a diagrammed class nor an enum get no edge.
            continue
        block.append(f'{class_name} {cardinality} {slot.range} : "{slot_name}"')
    return block


def generate_mermaid_with_instances(sv: SchemaView, instance_data: dict, include_enums: bool = True) -> str:
    """Generate a Mermaid ER diagram annotated with enum instance data.

    The result is wrapped in a Markdown ``mermaid`` code fence and contains,
    in order: a comment header, one entity per class (classes listed in
    ``EXCLUDED_CLASSES`` are omitted), one entity per enum that has
    permissible values, and the relationship lines.

    Args:
        sv: Schema view supplying classes, enums and induced slots.
        instance_data: Mapping of enum name to a dict whose ``values`` list
            holds per-instance dicts (``value``, ``code``, ...). Enums with
            no usable entry fall back to the schema's permissible values.
        include_enums: When false, enum entities and class-to-enum
            relationships are left out.

    Returns:
        The diagram text, with lines joined by newlines.
    """
    all_classes = [c for c in sv.all_classes() if c not in EXCLUDED_CLASSES]
    all_enums = list(sv.all_enums()) if include_enums else []

    # Sets give constant-time range checks inside the relationship loop.
    class_names = set(all_classes)
    enum_names = set(all_enums)

    # Resolve induced slots once per class; both the entity pass and the
    # relationship pass need them.
    slots_by_class = {name: _induced_class_slots(sv, name) for name in all_classes}

    lines = [
        "```mermaid",
        "erDiagram",
        "",
        " %% Heritage Custodian Ontology - Complete Schema with Instance Data",
        f" %% Generated: {datetime.now().isoformat()}",
        f" %% Schema: {sv.schema.name}",
        "",
    ]

    for class_name in all_classes:
        lines.extend(_class_entity_lines(class_name, slots_by_class[class_name]))

    if all_enums:
        lines.append("")
        lines.append(" %% Enumerations with Instance Data")
        for enum_name in all_enums:
            enum_def = sv.get_enum(enum_name)
            if enum_def and enum_def.permissible_values:
                names, limit = _enum_value_names(enum_name, enum_def, instance_data)
                lines.extend(_enum_entity_lines(enum_name, names, limit))

    lines.append("")

    for class_name in all_classes:
        lines.extend(
            _relationship_lines(sv, class_name, slots_by_class[class_name], class_names, enum_names)
        )

    lines.extend(["", "```", ""])
    return '\n'.join(lines)
|
|
|
|
|
|
def main():
    """Load the schema and instance data, generate the diagram, and write it out.

    The diagram is written twice: to a timestamped file under ``OUTPUT_DIR``
    and to ``FRONTEND_OUTPUT``, which is overwritten on every run. Missing
    output directories are created. Progress is reported on stderr.
    """
    banner = "=" * 60
    print(banner, file=sys.stderr)
    print("Mermaid ER Diagram Generator with Instance Data", file=sys.stderr)
    print(banner, file=sys.stderr)

    # Load schema
    print(f"\nLoading schema: {SCHEMA_PATH}", file=sys.stderr)
    sv = SchemaView(SCHEMA_PATH)
    print(f"✓ Loaded schema: {sv.schema.name}", file=sys.stderr)
    print(f" Classes: {len(sv.all_classes())}", file=sys.stderr)
    print(f" Enums: {len(sv.all_enums())}", file=sys.stderr)

    # Load instance data
    instances_dir = Path(INSTANCES_DIR)
    print(f"\nLoading instance data from: {instances_dir}", file=sys.stderr)
    instance_data = load_instance_data(instances_dir)
    print(f"✓ Loaded {len(instance_data)} enum instance files", file=sys.stderr)

    # Generate Mermaid
    print("\nGenerating Mermaid ER diagram...", file=sys.stderr)
    mermaid = generate_mermaid_with_instances(sv, instance_data)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Ensure output directories exist
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    frontend_path = Path(FRONTEND_OUTPUT)
    frontend_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to schemas directory (timestamped). The encoding is pinned to
    # UTF-8 because the instance data is read as UTF-8 and may contain
    # non-ASCII text that the platform default encoding cannot represent.
    schema_output = output_dir / f"complete_schema_with_instances_{timestamp}.mmd"
    schema_output.write_text(mermaid, encoding="utf-8")
    print(f"\n✓ Generated: {schema_output}", file=sys.stderr)
    # Report the size on disk; len(mermaid) would count characters, not bytes.
    print(f" Size: {schema_output.stat().st_size} bytes", file=sys.stderr)

    # Write to frontend directory (overwrite)
    frontend_path.write_text(mermaid, encoding="utf-8")
    print(f"✓ Updated frontend: {frontend_path}", file=sys.stderr)

    print("\n" + banner, file=sys.stderr)
    print("Done! The UML diagram now includes instance data.", file=sys.stderr)
    print(banner, file=sys.stderr)
|
|
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|