#!/usr/bin/env python3
"""
Generate Mermaid ER diagrams with instance data from LinkML schemas.

This script extends the standard Mermaid generation to include:
1. All classes and their relationships
2. Enum values (from LinkML schema)
3. Instance data (from instances/enums/*.yaml) as annotations

The instance data provides semantically meaningful "allowed values" for
CustodianType classes like MuseumType, LibraryType, HeritageSocietyType, etc.

Usage:
    python3 scripts/generate_mermaid_with_instances.py

Output:
    frontend/public/data/heritage_custodian_ontology.mmd
    schemas/20251121/uml/mermaid/complete_schema_with_instances_YYYYMMDD_HHMMSS.mmd
"""

import sys
from datetime import datetime
from pathlib import Path

import yaml
from linkml_runtime.utils.schemaview import SchemaView

# Configuration (paths are relative to the current working directory)
SCHEMA_PATH = "schemas/20251121/linkml/01_custodian_name_modular.yaml"
INSTANCES_DIR = "schemas/20251121/linkml/instances/enums"
OUTPUT_DIR = "schemas/20251121/uml/mermaid"
FRONTEND_OUTPUT = "frontend/public/data/heritage_custodian_ontology.mmd"

# Classes to exclude from diagrams (technical artifacts with no semantic significance)
EXCLUDED_CLASSES = {
    "Container",  # LinkML tree_root for validation only, not part of ontology
}

# Maximum number of enum values to show in diagram (for readability)
MAX_ENUM_VALUES_IN_DIAGRAM = 10

# Maximum number of instance values to show (for readability)
MAX_INSTANCE_VALUES = 15

# Mapping from enum names to their instance files
ENUM_INSTANCE_FILES = {
    "CustodianPrimaryTypeEnum": "custodian_primary_type.yaml",
    "AppellationTypeEnum": "appellation_type.yaml",
    "OrganizationalChangeEventTypeEnum": "organizational_change_event_type.yaml",
    "StaffRoleTypeEnum": "staff_role_type.yaml",
    "OrganizationalUnitTypeEnum": "organizational_unit_type.yaml",
    "LegalStatusEnum": "legal_status_type.yaml",
    "PlaceSpecificityEnum": "place_specificity.yaml",
    "EncompassingBodyTypeEnum": "encompassing_body_type.yaml",
    "AuxiliaryDigitalPlatformTypeEnum": "auxiliary_digital_platform_type.yaml",
    "AgentTypeEnum": "agent_type.yaml",
    "EntityTypeEnum": "entity_type.yaml",
    "SourceDocumentTypeEnum": "source_document_type.yaml",
    "ReconstructionActivityTypeEnum": "reconstruction_activity_type.yaml",
    "WebPortalTypeEnum": "web_portal_type.yaml",
    "SocialMediaPlatformTypeEnum": "social_media_platform_type.yaml",
    "RecordsLifecycleStageEnum": "records_lifecycle_stage.yaml",
    "ArchiveProcessingStatusEnum": "archive_processing_status.yaml",
    "StorageTypeEnum": "storage_type.yaml",
    "DigitalPresenceTypeEnum": "digital_presence_type.yaml",
    "FeatureTypeEnum": "feature_type.yaml",
    "ProjectStatusEnum": "project_status.yaml",
    "FinancialStatementTypeEnum": "financial_statement_type.yaml",
    "StorageConditionStatusEnum": "storage_condition_status.yaml",
    "AuxiliaryPlaceTypeEnum": "auxiliary_place_type.yaml",
    "GiftShopTypeEnum": "gift_shop_type.yaml",
    "FundingRequirementTypeEnum": "funding_requirement_type.yaml",
    "OrganizationBranchTypeEnum": "organization_branch_type.yaml",
}


def _extract_instance_values(instances) -> list:
    """Reduce raw instance records to the fields used by the diagram.

    Each record contributes its ``value``, ``code``, the English entry of
    ``skos:prefLabel`` and the ``entity`` entry of ``wikidata``. Missing
    fields, and label/wikidata entries that are not mappings, become ``''``.
    """
    values = []
    for instance in instances:
        pref_label = instance.get('skos:prefLabel', {})
        wikidata = instance.get('wikidata', {})
        values.append({
            'value': instance.get('value', ''),
            'code': instance.get('code', ''),
            'label': pref_label.get('en', '') if isinstance(pref_label, dict) else '',
            'wikidata': wikidata.get('entity', '') if isinstance(wikidata, dict) else '',
        })
    return values


def load_instance_data(instances_dir: Path) -> dict:
    """Load all instance data from YAML files.

    Args:
        instances_dir: Directory containing the files named in
            ``ENUM_INSTANCE_FILES``.

    Returns:
        Mapping of enum name to ``{'name', 'description', 'values'}``, where
        ``values`` is a list of ``{'value', 'code', 'label', 'wikidata'}``
        dicts. Enums whose file is missing, has no ``instances`` key, or
        fails to load are omitted.
    """
    instance_data = {}

    for enum_name, filename in ENUM_INSTANCE_FILES.items():
        filepath = instances_dir / filename
        if not filepath.exists():
            continue

        # Deliberately best-effort: one unreadable or malformed instance file
        # must not abort diagram generation, so any failure is reported on
        # stderr and that enum is skipped.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if not data or 'instances' not in data:
                continue

            values = _extract_instance_values(data['instances'])
            instance_data[enum_name] = {
                'name': data.get('name', enum_name),
                'description': data.get('description', ''),
                'values': values
            }
            print(f" ✓ Loaded {len(values)} instances from {filename}", file=sys.stderr)
        except Exception as e:
            print(f" ⚠ Warning: Could not load {filename}: {e}", file=sys.stderr)

    return instance_data


def generate_mermaid_with_instances(sv: SchemaView, instance_data: dict, include_enums: bool = True) -> str:
    """
    Generate Mermaid ER diagram with instance data annotations.

    Args:
        sv: Schema view providing the classes, slots and enums to render.
        instance_data: Mapping of enum name to loaded instance data, in the
            shape returned by ``load_instance_data``.
        include_enums: When False, no enum entities and no class-to-enum
            relationships are emitted.

    Returns:
        The diagram text: a header, one entity per class, optionally one
        entity per enum that has permissible values, then the relationships.
    """
    # NOTE(review): the diagram is wrapped in a markdown ```mermaid fence even
    # though it is written to .mmd files; presumably the consumer expects or
    # strips the fence - confirm before changing.
    lines = ["```mermaid"]
    lines.append("erDiagram")
    lines.append("")
    lines.append(" %% Heritage Custodian Ontology - Complete Schema with Instance Data")
    lines.append(f" %% Generated: {datetime.now().isoformat()}")
    lines.append(f" %% Schema: {sv.schema.name}")
    lines.append("")

    # Get all classes except excluded ones
    all_classes = [c for c in sv.all_classes() if c not in EXCLUDED_CLASSES]

    # Get all enums
    all_enums = list(sv.all_enums()) if include_enums else []

    # Sets for the membership tests in the relationship loop; the lists above
    # are kept for iteration so output order is unchanged.
    class_names = set(all_classes)
    enum_names = set(all_enums)

    # Generate class entities
    for class_name in all_classes:
        lines.append(f"{class_name} {{")

        # Add ALL attributes/slots
        for slot_name in sv.class_slots(class_name):
            slot = sv.induced_slot(slot_name, class_name)
            if not slot:
                continue

            slot_range = slot.range if slot.range else "string"

            # Skip excluded classes only
            if slot_range in EXCLUDED_CLASSES:
                continue

            # Format: type attribute_name
            multivalued_marker = "List" if slot.multivalued else ""
            required_marker = " PK" if slot.required else ""
            lines.append(f" {slot_range}{multivalued_marker} {slot_name}{required_marker}")

        lines.append("}")

    # Generate enum entities with instance data enrichment
    if include_enums and all_enums:
        lines.append("")
        lines.append(" %% Enumerations with Instance Data")

        for enum_name in all_enums:
            enum_def = sv.get_enum(enum_name)
            if not (enum_def and enum_def.permissible_values):
                continue

            lines.append(f"{enum_name} {{")
            lines.append(" string enum_type PK")

            # Check if we have instance data for this enum
            if enum_name in instance_data:
                values = instance_data[enum_name]['values']

                for val_info in values[:MAX_INSTANCE_VALUES]:
                    value = val_info['value']
                    code = val_info.get('code', '')

                    # Mermaid ER diagrams don't support comments in entities,
                    # so the code is folded into the attribute name for context.
                    if code:
                        lines.append(f" string {value}_{code}")
                    else:
                        lines.append(f" string {value}")

                if len(values) > MAX_INSTANCE_VALUES:
                    remaining = len(values) - MAX_INSTANCE_VALUES
                    lines.append(f" string _and_{remaining}_more")
            else:
                # Fall back to schema enum values
                values = list(enum_def.permissible_values.keys())
                for value_name in values[:MAX_ENUM_VALUES_IN_DIAGRAM]:
                    lines.append(f" string {value_name}")

                if len(values) > MAX_ENUM_VALUES_IN_DIAGRAM:
                    remaining = len(values) - MAX_ENUM_VALUES_IN_DIAGRAM
                    lines.append(f" string _and_{remaining}_more")

            lines.append("}")

    lines.append("")

    # Generate relationships
    for class_name in all_classes:
        cls = sv.get_class(class_name)

        # Inheritance relationships
        if cls.is_a and cls.is_a not in EXCLUDED_CLASSES:
            lines.append(f'{class_name} ||--|| {cls.is_a} : "inherits"')

        # Association relationships
        for slot_name in sv.class_slots(class_name):
            slot = sv.induced_slot(slot_name, class_name)
            if not (slot and slot.range):
                continue

            # Check if range is a class
            if slot.range in class_names:
                if slot.multivalued:
                    cardinality = "||--}|" if slot.required else "||--}o"
                else:
                    cardinality = "||--||" if slot.required else "||--|o"
                lines.append(f'{class_name} {cardinality} {slot.range} : "{slot_name}"')
            # Check if range is an enum
            elif include_enums and slot.range in enum_names:
                cardinality = "||--}o" if slot.multivalued else "||--|o"
                lines.append(f'{class_name} {cardinality} {slot.range} : "{slot_name}"')

    lines.append("")
    lines.append("```")
    lines.append("")

    return '\n'.join(lines)


def main():
    """Main entry point.

    Loads the schema and instance data, generates the diagram, and writes it
    to a timestamped file under ``OUTPUT_DIR`` and to ``FRONTEND_OUTPUT``
    (overwriting). Progress is reported on stderr.
    """
    print("=" * 60, file=sys.stderr)
    print("Mermaid ER Diagram Generator with Instance Data", file=sys.stderr)
    print("=" * 60, file=sys.stderr)

    # Load schema
    print(f"\nLoading schema: {SCHEMA_PATH}", file=sys.stderr)
    sv = SchemaView(SCHEMA_PATH)
    print(f"✓ Loaded schema: {sv.schema.name}", file=sys.stderr)
    print(f" Classes: {len(list(sv.all_classes()))}", file=sys.stderr)
    print(f" Enums: {len(list(sv.all_enums()))}", file=sys.stderr)

    # Load instance data
    instances_dir = Path(INSTANCES_DIR)
    print(f"\nLoading instance data from: {instances_dir}", file=sys.stderr)
    instance_data = load_instance_data(instances_dir)
    print(f"✓ Loaded {len(instance_data)} enum instance files", file=sys.stderr)

    # Generate Mermaid
    print("\nGenerating Mermaid ER diagram...", file=sys.stderr)
    mermaid = generate_mermaid_with_instances(sv, instance_data)

    # Generate timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Ensure output directories exist
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    frontend_path = Path(FRONTEND_OUTPUT)
    frontend_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to schemas directory (timestamped).
    # The encoding is explicit because the diagram may contain non-ASCII
    # values and the platform default encoding is not guaranteed to be UTF-8.
    schema_output = output_dir / f"complete_schema_with_instances_{timestamp}.mmd"
    schema_output.write_text(mermaid, encoding="utf-8")
    print(f"\n✓ Generated: {schema_output}", file=sys.stderr)
    # Report the encoded length: len(mermaid) would count characters, not bytes.
    print(f" Size: {len(mermaid.encode('utf-8'))} bytes", file=sys.stderr)

    # Write to frontend directory (overwrite)
    frontend_path.write_text(mermaid, encoding="utf-8")
    print(f"✓ Updated frontend: {frontend_path}", file=sys.stderr)

    print("\n" + "=" * 60, file=sys.stderr)
    print("Done! The UML diagram now includes instance data.", file=sys.stderr)
    print("=" * 60, file=sys.stderr)


if __name__ == '__main__':
    main()