# glam/scripts/fix_linkml_metadata.py

import os
import re

# Directory whose *.yaml files are scanned by this script.
directory = "schemas/20251121/linkml/modules/classes/"

# `prefixes:` section appended to every generated header, one entry per line.
prefixes_block = (
    "prefixes:\n"
    "  linkml: https://w3id.org/linkml/\n"
    "  schema: http://schema.org/\n"
    "  skos: http://www.w3.org/2004/02/skos/core#\n"
    "  rico: https://www.ica.org/standards/RiC/ontology#\n"
    "  wd: http://www.wikidata.org/entity/\n"
)

# `imports:` section appended only when the file has no imports of its own.
imports_block = (
    "imports:\n"
    "- linkml:types\n"
)

def split_camel_case(name):
    """Return *name* with a space inserted at every camel-case boundary.

    A boundary is a lowercase letter or digit immediately followed by an
    uppercase letter; runs of consecutive capitals are left untouched.
    """
    boundary = re.compile('([a-z0-9])([A-Z])')

    def _spaced(hit):
        # Re-emit both captured characters with a single space between them.
        return hit.group(1) + ' ' + hit.group(2)

    return boundary.sub(_spaced, name)

# A class entry sits directly under `classes:` at two-space indentation.
_CLASS_LINE = re.compile(r"^  ([a-zA-Z0-9_]+):")
# A class-level description sits at exactly four-space indentation.
_DESCRIPTION_LINE = re.compile(r"^    description:\s+(.*)")
# A top-level `imports:` key anywhere in the file.
_IMPORTS_LINE = re.compile(r"^imports:", re.MULTILINE)
# Quoted YAML scalars that open AND close on the same line, optionally
# followed by a trailing comment.
_DOUBLE_QUOTED = re.compile(r'^"(?:[^"\\]|\\.)*"\s*(?:#.*)?$')
_SINGLE_QUOTED = re.compile(r"^'(?:[^']|'')*'\s*(?:#.*)?$")


def normalize_description(raw):
    """Return *raw* as a scalar that is safe to copy onto one header line.

    The value is the text captured after ``description:`` on a single line.
    It is returned stripped but otherwise verbatim (quotes included), so a
    quoted scalar containing ``: `` or `` #`` stays valid YAML when it is
    re-emitted in the header.

    Returns ``None`` when the value cannot be copied as a single line:
    an empty value, a block scalar indicator (``>`` / ``|``), or a quoted
    scalar whose closing quote is not on the same line.
    """
    text = raw.strip()
    if not text or text in ('""', "''"):
        return None
    if text[0] in ">|":
        # Block scalar: the actual text lives on the following lines.
        return None
    if text[0] == '"':
        return text if _DOUBLE_QUOTED.match(text) else None
    if text[0] == "'":
        return text if _SINGLE_QUOTED.match(text) else None
    return text


def extract_class_info(lines):
    """Find the first class under ``classes:`` and its one-line description.

    Args:
        lines: the file content split into lines (no line terminators).

    Returns:
        ``(class_name, description)``. ``class_name`` is ``None`` when no
        class entry is found within the four lines after the first
        ``classes:`` line. ``description`` is ``None`` when the class has no
        usable single-line description within the next fourteen lines.
    """
    for index, line in enumerate(lines):
        if not line.strip().startswith("classes:"):
            continue
        for class_index in range(index + 1, min(index + 5, len(lines))):
            class_match = _CLASS_LINE.match(lines[class_index])
            if not class_match:
                continue
            description = None
            for candidate in lines[class_index + 1:class_index + 15]:
                stripped = candidate.strip()
                if not stripped or stripped.startswith("#"):
                    continue
                if not candidate.startswith("    "):
                    # Indentation dropped below the class body: the next
                    # sibling class or top-level key starts here, so any
                    # later description does not belong to this class.
                    break
                desc_match = _DESCRIPTION_LINE.match(candidate)
                if desc_match:
                    description = normalize_description(desc_match.group(1))
                    break
            return class_match.group(1), description
        # Only the first `classes:` line is inspected.
        return None, None
    return None, None


def build_header(class_name, description, include_imports):
    """Build the metadata header text for *class_name*.

    A falsy *description* is replaced by a generated default. The imports
    section is appended only when *include_imports* is true.
    """
    title = split_camel_case(class_name)
    if not description:
        description = f"LinkML class definition for {title}"
    parts = [
        f"id: https://nde.nl/ontology/hc/class/{class_name}\n",
        f"name: {class_name}\n",
        f"title: {title}\n",
        f"description: {description}\n",
        prefixes_block,
    ]
    if include_imports:
        parts.append(imports_block)
    return "".join(parts)


def add_metadata_header(filepath):
    """Prepend a metadata header to the YAML file at *filepath*.

    Returns ``True`` when the file was rewritten, ``False`` when it already
    starts with ``id:`` and was left untouched.
    """
    # Read and write explicitly as UTF-8 so the result does not depend on
    # the platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    if content.startswith("id:"):
        return False  # Already has metadata

    # The filename (minus its extension) is the authoritative class name.
    filename_class = os.path.splitext(os.path.basename(filepath))[0]
    class_name, description = extract_class_info(content.splitlines())
    if class_name and class_name != filename_class:
        print(f"Warning: Class name '{class_name}' in content differs from filename '{filename_class}'. Using filename.")

    has_imports = _IMPORTS_LINE.search(content) is not None
    header = build_header(filename_class, description, not has_imports)

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(header + content)
    return True


def main(target_dir=None):
    """Add metadata headers to every ``.yaml`` file in *target_dir*.

    *target_dir* defaults to the module-level ``directory``. Prints and
    returns the number of files that were rewritten.
    """
    if target_dir is None:
        target_dir = directory

    count = 0
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(target_dir)):
        if not filename.endswith(".yaml"):
            continue
        if add_metadata_header(os.path.join(target_dir, filename)):
            count += 1

    print(f"Total files updated: {count}")
    return count


if __name__ == "__main__":
    main()