glam/scripts/fix_linkml_metadata.py
kempersc 1516d509cf Add metadata to LinkML class definitions and update prefixes
- Added `id`, `name`, `title`, and `description` fields to multiple LinkML class YAML files.
- Standardized prefixes across all class definitions.
- Introduced a new script `fix_linkml_metadata.py` to automate the addition of metadata to class files.
- Updated existing class files to ensure compliance with the new metadata structure.
2026-01-29 17:40:47 +01:00

111 lines
3.8 KiB
Python

import os
import re
# Directory holding one LinkML class definition per ``*.yaml`` file.
# NOTE: the path is relative, so the script must be run from the directory
# that contains ``schemas/``.
directory = "schemas/20251121/linkml/modules/classes/"

# Top-level ``prefixes:`` mapping prepended to every class file. The entries
# MUST be indented under ``prefixes:``; at column 0 they would be parsed as
# unrelated top-level keys and ``prefixes`` itself would be empty.
prefixes_block = """prefixes:
  linkml: https://w3id.org/linkml/
  schema: http://schema.org/
  skos: http://www.w3.org/2004/02/skos/core#
  rico: https://www.ica.org/standards/RiC/ontology#
  wd: http://www.wikidata.org/entity/
"""

# Top-level ``imports:`` list, added only to files that do not already
# declare imports. Indented to match the style of ``prefixes_block``.
imports_block = """imports:
  - linkml:types
"""
def split_camel_case(name):
    """Insert a space at every lower-case/digit -> upper-case boundary.

    A boundary is a position preceded by an ASCII lower-case letter or digit
    and followed by an ASCII upper-case letter, so ``"CustodianName"`` becomes
    ``"Custodian Name"``. Runs of capitals such as ``"HTTPServer"`` contain no
    such boundary and are returned unchanged.
    """
    # Zero-width lookarounds locate the boundary itself, so nothing has to be
    # captured and re-emitted by the replacement string.
    boundary = r'(?<=[a-z0-9])(?=[A-Z])'
    return re.sub(boundary, ' ', name)
# ---------------------------------------------------------------------------
# Batch job: prepend schema-level metadata to every LinkML class file.
#
# For each ``*.yaml`` file in ``directory`` that has no top-level ``id:`` key,
# a header made of ``id``, ``name``, ``title``, ``description``, the shared
# ``prefixes_block`` and (when missing) ``imports_block`` is written in front
# of the existing content. The file name (minus ``.yaml``) is authoritative
# for the class name; the file content is only consulted for a description.
# ---------------------------------------------------------------------------

# Any indented mapping key such as "  ClassName:" (group 1 = indent,
# group 2 = key). Indent-agnostic so 2-space, 4-space and tab-indented files
# are all recognised.
_INDENTED_KEY_RE = re.compile(r"^([ \t]+)([A-Za-z0-9_]+):")
# A single-line "description: value" entry at any indentation (group 1 = value).
_DESCRIPTION_RE = re.compile(r"^[ \t]+description:\s+(.*)")


def _has_top_level_key(content, key):
    """Return True if some line of *content* starts with ``key:`` at column 0."""
    return re.search(rf"^{re.escape(key)}:", content, re.MULTILINE) is not None


def _indent_width(line):
    """Return the number of leading space/tab characters of *line*."""
    return len(line) - len(line.lstrip(" \t"))


def _find_description(lines, class_index, class_indent):
    """Return the class's single-line description scalar, or None.

    Only the 14 lines following the class key are inspected, and only keys
    that are direct children of the class (same indent as its first child)
    are considered, so a nested slot description is never picked up. Block
    scalars (``>`` / ``|``) are reported as None because their text lives on
    the following lines.
    """
    child_indent = None
    for line in lines[class_index + 1:class_index + 15]:
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        indent = _indent_width(line)
        if indent <= class_indent:
            return None  # Dedented: the class body has ended.
        if child_indent is None:
            child_indent = indent
        if indent != child_indent:
            continue  # Deeper nesting belongs to some child key.
        found = _DESCRIPTION_RE.match(line)
        if found:
            value = found.group(1).strip()
            if value.startswith((">", "|")):
                return None
            return value
    return None


def _find_class_and_description(lines):
    """Return ``(class_name, description)`` parsed from *lines*.

    Only the first line whose stripped text starts with ``classes:`` is
    considered; the class key is searched for in the 4 lines after it.
    Either element is None when it cannot be found.
    """
    for index, line in enumerate(lines):
        if not line.strip().startswith("classes:"):
            continue
        for candidate in range(index + 1, min(index + 5, len(lines))):
            key = _INDENTED_KEY_RE.match(lines[candidate])
            if key:
                description = _find_description(
                    lines, candidate, len(key.group(1))
                )
                return key.group(2), description
        break
    return None, None


def _as_header_scalar(raw_description):
    """Return *raw_description* if it can be re-emitted verbatim, else None.

    The scalar is kept exactly as written, quotes included: stripping the
    quotes from e.g. ``"Foo: bar"`` would turn it into invalid YAML. A quoted
    value that is not closed on the same line (multi-line scalar) or that is
    empty is rejected so the caller can fall back to a generated description.
    """
    if not raw_description:
        return None
    opener = raw_description[0]
    if opener in "\"'":
        closed = len(raw_description) >= 2 and raw_description.endswith(opener)
        if not closed or not raw_description[1:-1].strip():
            return None
    return raw_description


count = 0
# Sorted so that warnings appear in a reproducible order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".yaml"):
        continue
    filepath = os.path.join(directory, filename)
    # Explicit encoding: the default is locale-dependent.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    if _has_top_level_key(content, "id"):
        continue  # Already has metadata

    # Strip only the trailing suffix; str.replace would also remove a
    # ".yaml" occurring in the middle of the name.
    filename_class = filename[:-len(".yaml")]

    class_name, description = _find_class_and_description(content.splitlines())
    if not class_name:
        # Fallback to filename if parsing failed (e.g. empty or unusual file)
        class_name = filename_class
    # Ensure class name matches filename (convention)
    if class_name != filename_class:
        print(f"Warning: Class name '{class_name}' in content differs from filename '{filename_class}'. Using filename.")
        class_name = filename_class

    title = split_camel_case(class_name)
    description = (
        _as_header_scalar(description)
        or f"LinkML class definition for {title}"
    )

    # Header entries in output order; any key the file already defines at
    # top level is skipped so no duplicate YAML keys are produced.
    header_parts = [
        ("id", f"id: https://nde.nl/ontology/hc/class/{class_name}\n"),
        ("name", f"name: {class_name}\n"),
        ("title", f"title: {title}\n"),
        ("description", f"description: {description}\n"),
        ("prefixes", prefixes_block),
        ("imports", imports_block),
    ]
    new_header = "".join(
        text for key, text in header_parts
        if not _has_top_level_key(content, key)
    )

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(new_header + content)
    count += 1

print(f"Total files updated: {count}")