glam/scripts/fix_yaml_examples_v2.py
kempersc dfa667c90f Fix LinkML schema for valid RDF generation with proper slot_uri
Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
2026-01-07 13:48:03 +01:00

162 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Fix YAML examples formatting issues in LinkML slot files.

The main issue is examples sections with bad formatting like:

    examples:
      - value: "multi\nline\nstring..."
      description: Some description

When it should be:

    examples:
      - value: |
          multi
          line
          string
        description: Some description
"""
import re
import sys
from pathlib import Path
# Escapes decoded when a double-quoted value is turned into a block scalar.
_ESCAPE_RE = re.compile(r'\\(["\\nt])')
_ESCAPES = {'n': '\n', 't': '\t', '"': '"', '\\': '\\'}

_EXAMPLES_RE = re.compile(r'^(\s*)examples:\s*$')
_ITEM_RE = re.compile(r'^(\s+)- value:')
_QUOTED_ITEM_RE = re.compile(r'^(\s+)- value:\s*"(.*)$')
_DESCRIPTION_RE = re.compile(r'^(\s+)(description:\s*.*)$')


def _indent_width(line: str) -> int:
    """Return the number of leading whitespace characters in *line*."""
    return len(line) - len(line.lstrip())


def _is_terminated(text: str) -> bool:
    """Return True when *text* ends (ignoring trailing space) in an unescaped quote."""
    stripped = text.rstrip()
    if not stripped.endswith('"'):
        return False
    body = stripped[:-1]
    # An even run of backslashes before the quote means the quote itself is
    # not escaped (each pair is one literal backslash).
    trailing_backslashes = len(body) - len(body.rstrip('\\'))
    return trailing_backslashes % 2 == 0


def _unescape(text: str) -> str:
    """Decode the \\n, \\t, \\" and \\\\ escapes in a single left-to-right pass."""
    return _ESCAPE_RE.sub(lambda match: _ESCAPES[match.group(1)], text)


def _collect_quoted_value(lines: list, index: int, fragment: str):
    """Gather a double-quoted value that starts on ``lines[index]``.

    *fragment* is the text following the opening quote on that line.
    Following lines are concatenated verbatim until the closing quote is
    found.

    Returns ``(raw_value, last_index)`` with the closing quote removed, or
    ``None`` when the end of the file is reached without a closing quote.
    """
    raw = fragment
    last = index
    while not _is_terminated(raw):
        last += 1
        if last >= len(lines):
            return None
        raw += lines[last]
    return raw.rstrip()[:-1], last


def _rewrite_examples_section(lines: list, start: int, section_indent: int,
                              out: list) -> int:
    """Rewrite the body of one ``examples:`` section, appending to *out*.

    *start* is the index of the first line after the ``examples:`` key and
    *section_indent* is the indentation width of that key.

    Returns the index of the first line that was not consumed.
    """
    item_indent = None  # whitespace before the dash of the current item
    item_has_description = False
    i = start
    while i < len(lines):
        current = lines[i]
        stripped = current.strip()
        significant = bool(stripped) and not stripped.startswith('#')

        desc = _DESCRIPTION_RE.match(current)
        if desc and item_indent is not None:
            if desc.group(1) == item_indent and not item_has_description:
                # A description at the dash's own column belongs to the item
                # above it; align it with the item's "value" key.
                out.append(f'{item_indent}  {desc.group(2)}')
                item_has_description = True
                i += 1
                continue
            if len(desc.group(1)) == len(item_indent) + 2:
                item_has_description = True

        if significant:
            width = _indent_width(current)
            # The section ends at the first line indented no deeper than the
            # "examples:" key, except for list items written at the key's
            # own column.
            if width < section_indent or (
                    width == section_indent and not stripped.startswith('-')):
                break

        quoted = _QUOTED_ITEM_RE.match(current)
        if quoted:
            collected = _collect_quoted_value(lines, i, quoted.group(2))
            # When no closing quote exists the line is left untouched rather
            # than swallowing the remainder of the file into one value.
            if collected is not None:
                raw, last = collected
                item_indent, item_has_description = quoted.group(1), False
                out.append(f'{item_indent}- value: |')
                # Block content must sit deeper than the "value" key, which
                # is at dash indent + 2.
                out.extend(f'{item_indent}    {part}'
                           for part in _unescape(raw).split('\n'))
                i = last + 1
                continue

        item = _ITEM_RE.match(current)
        if item:
            item_indent, item_has_description = item.group(1), False
        elif (significant and item_indent is not None
              and _indent_width(current) <= len(item_indent)):
            # Any other line at or left of the dash closes the current item.
            item_indent = None
        out.append(current)
        i += 1
    return i


def fix_yaml_content(content: str) -> str:
    """Repair badly formatted ``examples:`` sections in YAML text.

    Inside every ``examples:`` section two rewrites are applied:

    * ``- value: "..."`` items are converted to literal block scalars
      (``- value: |``), decoding ``\\n``, ``\\t``, ``\\"`` and ``\\\\``.
      The content is indented four spaces past the dash.
    * A ``description:`` line sitting at the dash's column of an item that
      has no description yet is moved two spaces right, so it lines up
      with the item's ``value`` key.

    Descriptions that are already aligned, and keys that follow the
    section, are left untouched. A quoted value with no closing quote is
    also left as-is.

    This is a line-based heuristic, not a YAML parser: text inside an
    existing block scalar that looks like an examples item is rewritten
    as well.

    Args:
        content: Full text of a YAML document.

    Returns:
        The repaired text; identical to *content* when nothing matched.
    """
    lines = content.split('\n')
    fixed = []
    i = 0
    while i < len(lines):
        line = lines[i]
        fixed.append(line)
        i += 1
        header = _EXAMPLES_RE.match(line)
        if header:
            i = _rewrite_examples_section(
                lines, i, len(header.group(1)), fixed)
    return '\n'.join(fixed)
def fix_file(filepath: Path) -> bool:
    """Apply ``fix_yaml_content`` to one file in place.

    The file is read and written as UTF-8 so the result does not depend on
    the platform's default encoding.

    Args:
        filepath: Path of the YAML file to repair.

    Returns:
        True when the file was rewritten; False when it could not be read
        (an error is printed) or when the fixer produced no change.
    """
    try:
        original = filepath.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {filepath}: {e}")
        return False

    fixed = fix_yaml_content(original)
    if fixed == original:
        return False

    filepath.write_text(fixed, encoding='utf-8')
    return True
def _is_valid_yaml(filepath: Path) -> bool:
    """Return True when *filepath* parses with ``yaml.safe_load``."""
    import yaml

    try:
        yaml.safe_load(filepath.read_text(encoding='utf-8'))
    except yaml.YAMLError:
        return False
    return True


def main():
    """Repair every unparsable slot file and print a summary.

    Each ``*.yaml`` file in the slots directory is checked with PyYAML.
    Files that parse are skipped. Files that do not are passed to
    ``fix_file`` and re-checked. Exits with status 1 when the slots
    directory does not exist.
    """
    # Import up front so a missing PyYAML fails with a clear ImportError
    # before any file is touched.
    import yaml  # noqa: F401

    slots_dir = Path('schemas/20251121/linkml/modules/slots')
    if not slots_dir.exists():
        print(f"Directory not found: {slots_dir}")
        sys.exit(1)

    modified = 0
    errors = 0

    for yaml_file in sorted(slots_dir.glob('*.yaml')):
        if _is_valid_yaml(yaml_file):
            continue  # File is valid, skip

        if not fix_file(yaml_file):
            # The file is broken but the fixer changed nothing; count it so
            # the summary reflects every file that still fails to parse.
            print(f"No fix applied: {yaml_file.name}")
            errors += 1
            continue

        # Verify the fix worked
        if _is_valid_yaml(yaml_file):
            print(f"Fixed: {yaml_file.name}")
            modified += 1
        else:
            print(f"Still broken after fix: {yaml_file.name}")
            errors += 1

    print(f"\nModified {modified} files, {errors} still have errors")


if __name__ == '__main__':
    main()