Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
162 lines
5.5 KiB
Python
162 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix YAML examples formatting issues in LinkML slot files.
|
|
|
|
The main issue is examples sections with bad formatting like:
|
|
examples:
|
|
- value: "multi\nline\nstring..."
|
|
description: Some description
|
|
|
|
When it should be:
|
|
examples:
|
|
- value: |
|
|
multi
|
|
line
|
|
string
|
|
description: Some description
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def _closing_quote_index(text: str) -> int:
    """Return the index of the first unescaped double quote in *text*, or -1."""
    pos = 0
    while pos < len(text):
        char = text[pos]
        if char == '\\':
            # A backslash escapes the next character, so skip both; this is
            # what lets `\"` stay inside the string while `\\"` closes it.
            pos += 2
            continue
        if char == '"':
            return pos
        pos += 1
    return -1


def _unescape_double_quoted(text: str) -> str:
    """Decode the common double-quoted escapes in a single left-to-right pass."""
    escapes = {'n': '\n', 't': '\t', '"': '"', '\\': '\\'}
    # One pass (instead of chained str.replace calls) so that an escaped
    # backslash followed by "n" is not mistaken for a newline escape.
    # Unknown escapes are left untouched.
    return re.sub(r'\\(.)', lambda m: escapes.get(m.group(1), m.group(0)), text)


def _quoted_value_to_block(lines, start, indent, first_fragment):
    """Convert the double-quoted value starting on ``lines[start]`` to a block scalar.

    ``indent`` is the whitespace before the item's dash and ``first_fragment``
    is the text after the opening quote on that line.

    Returns ``(replacement_lines, next_index)``, or ``None`` when the value
    cannot be converted safely (the quote never closes, or something other
    than a comment follows the closing quote); the caller then keeps the
    original line as it is.
    """
    text = first_fragment
    index = start
    end = _closing_quote_index(text)
    while end < 0:
        index += 1
        if index >= len(lines):
            # Unterminated string: do not swallow the rest of the file.
            return None
        # A line break inside a double-quoted scalar folds to one space.
        text = f'{text.rstrip()} {lines[index].strip()}'
        end = _closing_quote_index(text)

    trailing = text[end + 1:].strip()
    if trailing and not trailing.startswith('#'):
        return None

    header = f'{indent}- value: |'
    if trailing:
        # Keep a trailing comment; it is allowed after the block indicator.
        header = f'{header}  {trailing}'

    block = [header]
    for part in _unescape_double_quoted(text[:end]).split('\n'):
        # Content sits two columns deeper than the `value` key (dash + 4);
        # empty lines are emitted without trailing whitespace.
        block.append(f'{indent}    {part}' if part else '')
    return block, index + 1


def fix_yaml_content(content: str) -> str:
    """Repair badly formatted ``examples:`` sections in YAML text.

    Inside every ``examples:`` section two things are fixed:

    * an item written as ``- value: "..."`` (double-quoted, possibly with
      ``\\n`` escapes) is rewritten as a literal block scalar (``- value: |``
      followed by the decoded lines, indented four columns past the dash);
    * a ``description:`` line sitting at the same column as the dash of the
      preceding ``- value:`` item is moved two columns in so that it becomes
      a key of that item.

    A section ends at the first non-blank, non-comment line indented less
    than the ``examples:`` key, or equally indented and not a list item.
    Everything outside examples sections, and ``description:`` lines that
    are already correctly indented, are returned unchanged.

    Args:
        content: Full text of a YAML file.

    Returns:
        The corrected text (identical to ``content`` when nothing matched).
    """
    lines = content.split('\n')
    result = []
    i = 0

    while i < len(lines):
        line = lines[i]
        section = re.match(r'^(\s*)examples:\s*$', line)
        if section is None:
            result.append(line)
            i += 1
            continue

        result.append(line)
        i += 1
        section_width = len(section.group(1))
        item_indent = None  # whitespace before the dash of the latest "- value:" item

        while i < len(lines):
            item_line = lines[i]
            stripped = item_line.strip()

            # Blank lines and comments never terminate the section.
            if stripped and not stripped.startswith('#'):
                width = len(item_line) - len(item_line.lstrip())
                leaves_section = width < section_width or (
                    width == section_width and not stripped.startswith('-')
                )
                if leaves_section:
                    # Let the outer loop handle this line (it may itself
                    # open another examples section).
                    break

            item_match = re.match(r'^(\s*)- value:', item_line)
            if item_match:
                item_indent = item_match.group(1)

            value_match = re.match(r'^(\s*)- value:\s*"(.*)$', item_line)
            if value_match:
                converted = _quoted_value_to_block(
                    lines, i, value_match.group(1), value_match.group(2)
                )
                if converted is not None:
                    block, i = converted
                    result.extend(block)
                    continue

            desc_match = re.match(r'^(\s*)(description:\s*.*)$', item_line)
            if (
                desc_match
                and item_indent is not None
                and desc_match.group(1) == item_indent
            ):
                # Only a description aligned with the dash is misplaced;
                # one already at dash + 2 must be left alone.
                result.append(f'{item_indent}  {desc_match.group(2)}')
                i += 1
                continue

            result.append(item_line)
            i += 1

    return '\n'.join(result)
|
|
|
|
|
|
def fix_file(filepath: Path) -> bool:
    """Apply ``fix_yaml_content`` to one file in place.

    The file is read and written as UTF-8 so the result does not depend on
    the platform's default encoding.

    Args:
        filepath: Path of the YAML file to repair.

    Returns:
        True if the file's content changed and was written back; False if
        nothing needed changing or the file could not be read or written
        (the problem is printed in that case).
    """
    try:
        content = filepath.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {filepath}: {e}")
        return False

    fixed = fix_yaml_content(content)
    if fixed == content:
        return False

    try:
        filepath.write_text(fixed, encoding='utf-8')
    except OSError as e:
        # Report and carry on so one unwritable file does not abort a batch.
        print(f"Error writing {filepath}: {e}")
        return False
    return True
|
|
|
|
|
|
def main():
    """Repair every unparsable ``*.yaml`` file in the slots directory.

    Files that already load with ``yaml.safe_load`` are skipped. Each
    remaining file is passed to ``fix_file`` and then re-parsed to check
    the result. A summary line with the number of fixed files and the
    number of files that are still broken is printed at the end.

    Exits with status 1 when the slots directory does not exist.
    """
    # Imported once here (rather than on every loop iteration) and kept
    # local so PyYAML is only needed when the script is actually run.
    import yaml

    slots_dir = Path('schemas/20251121/linkml/modules/slots')

    if not slots_dir.exists():
        print(f"Directory not found: {slots_dir}")
        sys.exit(1)

    def parse_problem(path):
        """Return the exception raised while loading *path*, or None if it loads."""
        try:
            yaml.safe_load(path.read_text(encoding='utf-8'))
        except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc:
            return exc
        return None

    modified = 0
    errors = 0

    for yaml_file in sorted(slots_dir.glob('*.yaml')):
        problem = parse_problem(yaml_file)
        if problem is None:
            continue  # File is valid, skip

        if not fix_file(yaml_file):
            # Broken, but the fixer changed nothing: it still counts as an
            # error instead of being silently dropped from the summary.
            print(f"Could not fix: {yaml_file.name}: {problem}")
            errors += 1
            continue

        # Verify the fix worked
        problem = parse_problem(yaml_file)
        if problem is None:
            print(f"Fixed: {yaml_file.name}")
            modified += 1
        else:
            print(f"Still broken after fix: {yaml_file.name}: {problem}")
            errors += 1

    print(f"\nModified {modified} files, {errors} still have errors")
|
|
|
|
|
|
# Script entry point: run the fixer only when executed directly, not on import.
if __name__ == '__main__':
    main()
|