- Fix scope_note → finding_aid_scope_note in FindingAid.yaml - Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead) - Remove duplicate rico_record_set_type from class_metadata_slots.yaml - Fix range types for equals_string compatibility (uriorcurie → string) - Move class names from close_mappings to see_also in 10 RecordSetTypes files - Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context - Sync schemas to frontend/public/schemas/ Files: 1,151 changed (includes prior CustodianType migration)
226 lines
7.5 KiB
Python
226 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cleanup Redundant Description Text from LinkML Class Files
|
|
|
|
This script removes description sections that are NOW represented as structured
|
|
slots/classes in the LinkML schema. It follows the principle:
|
|
"Only remove text that is already represented as structured KG classes and predicates"
|
|
|
|
SECTIONS REMOVED (fully structured):
|
|
- **Dual-Class Pattern**: → dual_class_link slot (DualClassLink class)
|
|
- **Ontological Alignment**: → class_uri, *_mappings LinkML elements
|
|
- **Multilingual Labels**: → structured_aliases LinkML element
|
|
|
|
SECTIONS KEPT (not yet fully structured):
|
|
- **Scope**: - Domain-specific content descriptions
|
|
- **Notable Examples**: - Real-world institution examples
|
|
- **Historical Significance**: - Contextual importance
|
|
- **Privacy Considerations**: - Only partially structured (privacy_note slot)
|
|
- **Preservation Challenges**: - Only partially structured (preservation_note slot)
|
|
- **Related Types**: - Partially in see_also, but descriptions not structured
|
|
- **Wikidata**: Q##### - Keep as human-readable reference
|
|
|
|
Usage:
|
|
python scripts/cleanup_redundant_descriptions.py --dry-run # Preview changes
|
|
python scripts/cleanup_redundant_descriptions.py # Apply changes
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Tuple, List
|
|
|
|
|
|
# Sections that ARE fully structured and can be removed.
# Each entry names a markdown "**Header**:" section whose information is
# already captured by LinkML schema elements (see the module docstring),
# so the free-text copy in the class description is redundant.
STRUCTURED_SECTIONS = [
    'Dual-Class Pattern',      # captured by the dual_class_link slot
    'Ontological Alignment',   # captured by class_uri / *_mappings
    'Multilingual Labels',     # captured by structured_aliases
]
|
|
|
|
|
|
def remove_section(text: str, section_header: str) -> Tuple[str, bool]:
    """
    Remove a markdown section from description text.

    A section starts with **Header**: and ends at the next **Header**: or
    end of text.

    Fix: the pattern now also consumes the indentation immediately before
    the **Header** marker. Previously the match started at the `**`, so
    removing an indented section left behind a line consisting only of
    spaces in the rewritten YAML.

    Args:
        text: The description text to edit.
        section_header: Section name without the surrounding ** markers.

    Returns:
        Tuple of (modified_text, was_modified)
    """
    # Pattern matches optional leading indent, then **Section Header**:,
    # then content (re.DOTALL lets .*? span newlines) up to — but not
    # including — the next "**Header" line or the end of the text.
    pattern = rf'[ \t]*\*\*{re.escape(section_header)}\*\*:.*?(?=\n[ ]*\*\*[A-Z]|\Z)'

    new_text, count = re.subn(pattern, '', text, flags=re.DOTALL)

    return new_text, count > 0
|
|
|
|
|
|
def cleanup_description(description: str) -> Tuple[str, List[str]]:
    """
    Strip every fully-structured section from a class description.

    Each header in STRUCTURED_SECTIONS is removed via remove_section();
    leftover runs of blank lines are then collapsed and trailing
    whitespace is trimmed.

    Returns:
        Tuple of (cleaned_description, list_of_removed_sections)
    """
    stripped: List[str] = []
    text = description

    for header in STRUCTURED_SECTIONS:
        text, hit = remove_section(text, header)
        if hit:
            stripped.append(header)

    # Removals can leave 3+ consecutive newlines behind — squeeze them
    # down to a single blank line, then drop trailing whitespace.
    text = re.sub(r'\n{3,}', '\n\n', text).rstrip()

    return text, stripped
|
|
|
|
|
|
def process_yaml_file(filepath: Path, dry_run: bool = True) -> dict:
    """
    Process a single YAML file to clean up class-level descriptions.

    Scans line by line for class-level ``description: |`` blocks (exactly
    4 spaces of indentation — slot_usage descriptions are indented more
    deeply and are deliberately skipped), runs cleanup_description() on
    each block, and rewrites the file unless dry_run is set.

    Fix: the indentation-sensitive literals were corrupted to single
    spaces (inconsistent with the stated 4-space / 6-space contract in
    the surrounding comments); with a one-space pattern no 4-space
    ``description:`` key could ever match. Restored the 4-space key
    pattern, the 6-space content pattern, and the 4-space rebuilt key.

    Args:
        filepath: The YAML file to process.
        dry_run: When True (default), compute statistics without writing.

    Returns:
        dict with keys 'file', 'classes_modified', 'sections_removed',
        and 'modified'.
    """
    content = filepath.read_text(encoding='utf-8')

    stats = {
        'file': filepath.name,
        'classes_modified': 0,
        'sections_removed': [],
        'modified': False
    }

    lines = content.split('\n')
    new_lines = []
    i = 0

    while i < len(lines):
        line = lines[i]

        # Look for class-level description (exactly 4 spaces indent).
        # This avoids matching slot_usage descriptions which have deeper
        # indentation.
        if re.match(r'^    description: \|', line):
            # Found a class-level description block
            desc_lines = [line]
            i += 1

            # Collect all indented lines that are part of this description.
            # Description content is indented with 6+ spaces; interior
            # blank lines belong to the block as well.
            while i < len(lines):
                next_line = lines[i]
                if next_line == '' or re.match(r'^      ', next_line):
                    desc_lines.append(next_line)
                    i += 1
                else:
                    break

            # Join and process the description
            desc_block = '\n'.join(desc_lines)

            # Cheap pre-check: only run the cleanup when a structured
            # section header is actually present in this block.
            has_sections = any(f'**{s}**:' in desc_block for s in STRUCTURED_SECTIONS)

            if has_sections:
                # Extract the description content (after "description: |")
                desc_content = '\n'.join(desc_lines[1:])

                cleaned_content, removed = cleanup_description(desc_content)

                if removed:
                    stats['classes_modified'] += 1
                    stats['sections_removed'].extend(removed)
                    stats['modified'] = True

                    # Rebuild the description block with the cleaned body.
                    new_lines.append('    description: |')
                    for cleaned_line in cleaned_content.split('\n'):
                        new_lines.append(cleaned_line)
                else:
                    # No changes, keep original
                    new_lines.extend(desc_lines)
            else:
                # No structured sections, keep original
                new_lines.extend(desc_lines)
        else:
            new_lines.append(line)
            i += 1

    if stats['modified']:
        new_content = '\n'.join(new_lines)

        # Ensure file ends with newline
        if not new_content.endswith('\n'):
            new_content += '\n'

        if not dry_run:
            filepath.write_text(new_content, encoding='utf-8')

    return stats
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and clean up the class YAML files."""
    parser = argparse.ArgumentParser(
        description='Clean up redundant description text from LinkML class files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output for each file'
    )
    parser.add_argument(
        '--file',
        type=str,
        help='Process a single file instead of all files'
    )

    args = parser.parse_args()

    # Either one explicitly named file, or every class module in the schema tree.
    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    if args.file:
        targets = [Path(args.file)]
    else:
        targets = sorted(classes_dir.glob('*.yaml'))

    files_changed = 0
    sections_stripped = 0

    # NOTE(review): the single-space indents inside the report strings below
    # look like whitespace collapsed by a paste — confirm against the
    # original file before relying on the exact output format.
    run_prefix = '[DRY RUN] ' if args.dry_run else ''
    print(f"{run_prefix}Processing {len(targets)} files...\n")

    for path in targets:
        result = process_yaml_file(path, dry_run=args.dry_run)

        if not result['modified']:
            continue

        files_changed += 1
        sections_stripped += len(result['sections_removed'])

        if args.verbose or args.dry_run:
            verb = 'Would modify' if args.dry_run else 'Modified'
            print(f"{verb}: {result['file']}")
            print(f" Classes: {result['classes_modified']}")
            print(f" Sections removed: {', '.join(sorted(set(result['sections_removed'])))}")
            print()

    print(f"\n{'=' * 60}")
    print("Summary:")
    dry_label = 'that would be' if args.dry_run else ''
    print(f" Files {dry_label} modified: {files_changed}")
    print(f" Total sections removed: {sections_stripped}")

    if args.dry_run:
        print("\nRun without --dry-run to apply changes.")
|
|
|
|
|
|
# Script entry point — run the CLI when executed directly.
if __name__ == '__main__':
    main()
|