glam/scripts/cleanup_redundant_descriptions.py
kempersc 98c42bf272 Fix LinkML URI conflicts and generate RDF outputs
- Fix scope_note → finding_aid_scope_note in FindingAid.yaml
- Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
- Remove duplicate rico_record_set_type from class_metadata_slots.yaml
- Fix range types for equals_string compatibility (uriorcurie → string)
- Move class names from close_mappings to see_also in 10 RecordSetTypes files
- Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
- Sync schemas to frontend/public/schemas/

Files: 1,151 changed (includes prior CustodianType migration)
2026-01-07 12:32:59 +01:00

226 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""
Cleanup Redundant Description Text from LinkML Class Files
This script removes description sections that are NOW represented as structured
slots/classes in the LinkML schema. It follows the principle:
"Only remove text that is already represented as structured KG classes and predicates"
SECTIONS REMOVED (fully structured):
- **Dual-Class Pattern**: → dual_class_link slot (DualClassLink class)
- **Ontological Alignment**: → class_uri, *_mappings LinkML elements
- **Multilingual Labels**: → structured_aliases LinkML element
SECTIONS KEPT (not yet fully structured):
- **Scope**: - Domain-specific content descriptions
- **Notable Examples**: - Real-world institution examples
- **Historical Significance**: - Contextual importance
- **Privacy Considerations**: - Only partially structured (privacy_note slot)
- **Preservation Challenges**: - Only partially structured (preservation_note slot)
- **Related Types**: - Partially in see_also, but descriptions not structured
- **Wikidata**: Q##### - Keep as human-readable reference
Usage:
python scripts/cleanup_redundant_descriptions.py --dry-run # Preview changes
python scripts/cleanup_redundant_descriptions.py # Apply changes
"""
import argparse
import re
from pathlib import Path
from typing import Tuple, List
# Headers of description sections whose content is already captured by
# structured LinkML elements (slots/classes/mappings) and can therefore be
# deleted from the free-text descriptions without losing information.
STRUCTURED_SECTIONS = [
    'Dual-Class Pattern',
    'Ontological Alignment',
    'Multilingual Labels',
]
def remove_section(text: str, section_header: str) -> Tuple[str, bool]:
    """
    Strip one markdown section (``**Header**: ...``) out of description text.

    A section starts at its ``**Header**:`` marker and runs up to (but not
    including) the next capitalized ``**Header**`` marker, or to the end of
    the text if it is the last section.

    Returns:
        Tuple of (modified_text, was_modified)
    """
    # DOTALL lets the lazy ``.*?`` span newlines; the lookahead stops the
    # match at the next **Header** marker (or end of input) without
    # consuming it, so neighbouring sections stay intact.
    section_re = re.compile(
        rf'\*\*{re.escape(section_header)}\*\*:.*?(?=\n[ ]*\*\*[A-Z]|\Z)',
        re.DOTALL,
    )
    stripped, n_removed = section_re.subn('', text)
    return stripped, n_removed > 0
def cleanup_description(description: str) -> Tuple[str, List[str]]:
    """
    Strip every fully-structured section from a description.

    Returns:
        Tuple of (cleaned_description, list_of_removed_sections)
    """
    removed_sections: List[str] = []
    for header in STRUCTURED_SECTIONS:
        description, hit = remove_section(description, header)
        if hit:
            removed_sections.append(header)
    # Removals can leave runs of blank lines behind; collapse 3+ newlines
    # to a single blank line and drop trailing whitespace.
    cleaned = re.sub(r'\n{3,}', '\n\n', description).rstrip()
    return cleaned, removed_sections
def process_yaml_file(filepath: Path, dry_run: bool = True) -> dict:
    """
    Process a single YAML file to clean up class-level descriptions.

    We only target class-level descriptions (exactly 4 spaces of indent),
    NOT slot_usage descriptions, which are indented deeper in the YAML tree.

    Args:
        filepath: Path to a LinkML class YAML file.
        dry_run: When True, compute statistics but never write the file.

    Returns:
        dict with statistics about changes made: ``file`` (basename),
        ``classes_modified`` (int), ``sections_removed`` (list of section
        headers), ``modified`` (bool).
    """
    content = filepath.read_text(encoding='utf-8')
    stats = {
        'file': filepath.name,
        'classes_modified': 0,
        'sections_removed': [],
        'modified': False,
    }
    lines = content.split('\n')
    new_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # Class-level description header: exactly 4 spaces of indentation.
        # Slot_usage descriptions sit deeper, so they never match this.
        if re.match(r'^    description: \|', line):
            desc_lines = [line]
            i += 1
            # Collect the block-scalar body: description content is indented
            # with 6+ spaces; blank lines inside the block are kept too.
            while i < len(lines):
                next_line = lines[i]
                if next_line == '' or re.match(r'^      ', next_line):
                    desc_lines.append(next_line)
                    i += 1
                else:
                    break
            desc_block = '\n'.join(desc_lines)
            # Only rewrite blocks that actually contain a structured section;
            # everything else passes through untouched.
            has_sections = any(f'**{s}**:' in desc_block for s in STRUCTURED_SECTIONS)
            if has_sections:
                # Content lines only — skip the "description: |" header line.
                desc_content = '\n'.join(desc_lines[1:])
                cleaned_content, removed = cleanup_description(desc_content)
                if removed:
                    stats['classes_modified'] += 1
                    stats['sections_removed'].extend(removed)
                    stats['modified'] = True
                    # Rebuild the description block under its original header.
                    new_lines.append('    description: |')
                    for cleaned_line in cleaned_content.split('\n'):
                        new_lines.append(cleaned_line)
                else:
                    # Sections mentioned but nothing removed; keep original.
                    new_lines.extend(desc_lines)
            else:
                # No structured sections at all; keep original.
                new_lines.extend(desc_lines)
        else:
            new_lines.append(line)
            i += 1
    if stats['modified']:
        new_content = '\n'.join(new_lines)
        # Preserve the conventional trailing newline.
        if not new_content.endswith('\n'):
            new_content += '\n'
        if not dry_run:
            filepath.write_text(new_content, encoding='utf-8')
    return stats
def main():
    """CLI entry point: scan class YAML files and strip redundant sections."""
    parser = argparse.ArgumentParser(
        description='Clean up redundant description text from LinkML class files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output for each file'
    )
    parser.add_argument(
        '--file',
        type=str,
        help='Process a single file instead of all files'
    )
    args = parser.parse_args()

    # Either one explicitly named file, or every class YAML module.
    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    yaml_files = [Path(args.file)] if args.file else sorted(classes_dir.glob('*.yaml'))

    total_modified = 0
    total_sections = 0
    print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing {len(yaml_files)} files...\n")
    for filepath in yaml_files:
        stats = process_yaml_file(filepath, dry_run=args.dry_run)
        if not stats['modified']:
            continue
        total_modified += 1
        total_sections += len(stats['sections_removed'])
        if args.verbose or args.dry_run:
            print(f"{'Would modify' if args.dry_run else 'Modified'}: {stats['file']}")
            print(f" Classes: {stats['classes_modified']}")
            print(f" Sections removed: {', '.join(sorted(set(stats['sections_removed'])))}")
            print()

    print(f"\n{'=' * 60}")
    print(f"Summary:")
    print(f" Files {'that would be' if args.dry_run else ''} modified: {total_modified}")
    print(f" Total sections removed: {total_sections}")
    if args.dry_run:
        print(f"\nRun without --dry-run to apply changes.")


if __name__ == '__main__':
    main()