glam/scripts/cleanup_class_descriptions.py
kempersc dfa667c90f Fix LinkML schema for valid RDF generation with proper slot_uri
Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
2026-01-07 13:48:03 +01:00

250 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
Cleanup Class Descriptions - Simplified Version
This script ONLY removes redundant sections from class descriptions.
It does NOT try to extract and store complex data structures.
Sections removed:
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
- **Ontological Alignment**: - Redundant (mappings capture this)
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
- **RDF Serialization**: - Implementation detail
- **SKOS**: - Redundant (mappings capture this)
- **Dublin Core**: - Redundant (mappings capture this)
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
- **Example Structure**: - Implementation detail
Sections KEPT (contain unique information):
- **Wikidata**: Q-number reference (important)
- **Scope**: Detailed scope description
- **Notable Examples**: Real-world institution examples
- **Related Types**: Linked types with Wikidata IDs
- **Historical Significance**: Historical context
- **Dutch Context**: Dutch-specific information
- etc.
Usage:
python scripts/cleanup_class_descriptions.py [--dry-run] [--verbose] [--file PATH]
"""
import argparse
import re
import sys
from pathlib import Path
# Select a YAML backend. ruamel.yaml is preferred because it round-trips
# comments, quoting style, and key order, so rewritten schema files keep
# their original formatting; plain PyYAML is the fallback when ruamel is
# not installed. USE_RUAMEL tells the rest of the script which API to use.
try:
    from ruamel.yaml import YAML

    yaml = YAML()
    yaml.preserve_quotes = True  # keep the original quoting style on round-trip
    yaml.width = 120             # wrap long scalars at 120 columns
    yaml.indent(mapping=2, sequence=2, offset=2)
    USE_RUAMEL = True
except ImportError:
    import yaml

    USE_RUAMEL = False
# Sections to REMOVE entirely (already structured elsewhere or redundant).
# Each entry is (regex_source, short_label); every pattern is applied with
# re.DOTALL (so `.` spans lines) and re.IGNORECASE. The lookaheads stop a
# match at the next `**Section**` heading or at the end of the text.
REMOVE_PATTERNS = [
    # Dual-class pattern - redundant with class hierarchy
    (r'\n\s*\*\*Dual-Class Pattern\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'dual_class_pattern'),
    # Ontology alignment sections - redundant with mappings
    (r'\n\s*\*\*Ontological Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontological_alignment'),
    (r'\n\s*\*\*ONTOLOGY ALIGNMENT\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontology_alignment_upper'),
    (r'\n\s*\*\*Ontology Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontology_alignment_mixed'),
    # Multilingual labels - redundant with structured_aliases
    (r'\n\s*\*\*Multilingual Labels\*\*:\s*\n(?:\s*- [a-z]{2}: .*\n)+', 'multilingual_labels'),
    # SKOS alignment - redundant with mappings
    (r'\n\s*\*\*SKOS\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'skos_alignment'),
    (r'\n\s*\*\*SKOS Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'skos_alignment_full'),
    # Dublin Core - redundant with mappings
    (r'\n\s*\*\*Dublin Core\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'dublin_core'),
    # RDF examples - implementation details
    (r'\n\s*\*\*RDF Serialization(?: Example)?\*\*:\s*\n```.*?```', 'rdf_serialization'),
    # Example JSON/YAML structure - implementation details
    (r'\n\s*\*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n```.*?```', 'example_structure'),
    # GLAMORCUBES category - redundant with annotations
    (r'\n\s*\*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'glamorcubes_category'),
]


def clean_description(description: str, verbose: bool = False) -> tuple[str, list[str]]:
    """
    Remove redundant sections from a class description.

    Args:
        description: The raw description text (may be empty or None-ish).
        verbose: When True, print each removed section label.

    Returns:
        tuple: (cleaned_description, list_of_removed_sections) where the
        list contains the short labels from REMOVE_PATTERNS that matched.
    """
    if not description:
        return description, []
    cleaned = description
    removed_sections = []
    for pattern, section_name in REMOVE_PATTERNS:
        regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
        # subn() tests for a match and removes it in a single pass,
        # instead of the search()-then-sub() double scan.
        cleaned, hits = regex.subn('', cleaned)
        if hits:
            removed_sections.append(section_name)
            if verbose:
                print(f" Removed: {section_name}")
    # Collapse the blank runs the removals leave behind.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = cleaned.strip()
    return cleaned, removed_sections
def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Clean the class descriptions inside a single YAML schema file.

    Returns a report dict: the file path, whether it was (or would be)
    modified, the class names touched, the section labels stripped, and
    any errors encountered. With dry_run=True nothing is written back.
    """
    report = {
        'file': str(file_path),
        'modified': False,
        'classes_processed': [],
        'removed_sections': [],
        'errors': []
    }
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        # Parse with whichever backend the import probe selected.
        if USE_RUAMEL:
            from io import StringIO
            document = yaml.load(StringIO(raw_text))
        else:
            import yaml as pyyaml
            document = pyyaml.safe_load(raw_text)
        if not document:
            return report

        changed = False
        if 'classes' in document and isinstance(document['classes'], dict):
            for class_name, class_body in document['classes'].items():
                if not isinstance(class_body, dict) or 'description' not in class_body:
                    continue
                description = class_body['description']
                if not isinstance(description, str):
                    continue
                new_description, stripped = clean_description(str(description), verbose)
                if stripped:
                    class_body['description'] = new_description
                    report['classes_processed'].append(class_name)
                    report['removed_sections'].extend(stripped)
                    changed = True
                    if verbose:
                        print(f" Class: {class_name}")

        report['modified'] = changed
        # Write back only when something changed and we are not previewing.
        if changed and not dry_run:
            with open(file_path, 'w', encoding='utf-8') as handle:
                if USE_RUAMEL:
                    yaml.dump(document, handle)
                else:
                    import yaml as pyyaml
                    pyyaml.dump(document, handle,
                                default_flow_style=False,
                                allow_unicode=True,
                                sort_keys=False,
                                width=120)
    except Exception as exc:
        report['errors'].append(str(exc))
        import traceback
        if verbose:
            traceback.print_exc()
    return report
def main():
    """CLI entry point: clean every class file (or one --file) and report."""
    parser = argparse.ArgumentParser(description='Cleanup class descriptions by removing redundant sections')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    # A single --file overrides the default directory scan.
    targets = [Path(args.file)] if args.file else sorted(classes_dir.glob('*.yaml'))

    print(f"Processing {len(targets)} class files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    stats = {
        'files_processed': 0,
        'files_modified': 0,
        'classes_processed': 0,
        'sections_removed': {},
        'errors': []
    }

    for target in targets:
        if args.verbose:
            print(f"\nProcessing: {target.name}")
        outcome = process_file(target, dry_run=args.dry_run, verbose=args.verbose)
        stats['files_processed'] += 1
        if outcome['modified']:
            stats['files_modified'] += 1
            if not args.verbose:
                print(f" Modified: {target.name} ({len(outcome['classes_processed'])} classes)")
        stats['classes_processed'] += len(outcome['classes_processed'])
        # Tally removals per section label across all files.
        for label in outcome['removed_sections']:
            stats['sections_removed'][label] = stats['sections_removed'].get(label, 0) + 1
        if outcome['errors']:
            stats['errors'].extend(outcome['errors'])
            print(f" ERROR in {target.name}: {outcome['errors']}")

    # Summary
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"Files processed: {stats['files_processed']}")
    print(f"Files modified: {stats['files_modified']}")
    print(f"Classes processed: {stats['classes_processed']}")
    print("\nSections removed by type:")
    for label, count in sorted(stats['sections_removed'].items(), key=lambda item: item[1], reverse=True):
        print(f" {label}: {count}")
    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for error in stats['errors'][:10]:
            print(f" - {error}")
    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()