glam/scripts/cleanup_class_descriptions_v2.py
kempersc dfa667c90f Fix LinkML schema for valid RDF generation with proper slot_uri
Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
2026-01-07 13:48:03 +01:00

168 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Cleanup Class Descriptions v2 - Text-Based Approach
This script removes redundant sections from class descriptions using TEXT-BASED
regex replacement, NOT YAML parsing. This preserves the exact formatting of files.
Sections removed:
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
- **Ontological Alignment**: - Redundant (mappings capture this)
- **ONTOLOGY ALIGNMENT**: - Same as above, different case
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
- **RDF Serialization**: - Implementation detail
- **SKOS**: / **SKOS Alignment**: - Redundant (mappings capture this)
- **Dublin Core**: - Redundant (mappings capture this)
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
- **Example Structure**: - Implementation detail
Usage:
python scripts/cleanup_class_descriptions_v2.py [--dry-run] [--verbose] [--file PATH]
"""
import argparse
import re
from pathlib import Path
# Patterns to remove from description content.
# These are applied as TEXT-BASED replacements (no YAML parsing) so the exact
# formatting of untouched parts of each file is preserved.  Each entry is
# (regex_pattern, section_name); patterns are compiled with
# re.DOTALL | re.IGNORECASE inside process_file().  They are designed to match
# section content without consuming the final newline before following YAML
# keys.  NOTE: the single leading space inside each pattern mirrors the
# indentation of description blocks in the class YAML files — do not reformat.
REMOVE_PATTERNS = [
    # Dual-class pattern - matches until next section or end of indented block
    (r'\n \*\*Dual-Class Pattern\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dual_class_pattern'),
    # Ontology alignment sections (various cases)
    (r'\n \*\*ONTOLOGY ALIGNMENT\*\*:[^\n]*\n(?: [^\n*][^\n]*\n| \n)*(?: [0-9]+\. \*\*[^\n]+\n(?: [^\n]+\n)*)*', 'ontology_alignment_upper'),
    (r'\n \*\*Ontological Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontological_alignment'),
    (r'\n \*\*Ontology Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontology_alignment_mixed'),
    # Multilingual labels - bullet list
    (r'\n \*\*Multilingual Labels\*\*:\n(?: - [a-z]{2,3}: [^\n]+\n)+', 'multilingual_labels'),
    # SKOS alignment sections
    (r'\n \*\*SKOS\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment'),
    (r'\n \*\*SKOS Alignment\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment_full'),
    # Dublin Core section
    (r'\n \*\*Dublin Core\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dublin_core'),
    # RDF examples with code blocks
    (r'\n \*\*RDF Serialization(?: Example)?\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'rdf_serialization'),
    # Example JSON/YAML structure with code blocks
    (r'\n \*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'example_structure'),
    # GLAMORCUBES category
    (r'\n \*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'glamorcubes_category'),
]


def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Process a single class YAML file using text-based replacement.

    Args:
        file_path: Path of the YAML file to clean up.
        dry_run: When True, detect changes but never write the file back.
        verbose: When True, print each removed section name and, on failure,
            the full traceback.

    Returns:
        A dict with keys:
            'file' (str): the processed path,
            'modified' (bool): whether the content changed,
            'removed_sections' (list[str]): names of matched patterns,
            'errors' (list[str]): stringified exceptions, if any.
    """
    result: dict = {
        'file': str(file_path),
        'modified': False,
        'removed_sections': [],
        'errors': [],
    }
    try:
        content = file_path.read_text(encoding='utf-8')
        original_content = content
        # Apply each removal pattern in declaration order (order can matter
        # when patterns could overlap).
        for pattern, section_name in REMOVE_PATTERNS:
            regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
            if regex.search(content):
                content = regex.sub('', content)
                result['removed_sections'].append(section_name)
                if verbose:
                    print(f" Removed: {section_name}")
        # Collapse runs of 4+ consecutive newlines (3+ blank lines) down to
        # at most 2 blank lines.
        content = re.sub(r'\n{4,}', '\n\n\n', content)
        # Only rewrite the file when something actually changed.
        if content != original_content:
            result['modified'] = True
            if not dry_run:
                file_path.write_text(content, encoding='utf-8')
    except Exception as e:  # record the error; the caller decides how to report it
        result['errors'].append(str(e))
        if verbose:
            import traceback  # lazy import: only needed on this error path
            traceback.print_exc()
    return result
def main():
    """CLI entry point: strip redundant sections from class YAML descriptions.

    Processes either a single file (--file) or every ``*.yaml`` under the
    default classes directory, then prints a per-section removal summary.
    """
    parser = argparse.ArgumentParser(description='Cleanup class descriptions (text-based)')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    # Default target directory (relative to the repo root); --file overrides it.
    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(classes_dir.glob('*.yaml'))

    print(f"Processing {len(files)} class files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    # Aggregate statistics across all processed files.
    stats = {
        'files_processed': 0,
        'files_modified': 0,
        'sections_removed': {},
        'errors': []
    }
    for file_path in files:
        if args.verbose:
            print(f"\nProcessing: {file_path.name}")
        result = process_file(file_path, dry_run=args.dry_run, verbose=args.verbose)
        stats['files_processed'] += 1
        if result['modified']:
            stats['files_modified'] += 1
            # In verbose mode the per-file header was already printed above.
            if not args.verbose:
                print(f" Modified: {file_path.name}")
        for section in result['removed_sections']:
            stats['sections_removed'][section] = stats['sections_removed'].get(section, 0) + 1
        if result['errors']:
            stats['errors'].extend(result['errors'])
            print(f" ERROR in {file_path.name}: {result['errors']}")

    # Summary
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"Files processed: {stats['files_processed']}")
    print(f"Files modified: {stats['files_modified']}")
    # Fixed: was an f-string with no placeholders.
    print("\nSections removed by type:")
    # Most frequently removed section types first.
    for section, count in sorted(stats['sections_removed'].items(), key=lambda x: -x[1]):
        print(f" {section}: {count}")
    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for error in stats['errors'][:10]:  # cap output at the first 10 errors
            print(f" - {error}")
    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()