Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
168 lines · 6.3 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Cleanup Class Descriptions v2 - Text-Based Approach
|
|
|
|
This script removes redundant sections from class descriptions using TEXT-BASED
|
|
regex replacement, NOT YAML parsing. This preserves the exact formatting of files.
|
|
|
|
Sections removed:
|
|
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
|
|
- **Ontological Alignment**: - Redundant (mappings capture this)
|
|
- **ONTOLOGY ALIGNMENT**: - Same as above, different case
|
|
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
|
|
- **RDF Serialization**: - Implementation detail
|
|
- **SKOS**: / **SKOS Alignment**: - Redundant (mappings capture this)
|
|
- **Dublin Core**: - Redundant (mappings capture this)
|
|
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
|
|
- **Example Structure**: - Implementation detail
|
|
|
|
Usage:
|
|
python scripts/cleanup_class_descriptions_v2.py [--dry-run] [--verbose] [--file PATH]
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
# Patterns to remove from description content
|
|
# These patterns are designed to match section content without consuming the final newline before YAML keys
|
|
REMOVE_PATTERNS = [
|
|
# Dual-class pattern - matches until next section or end of indented block
|
|
(r'\n \*\*Dual-Class Pattern\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dual_class_pattern'),
|
|
|
|
# Ontology alignment sections (various cases)
|
|
(r'\n \*\*ONTOLOGY ALIGNMENT\*\*:[^\n]*\n(?: [^\n*][^\n]*\n| \n)*(?: [0-9]+\. \*\*[^\n]+\n(?: [^\n]+\n)*)*', 'ontology_alignment_upper'),
|
|
(r'\n \*\*Ontological Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontological_alignment'),
|
|
(r'\n \*\*Ontology Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontology_alignment_mixed'),
|
|
|
|
# Multilingual labels - bullet list
|
|
(r'\n \*\*Multilingual Labels\*\*:\n(?: - [a-z]{2,3}: [^\n]+\n)+', 'multilingual_labels'),
|
|
|
|
# SKOS alignment sections
|
|
(r'\n \*\*SKOS\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment'),
|
|
(r'\n \*\*SKOS Alignment\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment_full'),
|
|
|
|
# Dublin Core section
|
|
(r'\n \*\*Dublin Core\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dublin_core'),
|
|
|
|
# RDF examples with code blocks
|
|
(r'\n \*\*RDF Serialization(?: Example)?\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'rdf_serialization'),
|
|
|
|
# Example JSON/YAML structure with code blocks
|
|
(r'\n \*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'example_structure'),
|
|
|
|
# GLAMORCUBES category
|
|
(r'\n \*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'glamorcubes_category'),
|
|
]
|
|
|
|
|
|
def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
|
|
"""Process a single class YAML file using text-based replacement."""
|
|
result = {
|
|
'file': str(file_path),
|
|
'modified': False,
|
|
'removed_sections': [],
|
|
'errors': []
|
|
}
|
|
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
|
|
original_content = content
|
|
|
|
# Apply each removal pattern
|
|
for pattern, section_name in REMOVE_PATTERNS:
|
|
regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
|
|
if regex.search(content):
|
|
content = regex.sub('', content)
|
|
result['removed_sections'].append(section_name)
|
|
if verbose:
|
|
print(f" Removed: {section_name}")
|
|
|
|
# Clean up multiple consecutive blank lines (more than 2)
|
|
content = re.sub(r'\n{4,}', '\n\n\n', content)
|
|
|
|
# Check if content changed
|
|
if content != original_content:
|
|
result['modified'] = True
|
|
|
|
if not dry_run:
|
|
with open(file_path, 'w', encoding='utf-8') as f:
|
|
f.write(content)
|
|
|
|
except Exception as e:
|
|
result['errors'].append(str(e))
|
|
import traceback
|
|
if verbose:
|
|
traceback.print_exc()
|
|
|
|
return result
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Cleanup class descriptions (text-based)')
|
|
parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
|
|
parser.add_argument('--verbose', action='store_true', help='Show detailed output')
|
|
parser.add_argument('--file', type=str, help='Process a single file')
|
|
args = parser.parse_args()
|
|
|
|
classes_dir = Path('schemas/20251121/linkml/modules/classes')
|
|
|
|
if args.file:
|
|
files = [Path(args.file)]
|
|
else:
|
|
files = sorted(classes_dir.glob('*.yaml'))
|
|
|
|
print(f"Processing {len(files)} class files...")
|
|
if args.dry_run:
|
|
print("DRY RUN - no files will be modified\n")
|
|
|
|
stats = {
|
|
'files_processed': 0,
|
|
'files_modified': 0,
|
|
'sections_removed': {},
|
|
'errors': []
|
|
}
|
|
|
|
for file_path in files:
|
|
if args.verbose:
|
|
print(f"\nProcessing: {file_path.name}")
|
|
|
|
result = process_file(file_path, dry_run=args.dry_run, verbose=args.verbose)
|
|
|
|
stats['files_processed'] += 1
|
|
if result['modified']:
|
|
stats['files_modified'] += 1
|
|
if not args.verbose:
|
|
print(f" Modified: {file_path.name}")
|
|
|
|
for section in result['removed_sections']:
|
|
stats['sections_removed'][section] = stats['sections_removed'].get(section, 0) + 1
|
|
|
|
if result['errors']:
|
|
stats['errors'].extend(result['errors'])
|
|
print(f" ERROR in {file_path.name}: {result['errors']}")
|
|
|
|
# Summary
|
|
print(f"\n{'=' * 60}")
|
|
print("SUMMARY")
|
|
print(f"{'=' * 60}")
|
|
print(f"Files processed: {stats['files_processed']}")
|
|
print(f"Files modified: {stats['files_modified']}")
|
|
print(f"\nSections removed by type:")
|
|
for section, count in sorted(stats['sections_removed'].items(), key=lambda x: -x[1]):
|
|
print(f" {section}: {count}")
|
|
|
|
if stats['errors']:
|
|
print(f"\nErrors: {len(stats['errors'])}")
|
|
for error in stats['errors'][:10]:
|
|
print(f" - {error}")
|
|
|
|
if args.dry_run:
|
|
print("\nDRY RUN complete. Run without --dry-run to apply changes.")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|