Summary: - Create 46 missing slot definition files with proper slot_uri values - Add slot imports to main schema (01_custodian_name_modular.yaml) - Fix YAML examples sections in 116+ class and slot files - Fix PersonObservation.yaml examples section (nested objects → string literals) Technical changes: - All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS) - Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF - gen-owl now produces valid Turtle with 153,166 triples New slot files (46): - RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc. - Scope slots: scope_includes, scope_excludes, archive_scope - Organization slots: organization_type, governance_authority, area_served - Platform slots: platform_type_category, portal_type_category - Social media slots: social_media_platform_category, post_type_* - Type hierarchy slots: broader_type, narrower_types, custodian_type_broader - Wikidata slots: wikidata_equivalent, wikidata_mapping Generated output: - schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB) - Validated with rdflib: 153,166 triples, no malformed URIs
250 lines
8.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cleanup Class Descriptions - Simplified Version
|
|
|
|
This script ONLY removes redundant sections from class descriptions.
|
|
It does NOT try to extract and store complex data structures.
|
|
|
|
Sections removed:
|
|
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
|
|
- **Ontological Alignment**: - Redundant (mappings capture this)
|
|
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
|
|
- **RDF Serialization**: - Implementation detail
|
|
- **SKOS**: - Redundant (mappings capture this)
|
|
- **Dublin Core**: - Redundant (mappings capture this)
|
|
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
|
|
- **Example Structure**: - Implementation detail
|
|
|
|
Sections KEPT (contain unique information):
|
|
- **Wikidata**: Q-number reference (important)
|
|
- **Scope**: Detailed scope description
|
|
- **Notable Examples**: Real-world institution examples
|
|
- **Related Types**: Linked types with Wikidata IDs
|
|
- **Historical Significance**: Historical context
|
|
- **Dutch Context**: Dutch-specific information
|
|
- etc.
|
|
|
|
Usage:
|
|
python scripts/cleanup_class_descriptions.py [--dry-run] [--verbose] [--file PATH]
|
|
"""
|
|
|
|
import argparse
import re
import sys
from pathlib import Path

# Prefer ruamel.yaml: its round-trip mode preserves quoting and layout of the
# YAML files this script rewrites (preserve_quotes / indent configured below).
# Fall back to PyYAML when ruamel is not installed; the rest of the script
# branches on USE_RUAMEL to pick the matching load/dump API.
try:
    from ruamel.yaml import YAML
    yaml = YAML()  # round-trip loader/dumper
    yaml.preserve_quotes = True  # keep the original quoting style on dump
    yaml.width = 120  # wrap long scalars at 120 columns
    yaml.indent(mapping=2, sequence=2, offset=2)
    USE_RUAMEL = True
except ImportError:
    import yaml  # PyYAML fallback; comments/quoting are not preserved on rewrite
    USE_RUAMEL = False
|
|
|
|
|
|
# Sections to REMOVE entirely (already structured elsewhere or redundant).
# Each entry is (regex, label); the label is reported back to the caller so
# the CLI can tally how often each section type was removed. Patterns are
# applied in order with DOTALL | IGNORECASE (see clean_description below).
REMOVE_PATTERNS = [
    # Dual-class pattern - redundant with class hierarchy
    (r'\n\s*\*\*Dual-Class Pattern\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'dual_class_pattern'),

    # Ontology alignment sections - redundant with mappings
    (r'\n\s*\*\*Ontological Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontological_alignment'),
    (r'\n\s*\*\*ONTOLOGY ALIGNMENT\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontology_alignment_upper'),
    (r'\n\s*\*\*Ontology Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontology_alignment_mixed'),

    # Multilingual labels - redundant with structured_aliases
    (r'\n\s*\*\*Multilingual Labels\*\*:\s*\n(?:\s*- [a-z]{2}: .*\n)+', 'multilingual_labels'),

    # SKOS alignment - redundant with mappings
    (r'\n\s*\*\*SKOS\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'skos_alignment'),
    (r'\n\s*\*\*SKOS Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'skos_alignment_full'),

    # Dublin Core - redundant with mappings
    (r'\n\s*\*\*Dublin Core\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'dublin_core'),

    # RDF examples - implementation details
    (r'\n\s*\*\*RDF Serialization(?: Example)?\*\*:\s*\n```.*?```', 'rdf_serialization'),

    # Example JSON/YAML structure - implementation details
    (r'\n\s*\*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n```.*?```', 'example_structure'),

    # GLAMORCUBES category - redundant with annotations
    (r'\n\s*\*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'glamorcubes_category'),
]


def clean_description(description: str, verbose: bool = False) -> tuple[str, list[str]]:
    """Remove redundant sections from a class description.

    Applies every entry of REMOVE_PATTERNS in order (case-insensitive,
    dot-matches-newline), deleting all occurrences of each matching
    section, then collapses the blank lines left behind.

    Args:
        description: The raw description text (may be empty/None-ish).
        verbose: When True, print each section label as it is removed.

    Returns:
        tuple: (cleaned_description, list_of_removed_sections)
    """
    if not description:
        # Nothing to scan; hand back the falsy value unchanged.
        return description, []

    text = description
    dropped: list[str] = []

    for pattern, label in REMOVE_PATTERNS:
        matcher = re.compile(pattern, re.DOTALL | re.IGNORECASE)
        if matcher.search(text):
            text = matcher.sub('', text)
            dropped.append(label)
            if verbose:
                print(f" Removed: {label}")

    # Collapse runs of three-or-more newlines left by the removals, then
    # trim leading/trailing whitespace.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip(), dropped
|
|
|
|
|
|
def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Process a single class YAML file.

    Loads the file, runs clean_description() over each class description
    under the top-level 'classes' mapping, and — unless dry_run — writes
    the modified YAML back to the same path.

    Args:
        file_path: Path to the class YAML file.
        dry_run: When True, analyse only; never write the file back.
        verbose: When True, print per-class progress and full tracebacks.

    Returns:
        dict with keys:
            'file': str path of the processed file
            'modified': True if at least one description changed
            'classes_processed': names of classes whose description changed
            'removed_sections': labels of removed sections (may repeat)
            'errors': stringified exceptions, if any occurred
    """
    result = {
        'file': str(file_path),
        'modified': False,
        'classes_processed': [],
        'removed_sections': [],
        'errors': []
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML with whichever library was selected at import time.
        if USE_RUAMEL:
            from io import StringIO
            data = yaml.load(StringIO(content))
        else:
            import yaml as pyyaml
            data = pyyaml.safe_load(content)

        if not data:
            # Empty or comment-only file: nothing to do.
            return result

        modified = False

        # Process classes: only dict-valued entries with a string description.
        if 'classes' in data and isinstance(data['classes'], dict):
            for class_name, class_data in data['classes'].items():
                if not isinstance(class_data, dict):
                    continue

                if 'description' not in class_data:
                    continue

                desc = class_data['description']
                if not isinstance(desc, str):
                    continue

                # NOTE(review): with ruamel, desc may be a str subclass
                # (e.g. a literal block scalar); str(desc) + reassigning a
                # plain str can change the emitted scalar style on dump —
                # confirm that is acceptable for these files.
                original_desc = str(desc)
                cleaned_desc, removed = clean_description(original_desc, verbose)

                if removed:
                    class_data['description'] = cleaned_desc
                    result['classes_processed'].append(class_name)
                    result['removed_sections'].extend(removed)
                    modified = True

                    if verbose:
                        print(f" Class: {class_name}")

        result['modified'] = modified

        # Write back only when something changed and we are not previewing.
        if modified and not dry_run:
            with open(file_path, 'w', encoding='utf-8') as f:
                if USE_RUAMEL:
                    yaml.dump(data, f)
                else:
                    import yaml as pyyaml
                    pyyaml.dump(data, f,
                                default_flow_style=False,
                                allow_unicode=True,
                                sort_keys=False,
                                width=120)

    except Exception as e:
        # Deliberately broad: a single bad file must not abort the batch.
        # The error is recorded for the caller's summary instead.
        result['errors'].append(str(e))
        import traceback
        if verbose:
            traceback.print_exc()

    return result
|
|
|
|
|
|
def main():
    """CLI entry point: clean every class YAML file and print a summary."""
    parser = argparse.ArgumentParser(description='Cleanup class descriptions by removing redundant sections')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    classes_dir = Path('schemas/20251121/linkml/modules/classes')

    # Either a single explicit file, or every class YAML in the module dir.
    targets = [Path(args.file)] if args.file else sorted(classes_dir.glob('*.yaml'))

    print(f"Processing {len(targets)} class files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    # Aggregate counters across all processed files.
    stats = {
        'files_processed': 0,
        'files_modified': 0,
        'classes_processed': 0,
        'sections_removed': {},
        'errors': []
    }

    for path in targets:
        if args.verbose:
            print(f"\nProcessing: {path.name}")

        outcome = process_file(path, dry_run=args.dry_run, verbose=args.verbose)

        stats['files_processed'] += 1
        if outcome['modified']:
            stats['files_modified'] += 1
            if not args.verbose:
                # Verbose mode already printed per-class detail above.
                print(f" Modified: {path.name} ({len(outcome['classes_processed'])} classes)")

        stats['classes_processed'] += len(outcome['classes_processed'])

        # Tally removals by section label.
        section_counts = stats['sections_removed']
        for label in outcome['removed_sections']:
            section_counts[label] = section_counts.get(label, 0) + 1

        if outcome['errors']:
            stats['errors'].extend(outcome['errors'])
            print(f" ERROR in {path.name}: {outcome['errors']}")

    # Summary
    bar = '=' * 60
    print(f"\n{bar}")
    print("SUMMARY")
    print(f"{bar}")
    print(f"Files processed: {stats['files_processed']}")
    print(f"Files modified: {stats['files_modified']}")
    print(f"Classes processed: {stats['classes_processed']}")
    print(f"\nSections removed by type:")
    for label, count in sorted(stats['sections_removed'].items(), key=lambda kv: kv[1], reverse=True):
        print(f" {label}: {count}")

    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for error in stats['errors'][:10]:
            print(f" - {error}")

    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()
|