glam/scripts/structuralize_class_descriptions.py
kempersc dfa667c90f Fix LinkML schema for valid RDF generation with proper slot_uri
Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
2026-01-07 13:48:03 +01:00

369 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Structuralize Class Descriptions
Migrates unstructured content from class description fields to proper LinkML slots.
For each class, this script:
1. Extracts sections like **Scope**:, **Notable Examples**:, etc. from descriptions
2. Creates slot_usage entries for the corresponding slots from description_sections.yaml
3. Removes the extracted sections from the description
4. Keeps only the core definition paragraph and **Wikidata**: reference
Target slots (from description_sections.yaml):
- scope_description: **Scope**:
- notable_examples: **Notable Examples**:
- historical_significance: **Historical Significance**:
- typical_contents: **Typical Contents**:
- related_types: **Related Types**:
- research_value: **Research Value**:
- dutch_context: **Dutch Context**:
- key_distinction: **Key Distinction**:, **Key Distinctions from Other Types**:
- administrative_context: **Administrative Context**:
- temporal_dynamics: **Temporal Dynamics**:
- use_cases: **Use Cases**:, **USE CASES**:
- heritage_sector_usage: **Heritage Sector Usage**:, **Heritage use cases**:
- characteristics: **Characteristics**:, **CHARACTERISTICS**:
- purpose: **Purpose**:, **PURPOSE**:
- class_definition: **Definition**:, **DEFINITION**:
- privacy_note: **Privacy Considerations**:
- preservation_note: **Preservation Considerations**:
Usage:
python scripts/structuralize_class_descriptions.py [--dry-run] [--verbose] [--file PATH]
"""
import argparse
import json
import re
import sys
from pathlib import Path

from ruamel.yaml import YAML
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 120
yaml.indent(mapping=2, sequence=2, offset=2)
# Section patterns mapping to slot names
# Format: (section_name, slot_name, regex_pattern, is_list)
SECTION_MAPPINGS = [
('scope', 'scope_description', r'\*\*Scope\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('definition', 'class_definition', r'\*\*(?:DEFINITION|Definition)\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('key_distinction', 'key_distinction', r'\*\*Key Distinction(?:s from Other Types)?\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('notable_examples', 'notable_examples', r'\*\*Notable Examples\*\*:\s*\n((?:- .*\n?)+)', True),
('related_types', 'related_types', r'\*\*(?:RELATED TYPES|Related Types)\*\*:\s*\n((?:- .*\n?)+)', True),
('typical_contents', 'typical_contents', r'\*\*Typical Contents\*\*:\s*\n((?:- .*\n?)+)', True),
('historical_significance', 'historical_significance', r'\*\*Historical Significance\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('research_value', 'research_value', r'\*\*Research Value\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('dutch_context', 'dutch_context', r'\*\*Dutch Context\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('administrative_context', 'administrative_context', r'\*\*Administrative Context\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('temporal_dynamics', 'temporal_dynamics', r'\*\*Temporal Dynamics\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('use_cases', 'use_cases', r'\*\*(?:USE CASES|Use Cases)\*\*:\s*\n((?:- .*\n?)+)', True),
('heritage_sector_usage', 'heritage_sector_usage', r'\*\*(?:Heritage Sector Usage|Heritage use cases)\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('characteristics', 'characteristics', r'\*\*(?:CHARACTERISTICS|Characteristics)\*\*:\s*\n((?:- .*\n?)+)', True),
('purpose', 'purpose', r'\*\*(?:PURPOSE|Purpose)\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('privacy_note', 'privacy_note', r'\*\*Privacy Considerations\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('preservation_note', 'preservation_note', r'\*\*Preservation(?:\s+Considerations)?\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
('geographic_restriction', 'geographic_restriction', r'\*\*Geographic Restriction\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
]
# Sections to REMOVE entirely (already structured elsewhere or redundant)
REMOVE_PATTERNS = [
(r'\*\*Dual-Class Pattern\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'dual_class_pattern'),
(r'\*\*Ontological Alignment\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'ontological_alignment'),
(r'\*\*Multilingual Labels\*\*:\s*\n(?:- [a-z]{2}: .*\n)+', 'multilingual_labels'),
(r'\*\*SKOS\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'skos_alignment'),
(r'\*\*Dublin Core\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'dublin_core'),
(r'\*\*RDF Serialization(?: Example)?\*\*:\s*\n```.*?```', 'rdf_serialization'),
(r'\*\*Example(?: JSON)? Structure\*\*:\s*\n```.*?```', 'example_structure'),
(r'\*\*ONTOLOGY ALIGNMENT\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'ontology_alignment_upper'),
(r'\*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'glamorcubes_category'),
]
def parse_list_content(content: str) -> list[str]:
"""Parse bullet list content into a list of strings."""
items = []
for line in content.strip().split('\n'):
line = line.strip()
if line.startswith('- '):
items.append(line[2:].strip())
elif line and items: # Continuation of previous item
items[-1] += ' ' + line
return items
def parse_notable_examples(content: str) -> list[dict]:
"""Parse notable examples into structured format."""
examples = []
for line in content.strip().split('\n'):
line = line.strip()
if line.startswith('- '):
example_text = line[2:].strip()
example = {'example_name': example_text}
# Try to extract location from parentheses
location_match = re.search(r'\(([^)]+)\)$', example_text)
if location_match:
example['example_location'] = location_match.group(1)
example['example_name'] = example_text[:location_match.start()].strip()
examples.append(example)
return examples
def parse_related_types(content: str) -> list[dict]:
"""Parse related types into structured format."""
related = []
for line in content.strip().split('\n'):
line = line.strip()
if line.startswith('- '):
type_text = line[2:].strip()
rel = {'related_type_name': type_text}
# Try to extract Wikidata ID
wikidata_match = re.search(r'\(Q(\d+)\)', type_text)
if wikidata_match:
rel['related_type_wikidata'] = f"Q{wikidata_match.group(1)}"
rel['related_type_name'] = type_text[:wikidata_match.start()].strip()
# Try to extract note after dash
note_match = re.search(r'\)\s*-\s*(.+)$', type_text)
if note_match:
rel['related_type_note'] = note_match.group(1).strip()
elif ' - ' in type_text and not wikidata_match:
parts = type_text.split(' - ', 1)
rel['related_type_name'] = parts[0].strip()
rel['related_type_note'] = parts[1].strip()
related.append(rel)
return related
def extract_sections(description: str, verbose: bool = False) -> tuple[str, dict, list[str]]:
"""
Extract structured sections from a class description.
Returns:
tuple: (cleaned_description, extracted_data, removed_sections)
"""
if not description:
return description, {}, []
cleaned = description
extracted = {}
removed_sections = []
# First, remove patterns that should be deleted entirely
for pattern, section_name in REMOVE_PATTERNS:
regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
if regex.search(cleaned):
cleaned = regex.sub('', cleaned)
removed_sections.append(section_name)
if verbose:
print(f" Removed: {section_name}")
# Extract sections to slots
for section_name, slot_name, pattern, is_list in SECTION_MAPPINGS:
regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
match = regex.search(cleaned)
if match:
content = match.group(1).strip()
if slot_name == 'notable_examples':
extracted[slot_name] = parse_notable_examples(content)
elif slot_name == 'related_types':
extracted[slot_name] = parse_related_types(content)
elif is_list:
extracted[slot_name] = parse_list_content(content)
else:
# For non-list content, clean up and store as string
extracted[slot_name] = content
cleaned = regex.sub('', cleaned)
removed_sections.append(section_name)
if verbose:
print(f" Extracted: {section_name} -> {slot_name}")
# Clean up extra whitespace
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
cleaned = cleaned.strip()
return cleaned, extracted, removed_sections
def process_class(class_name: str, class_data: dict, verbose: bool = False) -> tuple[bool, list[str]]:
"""
Process a single class, extracting structured content from its description.
Returns:
tuple: (was_modified, list_of_extracted_sections)
"""
if not isinstance(class_data, dict):
return False, []
if 'description' not in class_data or not isinstance(class_data['description'], str):
return False, []
cleaned, extracted, removed_sections = extract_sections(
class_data['description'], verbose
)
if not removed_sections:
return False, []
# Update description
class_data['description'] = cleaned
# Add extracted data to slot_usage or annotations
if extracted:
if 'slot_usage' not in class_data:
class_data['slot_usage'] = {}
elif class_data['slot_usage'] is None:
class_data['slot_usage'] = {}
import json
for slot_name, value in extracted.items():
if isinstance(value, list) and value:
if slot_name in ['notable_examples', 'related_types']:
# Complex nested structures - store as JSON string to avoid YAML formatting issues
class_data['slot_usage'][slot_name] = {
'range': 'NotableExample' if slot_name == 'notable_examples' else 'RelatedType',
'multivalued': True,
'inlined_as_list': True,
'annotations': {
'extracted_values': json.dumps(value, ensure_ascii=False)
}
}
else:
# Simple list of strings - store as JSON array string
class_data['slot_usage'][slot_name] = {
'annotations': {
'default_values': json.dumps(value, ensure_ascii=False)
}
}
elif isinstance(value, str) and value:
class_data['slot_usage'][slot_name] = {
'annotations': {
'default_value': value
}
}
return True, removed_sections
def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
"""Process a single class YAML file."""
result = {
'file': str(file_path),
'modified': False,
'classes_processed': [],
'removed_sections': [],
'errors': []
}
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
data = yaml.load(content)
if not data:
return result
modified = False
# Process classes
if 'classes' in data and isinstance(data['classes'], dict):
for class_name, class_data in data['classes'].items():
was_modified, removed = process_class(class_name, class_data, verbose)
if was_modified:
result['classes_processed'].append(class_name)
result['removed_sections'].extend(removed)
modified = True
result['modified'] = modified
if modified and not dry_run:
with open(file_path, 'w', encoding='utf-8') as f:
yaml.dump(data, f)
except Exception as e:
result['errors'].append(str(e))
import traceback
if verbose:
traceback.print_exc()
return result
def main():
parser = argparse.ArgumentParser(description='Structuralize class descriptions')
parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
parser.add_argument('--verbose', action='store_true', help='Show detailed output')
parser.add_argument('--file', type=str, help='Process a single file')
args = parser.parse_args()
classes_dir = Path('schemas/20251121/linkml/modules/classes')
if args.file:
files = [Path(args.file)]
else:
files = sorted(classes_dir.glob('*.yaml'))
print(f"Processing {len(files)} class files...")
if args.dry_run:
print("DRY RUN - no files will be modified\n")
stats = {
'files_processed': 0,
'files_modified': 0,
'classes_processed': 0,
'sections_removed': {},
'errors': []
}
for file_path in files:
if args.verbose:
print(f"\nProcessing: {file_path.name}")
result = process_file(file_path, dry_run=args.dry_run, verbose=args.verbose)
stats['files_processed'] += 1
if result['modified']:
stats['files_modified'] += 1
if not args.verbose:
print(f" Modified: {file_path.name} ({len(result['classes_processed'])} classes)")
stats['classes_processed'] += len(result['classes_processed'])
for section in result['removed_sections']:
stats['sections_removed'][section] = stats['sections_removed'].get(section, 0) + 1
if result['errors']:
stats['errors'].extend(result['errors'])
print(f" ERROR in {file_path.name}: {result['errors']}")
# Summary
print(f"\n{'=' * 60}")
print("SUMMARY")
print(f"{'=' * 60}")
print(f"Files processed: {stats['files_processed']}")
print(f"Files modified: {stats['files_modified']}")
print(f"Classes processed: {stats['classes_processed']}")
print(f"\nSections removed/extracted by type:")
for section, count in sorted(stats['sections_removed'].items(), key=lambda x: -x[1]):
print(f" {section}: {count}")
if stats['errors']:
print(f"\nErrors: {len(stats['errors'])}")
for error in stats['errors'][:10]:
print(f" - {error}")
if args.dry_run:
print("\nDRY RUN complete. Run without --dry-run to apply changes.")
if __name__ == '__main__':
main()