Summary: - Create 46 missing slot definition files with proper slot_uri values - Add slot imports to main schema (01_custodian_name_modular.yaml) - Fix YAML examples sections in 116+ class and slot files - Fix PersonObservation.yaml examples section (nested objects → string literals) Technical changes: - All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS) - Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF - gen-owl now produces valid Turtle with 153,166 triples New slot files (46): - RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc. - Scope slots: scope_includes, scope_excludes, archive_scope - Organization slots: organization_type, governance_authority, area_served - Platform slots: platform_type_category, portal_type_category - Social media slots: social_media_platform_category, post_type_* - Type hierarchy slots: broader_type, narrower_types, custodian_type_broader - Wikidata slots: wikidata_equivalent, wikidata_mapping Generated output: - schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB) - Validated with rdflib: 153,166 triples, no malformed URIs
369 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Structuralize Class Descriptions
|
|
|
|
Migrates unstructured content from class description fields to proper LinkML slots.
|
|
|
|
For each class, this script:
|
|
1. Extracts sections like **Scope**:, **Notable Examples**:, etc. from descriptions
|
|
2. Creates slot_usage entries for the corresponding slots from description_sections.yaml
|
|
3. Removes the extracted sections from the description
|
|
4. Keeps only the core definition paragraph and **Wikidata**: reference
|
|
|
|
Target slots (from description_sections.yaml):
|
|
- scope_description: **Scope**:
|
|
- notable_examples: **Notable Examples**:
|
|
- historical_significance: **Historical Significance**:
|
|
- typical_contents: **Typical Contents**:
|
|
- related_types: **Related Types**:
|
|
- research_value: **Research Value**:
|
|
- dutch_context: **Dutch Context**:
|
|
- key_distinction: **Key Distinction**:, **Key Distinctions from Other Types**:
|
|
- administrative_context: **Administrative Context**:
|
|
- temporal_dynamics: **Temporal Dynamics**:
|
|
- use_cases: **Use Cases**:, **USE CASES**:
|
|
- heritage_sector_usage: **Heritage Sector Usage**:, **Heritage use cases**:
|
|
- characteristics: **Characteristics**:, **CHARACTERISTICS**:
|
|
- purpose: **Purpose**:, **PURPOSE**:
|
|
- class_definition: **Definition**:, **DEFINITION**:
|
|
- privacy_note: **Privacy Considerations**:
|
|
- preservation_note: **Preservation Considerations**:
|
|
|
|
Usage:
|
|
python scripts/structuralize_class_descriptions.py [--dry-run] [--verbose] [--file PATH]
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from ruamel.yaml import YAML
|
|
|
|
yaml = YAML()
|
|
yaml.preserve_quotes = True
|
|
yaml.width = 120
|
|
yaml.indent(mapping=2, sequence=2, offset=2)
|
|
|
|
# Section patterns mapping to slot names
|
|
# Format: (section_name, slot_name, regex_pattern, is_list)
|
|
SECTION_MAPPINGS = [
|
|
('scope', 'scope_description', r'\*\*Scope\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('definition', 'class_definition', r'\*\*(?:DEFINITION|Definition)\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('key_distinction', 'key_distinction', r'\*\*Key Distinction(?:s from Other Types)?\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('notable_examples', 'notable_examples', r'\*\*Notable Examples\*\*:\s*\n((?:- .*\n?)+)', True),
|
|
('related_types', 'related_types', r'\*\*(?:RELATED TYPES|Related Types)\*\*:\s*\n((?:- .*\n?)+)', True),
|
|
('typical_contents', 'typical_contents', r'\*\*Typical Contents\*\*:\s*\n((?:- .*\n?)+)', True),
|
|
('historical_significance', 'historical_significance', r'\*\*Historical Significance\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('research_value', 'research_value', r'\*\*Research Value\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('dutch_context', 'dutch_context', r'\*\*Dutch Context\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('administrative_context', 'administrative_context', r'\*\*Administrative Context\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('temporal_dynamics', 'temporal_dynamics', r'\*\*Temporal Dynamics\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('use_cases', 'use_cases', r'\*\*(?:USE CASES|Use Cases)\*\*:\s*\n((?:- .*\n?)+)', True),
|
|
('heritage_sector_usage', 'heritage_sector_usage', r'\*\*(?:Heritage Sector Usage|Heritage use cases)\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('characteristics', 'characteristics', r'\*\*(?:CHARACTERISTICS|Characteristics)\*\*:\s*\n((?:- .*\n?)+)', True),
|
|
('purpose', 'purpose', r'\*\*(?:PURPOSE|Purpose)\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('privacy_note', 'privacy_note', r'\*\*Privacy Considerations\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('preservation_note', 'preservation_note', r'\*\*Preservation(?:\s+Considerations)?\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
('geographic_restriction', 'geographic_restriction', r'\*\*Geographic Restriction\*\*:\s*\n(.*?)(?=\n\*\*[A-Z]|\Z)', False),
|
|
]
|
|
|
|
# Sections to REMOVE entirely (already structured elsewhere or redundant)
|
|
REMOVE_PATTERNS = [
|
|
(r'\*\*Dual-Class Pattern\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'dual_class_pattern'),
|
|
(r'\*\*Ontological Alignment\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'ontological_alignment'),
|
|
(r'\*\*Multilingual Labels\*\*:\s*\n(?:- [a-z]{2}: .*\n)+', 'multilingual_labels'),
|
|
(r'\*\*SKOS\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'skos_alignment'),
|
|
(r'\*\*Dublin Core\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'dublin_core'),
|
|
(r'\*\*RDF Serialization(?: Example)?\*\*:\s*\n```.*?```', 'rdf_serialization'),
|
|
(r'\*\*Example(?: JSON)? Structure\*\*:\s*\n```.*?```', 'example_structure'),
|
|
(r'\*\*ONTOLOGY ALIGNMENT\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'ontology_alignment_upper'),
|
|
(r'\*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:\s*\n.*?(?=\n\*\*[A-Z]|\Z)', 'glamorcubes_category'),
|
|
]
|
|
|
|
|
|
def parse_list_content(content: str) -> list[str]:
|
|
"""Parse bullet list content into a list of strings."""
|
|
items = []
|
|
for line in content.strip().split('\n'):
|
|
line = line.strip()
|
|
if line.startswith('- '):
|
|
items.append(line[2:].strip())
|
|
elif line and items: # Continuation of previous item
|
|
items[-1] += ' ' + line
|
|
return items
|
|
|
|
|
|
def parse_notable_examples(content: str) -> list[dict]:
|
|
"""Parse notable examples into structured format."""
|
|
examples = []
|
|
for line in content.strip().split('\n'):
|
|
line = line.strip()
|
|
if line.startswith('- '):
|
|
example_text = line[2:].strip()
|
|
example = {'example_name': example_text}
|
|
|
|
# Try to extract location from parentheses
|
|
location_match = re.search(r'\(([^)]+)\)$', example_text)
|
|
if location_match:
|
|
example['example_location'] = location_match.group(1)
|
|
example['example_name'] = example_text[:location_match.start()].strip()
|
|
|
|
examples.append(example)
|
|
return examples
|
|
|
|
|
|
def parse_related_types(content: str) -> list[dict]:
|
|
"""Parse related types into structured format."""
|
|
related = []
|
|
for line in content.strip().split('\n'):
|
|
line = line.strip()
|
|
if line.startswith('- '):
|
|
type_text = line[2:].strip()
|
|
rel = {'related_type_name': type_text}
|
|
|
|
# Try to extract Wikidata ID
|
|
wikidata_match = re.search(r'\(Q(\d+)\)', type_text)
|
|
if wikidata_match:
|
|
rel['related_type_wikidata'] = f"Q{wikidata_match.group(1)}"
|
|
rel['related_type_name'] = type_text[:wikidata_match.start()].strip()
|
|
|
|
# Try to extract note after dash
|
|
note_match = re.search(r'\)\s*-\s*(.+)$', type_text)
|
|
if note_match:
|
|
rel['related_type_note'] = note_match.group(1).strip()
|
|
elif ' - ' in type_text and not wikidata_match:
|
|
parts = type_text.split(' - ', 1)
|
|
rel['related_type_name'] = parts[0].strip()
|
|
rel['related_type_note'] = parts[1].strip()
|
|
|
|
related.append(rel)
|
|
return related
|
|
|
|
|
|
def extract_sections(description: str, verbose: bool = False) -> tuple[str, dict, list[str]]:
|
|
"""
|
|
Extract structured sections from a class description.
|
|
|
|
Returns:
|
|
tuple: (cleaned_description, extracted_data, removed_sections)
|
|
"""
|
|
if not description:
|
|
return description, {}, []
|
|
|
|
cleaned = description
|
|
extracted = {}
|
|
removed_sections = []
|
|
|
|
# First, remove patterns that should be deleted entirely
|
|
for pattern, section_name in REMOVE_PATTERNS:
|
|
regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
|
|
if regex.search(cleaned):
|
|
cleaned = regex.sub('', cleaned)
|
|
removed_sections.append(section_name)
|
|
if verbose:
|
|
print(f" Removed: {section_name}")
|
|
|
|
# Extract sections to slots
|
|
for section_name, slot_name, pattern, is_list in SECTION_MAPPINGS:
|
|
regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
|
|
match = regex.search(cleaned)
|
|
|
|
if match:
|
|
content = match.group(1).strip()
|
|
|
|
if slot_name == 'notable_examples':
|
|
extracted[slot_name] = parse_notable_examples(content)
|
|
elif slot_name == 'related_types':
|
|
extracted[slot_name] = parse_related_types(content)
|
|
elif is_list:
|
|
extracted[slot_name] = parse_list_content(content)
|
|
else:
|
|
# For non-list content, clean up and store as string
|
|
extracted[slot_name] = content
|
|
|
|
cleaned = regex.sub('', cleaned)
|
|
removed_sections.append(section_name)
|
|
if verbose:
|
|
print(f" Extracted: {section_name} -> {slot_name}")
|
|
|
|
# Clean up extra whitespace
|
|
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
|
|
cleaned = cleaned.strip()
|
|
|
|
return cleaned, extracted, removed_sections
|
|
|
|
|
|
def process_class(class_name: str, class_data: dict, verbose: bool = False) -> tuple[bool, list[str]]:
|
|
"""
|
|
Process a single class, extracting structured content from its description.
|
|
|
|
Returns:
|
|
tuple: (was_modified, list_of_extracted_sections)
|
|
"""
|
|
if not isinstance(class_data, dict):
|
|
return False, []
|
|
|
|
if 'description' not in class_data or not isinstance(class_data['description'], str):
|
|
return False, []
|
|
|
|
cleaned, extracted, removed_sections = extract_sections(
|
|
class_data['description'], verbose
|
|
)
|
|
|
|
if not removed_sections:
|
|
return False, []
|
|
|
|
# Update description
|
|
class_data['description'] = cleaned
|
|
|
|
# Add extracted data to slot_usage or annotations
|
|
if extracted:
|
|
if 'slot_usage' not in class_data:
|
|
class_data['slot_usage'] = {}
|
|
elif class_data['slot_usage'] is None:
|
|
class_data['slot_usage'] = {}
|
|
|
|
import json
|
|
for slot_name, value in extracted.items():
|
|
if isinstance(value, list) and value:
|
|
if slot_name in ['notable_examples', 'related_types']:
|
|
# Complex nested structures - store as JSON string to avoid YAML formatting issues
|
|
class_data['slot_usage'][slot_name] = {
|
|
'range': 'NotableExample' if slot_name == 'notable_examples' else 'RelatedType',
|
|
'multivalued': True,
|
|
'inlined_as_list': True,
|
|
'annotations': {
|
|
'extracted_values': json.dumps(value, ensure_ascii=False)
|
|
}
|
|
}
|
|
else:
|
|
# Simple list of strings - store as JSON array string
|
|
class_data['slot_usage'][slot_name] = {
|
|
'annotations': {
|
|
'default_values': json.dumps(value, ensure_ascii=False)
|
|
}
|
|
}
|
|
elif isinstance(value, str) and value:
|
|
class_data['slot_usage'][slot_name] = {
|
|
'annotations': {
|
|
'default_value': value
|
|
}
|
|
}
|
|
|
|
return True, removed_sections
|
|
|
|
|
|
def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
|
|
"""Process a single class YAML file."""
|
|
result = {
|
|
'file': str(file_path),
|
|
'modified': False,
|
|
'classes_processed': [],
|
|
'removed_sections': [],
|
|
'errors': []
|
|
}
|
|
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
|
|
data = yaml.load(content)
|
|
if not data:
|
|
return result
|
|
|
|
modified = False
|
|
|
|
# Process classes
|
|
if 'classes' in data and isinstance(data['classes'], dict):
|
|
for class_name, class_data in data['classes'].items():
|
|
was_modified, removed = process_class(class_name, class_data, verbose)
|
|
|
|
if was_modified:
|
|
result['classes_processed'].append(class_name)
|
|
result['removed_sections'].extend(removed)
|
|
modified = True
|
|
|
|
result['modified'] = modified
|
|
|
|
if modified and not dry_run:
|
|
with open(file_path, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f)
|
|
|
|
except Exception as e:
|
|
result['errors'].append(str(e))
|
|
import traceback
|
|
if verbose:
|
|
traceback.print_exc()
|
|
|
|
return result
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Structuralize class descriptions')
|
|
parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
|
|
parser.add_argument('--verbose', action='store_true', help='Show detailed output')
|
|
parser.add_argument('--file', type=str, help='Process a single file')
|
|
args = parser.parse_args()
|
|
|
|
classes_dir = Path('schemas/20251121/linkml/modules/classes')
|
|
|
|
if args.file:
|
|
files = [Path(args.file)]
|
|
else:
|
|
files = sorted(classes_dir.glob('*.yaml'))
|
|
|
|
print(f"Processing {len(files)} class files...")
|
|
if args.dry_run:
|
|
print("DRY RUN - no files will be modified\n")
|
|
|
|
stats = {
|
|
'files_processed': 0,
|
|
'files_modified': 0,
|
|
'classes_processed': 0,
|
|
'sections_removed': {},
|
|
'errors': []
|
|
}
|
|
|
|
for file_path in files:
|
|
if args.verbose:
|
|
print(f"\nProcessing: {file_path.name}")
|
|
|
|
result = process_file(file_path, dry_run=args.dry_run, verbose=args.verbose)
|
|
|
|
stats['files_processed'] += 1
|
|
if result['modified']:
|
|
stats['files_modified'] += 1
|
|
if not args.verbose:
|
|
print(f" Modified: {file_path.name} ({len(result['classes_processed'])} classes)")
|
|
|
|
stats['classes_processed'] += len(result['classes_processed'])
|
|
|
|
for section in result['removed_sections']:
|
|
stats['sections_removed'][section] = stats['sections_removed'].get(section, 0) + 1
|
|
|
|
if result['errors']:
|
|
stats['errors'].extend(result['errors'])
|
|
print(f" ERROR in {file_path.name}: {result['errors']}")
|
|
|
|
# Summary
|
|
print(f"\n{'=' * 60}")
|
|
print("SUMMARY")
|
|
print(f"{'=' * 60}")
|
|
print(f"Files processed: {stats['files_processed']}")
|
|
print(f"Files modified: {stats['files_modified']}")
|
|
print(f"Classes processed: {stats['classes_processed']}")
|
|
print(f"\nSections removed/extracted by type:")
|
|
for section, count in sorted(stats['sections_removed'].items(), key=lambda x: -x[1]):
|
|
print(f" {section}: {count}")
|
|
|
|
if stats['errors']:
|
|
print(f"\nErrors: {len(stats['errors'])}")
|
|
for error in stats['errors'][:10]:
|
|
print(f" - {error}")
|
|
|
|
if args.dry_run:
|
|
print("\nDRY RUN complete. Run without --dry-run to apply changes.")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|