glam/scripts/create_missing_slots.py

225 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Create missing slot files for the LinkML schema.
This script creates slot files for slots that are used in class files
but don't have corresponding slot files in modules/slots/.
"""
import os
from pathlib import Path
import yaml
import re
# Slot names the schema linter reported as referenced by class files but
# lacking a slot file. One name per line; whitespace-splitting the literal
# yields the list of names in the order written here.
MISSING_SLOTS = """
actual_end
actual_start
annex_description
annex_id
annex_name
annex_reason
annotation_motivation
annotation_segments
annotation_type
aspect_ratio
available_caption_languages
caption_available
character_count
climate_control_type
comment_count
comments_fetched
common_variants
content_title
contents_description
default_audio_language
default_language
detection_count
detection_threshold
dislike_count
example_portals
favorite_count
frame_rate
frame_sample_rate
full_text
generated_by
generation_method
generation_timestamp
has_forklift_access
has_loading_dock
includes_bounding_boxes
includes_segmentation_masks
includes_speakers
is_annex_of_reading_room
is_embeddable
is_licensed_content
is_made_for_kids
is_temporary
is_verified
keyframe_extraction
like_count
live_broadcast_content
material_specialization
metrics_observed_at
model_architecture
model_provider
model_task
model_version
overall_confidence
paragraph_count
planned_closure_date
planned_end
planned_start
portal_type_description
portal_type_id
portal_type_name
primary_speaker
processing_duration_seconds
reason_description
replaces_primary_location
requires_qualification
requires_separate_registration
role_id
role_name
role_name_local
sentence_count
serves_function_of
shares_catalog_with_main
source_language_auto_detected
source_video
source_video_url
temp_location_description
temp_location_id
temp_location_name
temp_location_reason
total_frames_analyzed
transcript_format
typical_responsibilities
verification_date
verified_by
video_category_id
video_comments
view_count
warehouse_description
warehouse_floor_area_sqm
warehouse_id
warehouse_managed_by
warehouse_name
warehouse_security_level
warehouse_type
word_count
""".split()
# Directory in which main() looks for, and writes, `<slot_name>.yaml` files.
# NOTE(review): absolute path into one user's home directory; the script only
# works on a machine where this checkout location exists.
SLOTS_DIR = Path('/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/slots')
# Regex -> LinkML range used to guess a slot's type from its name.
# Entry order is significant: infer_range() walks this mapping in insertion
# order and returns the range of the first pattern that matches.
TYPE_PATTERNS = {
    # Identifier / link suffixes
    r'_id$': 'uriorcurie',
    r'_url$': 'uri',
    # Date, counter and duration suffixes
    r'_date$': 'date',
    r'_count$': 'integer',
    r'_seconds$': 'float',
    # Flag-like prefixes
    r'^is_': 'boolean',
    r'^has_': 'boolean',
    r'^includes_': 'boolean',
    r'^requires_': 'boolean',
    # Area and point-in-time suffixes
    r'_sqm$': 'float',
    r'_timestamp$': 'datetime',
    r'_at$': 'datetime',
}
# Regex -> slot_uri (CURIE) used to guess a slot's semantic mapping from its
# name. Entry order is significant: infer_slot_uri() returns the URI of the
# first pattern that matches, in insertion order.
URI_PATTERNS = {
    # Dublin Core / SKOS descriptive properties
    r'_id$': 'dcterms:identifier',
    r'_description$': 'dcterms:description',
    r'_name$': 'skos:prefLabel',
    r'_type$': 'dcterms:type',
    r'_date$': 'dcterms:date',
    # schema.org properties
    r'_url$': 'schema:url',
    # NOTE(review): this applies to every *_count slot, including textual
    # counts such as word_count -- confirm that mapping is intended.
    r'_count$': 'schema:interactionCount',
    # PROV-O provenance properties
    r'_timestamp$': 'prov:atTime',
    r'_at$': 'prov:atTime',
    r'^verified': 'prov:wasAttributedTo',
}
def infer_range(slot_name: str) -> str:
    """Guess the LinkML range for a slot from its name.

    The patterns in TYPE_PATTERNS are tried in insertion order and the range
    of the first one that matches ``slot_name`` is returned. When no pattern
    matches, the range defaults to ``'string'``.
    """
    matching_ranges = (
        candidate
        for regex, candidate in TYPE_PATTERNS.items()
        if re.search(regex, slot_name)
    )
    # The generator is lazy, so this stops at the first hit.
    return next(matching_ranges, 'string')
def infer_slot_uri(slot_name: str) -> str:
    """Guess the slot_uri CURIE for a slot from its name.

    The patterns in URI_PATTERNS are tried in insertion order and the URI of
    the first one that matches is returned. When none matches, the name is
    converted to lowerCamelCase and placed in the ``hc:`` namespace, e.g.
    ``aspect_ratio`` -> ``hc:aspectRatio``.
    """
    mapped = next(
        (curie for regex, curie in URI_PATTERNS.items() if re.search(regex, slot_name)),
        None,
    )
    if mapped is not None:
        return mapped

    # Fallback: PascalCase every underscore-separated word, then lower-case
    # only the very first character to obtain lowerCamelCase.
    pascal = ''.join(map(str.capitalize, slot_name.split('_')))
    return f'hc:{pascal[0].lower()}{pascal[1:]}'
def humanize_name(slot_name: str) -> str:
    """Turn a snake_case slot name into a space-separated title.

    Each underscore-separated word is capitalized with ``str.capitalize``
    (first letter upper-cased, the rest lower-cased), e.g.
    ``frame_sample_rate`` -> ``Frame Sample Rate``.
    """
    words = slot_name.split('_')
    return ' '.join(map(str.capitalize, words))
def create_slot_file(slot_name: str) -> dict:
    """Build the content of a single-slot LinkML schema module.

    The returned dict carries the schema header (``id``, ``name``, ``title``,
    ``prefixes``, ``imports``, ``default_prefix``) and a ``slots`` mapping
    with one entry for ``slot_name``. The slot's ``slot_uri`` and ``range``
    are inferred from the name. ``multivalued: True`` is added when the name
    contains ``_comments``, ``_segments`` or ``languages``.

    Key insertion order is deliberate: main() dumps the dict with
    ``sort_keys=False``, so this order is the order written to the YAML file.
    """
    title = humanize_name(slot_name)

    slot_definition = {
        'slot_uri': infer_slot_uri(slot_name),
        'description': f'{title} for heritage custodian entities.',
        'range': infer_range(slot_name),
    }
    # Names hinting at a collection get marked as multivalued.
    plural_markers = ('_comments', '_segments', 'languages')
    if any(marker in slot_name for marker in plural_markers):
        slot_definition['multivalued'] = True

    return {
        'id': f'https://nde.nl/ontology/hc/slot/{slot_name}',
        'name': f'{slot_name}_slot',
        'title': f'{title} Slot',
        'prefixes': {
            'linkml': 'https://w3id.org/linkml/',
            'hc': 'https://nde.nl/ontology/hc/',
            'dcterms': 'http://purl.org/dc/terms/',
            'schema': 'http://schema.org/',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'prov': 'http://www.w3.org/ns/prov#',
        },
        'imports': ['linkml:types'],
        'default_prefix': 'hc',
        'slots': {slot_name: slot_definition},
    }
def main():
    """Write a YAML slot file for every MISSING_SLOTS entry that lacks one.

    For each slot name, ``SLOTS_DIR/<slot_name>.yaml`` is checked. Existing
    files are never overwritten; they are reported and counted as skipped.
    Otherwise the content from create_slot_file() is dumped as block-style
    YAML, preserving key order. A created/skipped/total summary is printed
    at the end.
    """
    created = 0
    skipped = 0
    for slot_name in MISSING_SLOTS:
        slot_file = SLOTS_DIR / f'{slot_name}.yaml'
        if slot_file.exists():
            print(f'SKIP: {slot_name}.yaml already exists')
            skipped += 1
            continue
        content = create_slot_file(slot_name)
        # allow_unicode=True makes PyYAML emit non-ASCII characters verbatim,
        # so pin the file encoding to UTF-8 rather than relying on the
        # platform's locale default.
        with open(slot_file, 'w', encoding='utf-8') as f:
            yaml.dump(content, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
        print(f'CREATE: {slot_name}.yaml')
        created += 1
    print('\n=== Summary ===')
    print(f'Created: {created}')
    print(f'Skipped: {skipped}')
    print(f'Total: {len(MISSING_SLOTS)}')


if __name__ == '__main__':
    main()