225 lines
5.5 KiB
Python
225 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Create missing slot files for the LinkML schema.
|
|
|
|
This script creates slot files for slots that are used in class files
|
|
but don't have corresponding slot files in modules/slots/.
|
|
"""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
import yaml
|
|
import re
|
|
|
|
# Missing slots from lint output
|
|
MISSING_SLOTS = """actual_end
|
|
actual_start
|
|
annex_description
|
|
annex_id
|
|
annex_name
|
|
annex_reason
|
|
annotation_motivation
|
|
annotation_segments
|
|
annotation_type
|
|
aspect_ratio
|
|
available_caption_languages
|
|
caption_available
|
|
character_count
|
|
climate_control_type
|
|
comment_count
|
|
comments_fetched
|
|
common_variants
|
|
content_title
|
|
contents_description
|
|
default_audio_language
|
|
default_language
|
|
detection_count
|
|
detection_threshold
|
|
dislike_count
|
|
example_portals
|
|
favorite_count
|
|
frame_rate
|
|
frame_sample_rate
|
|
full_text
|
|
generated_by
|
|
generation_method
|
|
generation_timestamp
|
|
has_forklift_access
|
|
has_loading_dock
|
|
includes_bounding_boxes
|
|
includes_segmentation_masks
|
|
includes_speakers
|
|
is_annex_of_reading_room
|
|
is_embeddable
|
|
is_licensed_content
|
|
is_made_for_kids
|
|
is_temporary
|
|
is_verified
|
|
keyframe_extraction
|
|
like_count
|
|
live_broadcast_content
|
|
material_specialization
|
|
metrics_observed_at
|
|
model_architecture
|
|
model_provider
|
|
model_task
|
|
model_version
|
|
overall_confidence
|
|
paragraph_count
|
|
planned_closure_date
|
|
planned_end
|
|
planned_start
|
|
portal_type_description
|
|
portal_type_id
|
|
portal_type_name
|
|
primary_speaker
|
|
processing_duration_seconds
|
|
reason_description
|
|
replaces_primary_location
|
|
requires_qualification
|
|
requires_separate_registration
|
|
role_id
|
|
role_name
|
|
role_name_local
|
|
sentence_count
|
|
serves_function_of
|
|
shares_catalog_with_main
|
|
source_language_auto_detected
|
|
source_video
|
|
source_video_url
|
|
temp_location_description
|
|
temp_location_id
|
|
temp_location_name
|
|
temp_location_reason
|
|
total_frames_analyzed
|
|
transcript_format
|
|
typical_responsibilities
|
|
verification_date
|
|
verified_by
|
|
video_category_id
|
|
video_comments
|
|
view_count
|
|
warehouse_description
|
|
warehouse_floor_area_sqm
|
|
warehouse_id
|
|
warehouse_managed_by
|
|
warehouse_name
|
|
warehouse_security_level
|
|
warehouse_type
|
|
word_count""".strip().split('\n')
|
|
|
|
SLOTS_DIR = Path('/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/slots')
|
|
|
|
# Slot type inference patterns
|
|
TYPE_PATTERNS = {
|
|
r'_id$': 'uriorcurie',
|
|
r'_url$': 'uri',
|
|
r'_date$': 'date',
|
|
r'_count$': 'integer',
|
|
r'_seconds$': 'float',
|
|
r'^is_': 'boolean',
|
|
r'^has_': 'boolean',
|
|
r'^includes_': 'boolean',
|
|
r'^requires_': 'boolean',
|
|
r'_sqm$': 'float',
|
|
r'_timestamp$': 'datetime',
|
|
r'_at$': 'datetime',
|
|
}
|
|
|
|
# Slot URI mappings based on name patterns
|
|
URI_PATTERNS = {
|
|
r'_id$': 'dcterms:identifier',
|
|
r'_description$': 'dcterms:description',
|
|
r'_name$': 'skos:prefLabel',
|
|
r'_type$': 'dcterms:type',
|
|
r'_date$': 'dcterms:date',
|
|
r'_url$': 'schema:url',
|
|
r'_count$': 'schema:interactionCount',
|
|
r'_timestamp$': 'prov:atTime',
|
|
r'_at$': 'prov:atTime',
|
|
r'^verified': 'prov:wasAttributedTo',
|
|
}
|
|
|
|
def infer_range(slot_name: str) -> str:
|
|
"""Infer the range type from slot name."""
|
|
for pattern, range_type in TYPE_PATTERNS.items():
|
|
if re.search(pattern, slot_name):
|
|
return range_type
|
|
return 'string'
|
|
|
|
def infer_slot_uri(slot_name: str) -> str:
|
|
"""Infer slot_uri from slot name."""
|
|
for pattern, uri in URI_PATTERNS.items():
|
|
if re.search(pattern, slot_name):
|
|
return uri
|
|
# Default fallback
|
|
camel = ''.join(word.capitalize() for word in slot_name.split('_'))
|
|
camel = camel[0].lower() + camel[1:]
|
|
return f'hc:{camel}'
|
|
|
|
def humanize_name(slot_name: str) -> str:
|
|
"""Convert slot_name to human readable title."""
|
|
return ' '.join(word.capitalize() for word in slot_name.split('_'))
|
|
|
|
def create_slot_file(slot_name: str) -> dict:
|
|
"""Create a slot file content."""
|
|
range_type = infer_range(slot_name)
|
|
slot_uri = infer_slot_uri(slot_name)
|
|
title = humanize_name(slot_name)
|
|
|
|
content = {
|
|
'id': f'https://nde.nl/ontology/hc/slot/{slot_name}',
|
|
'name': f'{slot_name}_slot',
|
|
'title': f'{title} Slot',
|
|
'prefixes': {
|
|
'linkml': 'https://w3id.org/linkml/',
|
|
'hc': 'https://nde.nl/ontology/hc/',
|
|
'dcterms': 'http://purl.org/dc/terms/',
|
|
'schema': 'http://schema.org/',
|
|
'skos': 'http://www.w3.org/2004/02/skos/core#',
|
|
'prov': 'http://www.w3.org/ns/prov#',
|
|
},
|
|
'imports': ['linkml:types'],
|
|
'default_prefix': 'hc',
|
|
'slots': {
|
|
slot_name: {
|
|
'slot_uri': slot_uri,
|
|
'description': f'{title} for heritage custodian entities.',
|
|
'range': range_type,
|
|
}
|
|
}
|
|
}
|
|
|
|
# Add multivalued for certain patterns
|
|
if '_comments' in slot_name or '_segments' in slot_name or 'languages' in slot_name:
|
|
content['slots'][slot_name]['multivalued'] = True
|
|
|
|
return content
|
|
|
|
def main():
|
|
created = 0
|
|
skipped = 0
|
|
|
|
for slot_name in MISSING_SLOTS:
|
|
slot_file = SLOTS_DIR / f'{slot_name}.yaml'
|
|
|
|
if slot_file.exists():
|
|
print(f'SKIP: {slot_name}.yaml already exists')
|
|
skipped += 1
|
|
continue
|
|
|
|
content = create_slot_file(slot_name)
|
|
|
|
with open(slot_file, 'w') as f:
|
|
yaml.dump(content, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
|
|
|
|
print(f'CREATE: {slot_name}.yaml')
|
|
created += 1
|
|
|
|
print(f'\n=== Summary ===')
|
|
print(f'Created: {created}')
|
|
print(f'Skipped: {skipped}')
|
|
print(f'Total: {len(MISSING_SLOTS)}')
|
|
|
|
if __name__ == '__main__':
|
|
main()
|