glam/scripts/centralize_inline_slots.py
kempersc 174a420c08
All checks were successful
Deploy Frontend / build-and-deploy (push) Successful in 3m57s
refactor(schema): centralize 1515 inline slot definitions per Rule 48
- Remove inline slot definitions from 144 class files
- Create 7 new centralized slot files in modules/slots/:
  - custodian_type_broader.yaml
  - custodian_type_narrower.yaml
  - custodian_type_related.yaml
  - definition.yaml
  - finding_aid_access_restriction.yaml
  - finding_aid_description.yaml
  - finding_aid_temporal_coverage.yaml
- Add centralize_inline_slots.py automation script
- Update manifest with new timestamp

Rule 48: Class files must NOT define inline slots - all slots
must be imported from modules/slots/ directory.

Note: Pre-existing IdentifierFormat duplicate class definition
(in Standard.yaml and IdentifierFormat.yaml) not addressed in
this commit - requires separate schema refactor.
2026-01-11 22:02:14 +01:00

286 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Centralize inline slot definitions from class files to modules/slots/
This script:
1. Extracts all inline slot definitions from modules/classes/*.yaml
2. Checks which slots already exist in modules/slots/
3. Generates missing slot files
4. Updates class files to import centralized slots instead of inline definitions
Usage:
python scripts/centralize_inline_slots.py --dry-run # Preview changes
python scripts/centralize_inline_slots.py # Apply changes
"""
import os
import re
import sys
import yaml
import argparse
from pathlib import Path
from collections import defaultdict
from datetime import datetime
# Paths
# Schema layout assumed by this script (relative to the repo root):
#   schemas/20251121/linkml/modules/classes/*.yaml  -> class definitions
#   schemas/20251121/linkml/modules/slots/*.yaml    -> centralized slot files
SCHEMA_ROOT = Path(__file__).parent.parent / "schemas" / "20251121" / "linkml"
CLASSES_DIR = SCHEMA_ROOT / "modules" / "classes"
SLOTS_DIR = SCHEMA_ROOT / "modules" / "slots"
def load_yaml(path: Path) -> dict:
    """Parse *path* as YAML and return the document.

    An empty or null document yields ``{}`` instead of ``None`` so callers
    can index the result without guarding.
    """
    with path.open(encoding='utf-8') as fh:
        document = yaml.safe_load(fh)
    return document if document else {}
def save_yaml(path: Path, data: dict, dry_run: bool = False):
    """Serialize *data* to *path* as YAML; in dry-run mode only report the intent.

    Output preserves the insertion order of keys (``sort_keys=False``) so the
    generated files read in the order the content dicts were built.
    """
    if dry_run:
        print(f" [DRY-RUN] Would write: {path}")
        return
    with open(path, 'w', encoding='utf-8') as out:
        yaml.dump(
            data,
            out,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
def get_existing_centralized_slots() -> set:
    """Collect the names of every slot already declared under modules/slots/.

    Unparseable files are reported and skipped rather than aborting the run.
    """
    known = set()
    for path in SLOTS_DIR.glob("*.yaml"):
        try:
            content = load_yaml(path)
            if 'slots' in content:
                known.update(content['slots'].keys())
        except Exception as exc:
            print(f"Warning: Could not parse {path}: {exc}")
    return known
def extract_inline_slots(class_file: Path) -> dict:
    """Extract inline slot definitions from a class file.

    Returns a dict mapping slot_name -> slot_definition. Only mapping-valued
    entries count as definitions; bare references are ignored. Returns ``{}``
    when the file cannot be parsed or its top-level ``slots`` section is not
    a mapping.
    """
    try:
        data = load_yaml(class_file)
    except Exception as e:
        print(f"Warning: Could not parse {class_file}: {e}")
        return {}
    # Top-level slots section contains inline definitions
    inline_slots = data.get('slots', {})
    # Robustness fix: a class file may list slot *references* as a YAML list
    # (``slots: [a, b]``); calling .items() on that would raise. A non-mapping
    # section holds no inline definitions, so there is nothing to centralize.
    if not isinstance(inline_slots, dict):
        return {}
    # A slot *definition* carries properties (range, slot_uri, description, ...)
    # and therefore parses as a dict; a bare reference parses as None or str.
    return {
        slot_name: slot_def
        for slot_name, slot_def in inline_slots.items()
        if isinstance(slot_def, dict)
    }
def generate_slot_file_content(slot_name: str, slot_def: dict, source_class: str) -> dict:
    """Build the payload for a standalone centralized slot file.

    The result declares the slot under its own schema id, imports
    ``linkml:types``, carries only the namespace prefixes the slot needs,
    and records which class file the definition was lifted from.
    """
    # Namespace expansions for prefixes commonly seen in slot_uri values.
    known_namespaces = {
        'schema': 'http://schema.org/',
        'dcterms': 'http://purl.org/dc/terms/',
        'prov': 'http://www.w3.org/ns/prov#',
        'crm': 'http://www.cidoc-crm.org/cidoc-crm/',
        'rico': 'https://www.ica.org/standards/RiC/ontology#',
        'org': 'http://www.w3.org/ns/org#',
        'foaf': 'http://xmlns.com/foaf/0.1/',
        'skos': 'http://www.w3.org/2004/02/skos/core#',
        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
        'owl': 'http://www.w3.org/2002/07/owl#',
        'sosa': 'http://www.w3.org/ns/sosa/',
        'premis': 'http://www.loc.gov/premis/rdf/v3/',
        'odrl': 'http://www.w3.org/ns/odrl/2/',
        'frapo': 'http://purl.org/cerif/frapo/',
        'dcat': 'http://www.w3.org/ns/dcat#',
        'bf': 'http://id.loc.gov/ontologies/bibframe/',
    }
    # Every generated slot file starts from these two baseline prefixes.
    prefixes = {
        'linkml': 'https://w3id.org/linkml/',
        'hc': 'https://nde.nl/ontology/hc/',
    }
    # If the slot_uri is CURIE-shaped, carry its prefix over (when known).
    uri = slot_def.get('slot_uri', '')
    if uri:
        hit = re.match(r'^([a-z]+):', uri)
        if hit and hit.group(1) in known_namespaces:
            prefixes[hit.group(1)] = known_namespaces[hit.group(1)]
    # Key order matters: save_yaml dumps with sort_keys=False.
    return {
        'id': f'https://nde.nl/ontology/hc/slot/{slot_name}',
        'name': f'{slot_name}_slot',
        'title': f'{slot_name.replace("_", " ").title()} Slot',
        'prefixes': prefixes,
        'imports': ['linkml:types'],
        'default_prefix': 'hc',
        'slots': {slot_name: slot_def},
        # Provenance note: where this definition originally lived.
        'comments': [f'Centralized from {source_class} - {datetime.now().isoformat()}'],
    }
def update_class_file(class_file: Path, slots_to_remove: list, dry_run: bool = False):
    """Strip centralized inline slot definitions from *class_file*.

    Mapping-valued slots named in *slots_to_remove* are deleted from the
    top-level ``slots`` section and replaced by ``../slots/<name>`` imports.
    No-op (no write, no output) when the file is unparseable, has no mapping
    ``slots`` section, or nothing matches. In dry-run mode only reports what
    it would do.
    """
    try:
        data = load_yaml(class_file)
    except Exception as e:
        print(f"Warning: Could not parse {class_file}: {e}")
        return
    original_slots = data.get('slots')
    # Robustness fix: ``slots`` may be absent or a YAML list of references;
    # iterating .items() on a non-dict would raise. Either way there are no
    # inline definitions here to remove.
    if not isinstance(original_slots, dict):
        return
    # Partition: drop matching inline definitions, keep everything else
    # (including plain references, which are not dicts).
    remaining_slots = {}
    removed = []
    for slot_name, slot_def in original_slots.items():
        if slot_name in slots_to_remove and isinstance(slot_def, dict):
            removed.append(slot_name)
        else:
            remaining_slots[slot_name] = slot_def
    if not removed:
        return
    # Update slots section; delete it entirely if nothing remains.
    if remaining_slots:
        data['slots'] = remaining_slots
    else:
        del data['slots']
    # Add imports for the now-centralized slots. Robustness fix: an explicit
    # ``imports: null`` in the YAML yields None, which .append would reject.
    imports = data.get('imports') or []
    for slot_name in removed:
        import_path = f'../slots/{slot_name}'
        if import_path not in imports:
            imports.append(import_path)
    data['imports'] = imports
    # Save updated file
    if dry_run:
        print(f" [DRY-RUN] Would update {class_file.name}: remove {len(removed)} inline slots, add {len(removed)} imports")
    else:
        save_yaml(class_file, data)
        print(f" Updated {class_file.name}: removed {len(removed)} inline slots")
def main():
    """Command-line entry point: centralize inline slot definitions.

    Runs five numbered phases (inventory, extraction, partition, slot-file
    creation, class-file rewrite) and prints a summary. ``--dry-run`` reports
    every change without touching disk; ``--verbose`` adds per-item detail.
    """
    parser = argparse.ArgumentParser(description='Centralize inline slot definitions')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without writing')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("Centralizing Inline Slot Definitions")
    print(banner)

    # Phase 1: inventory the slots that are already centralized.
    print("\n1. Checking existing centralized slots...")
    centralized = get_existing_centralized_slots()
    print(f" Found {len(centralized)} slots already in modules/slots/")

    # Phase 2: harvest inline definitions; the first file to define a
    # slot name wins (later duplicates keep the first definition).
    print("\n2. Extracting inline slot definitions from class files...")
    inline_defs = {}   # slot name -> (definition, name of the file it came from)
    per_file = {}      # class file Path -> slot names it carries inline
    for class_file in sorted(CLASSES_DIR.glob("*.yaml")):
        found = extract_inline_slots(class_file)
        if not found:
            continue
        per_file[class_file] = list(found.keys())
        for name, definition in found.items():
            inline_defs.setdefault(name, (definition, class_file.name))
    print(f" Found {len(inline_defs)} unique inline slot definitions")
    print(f" Across {len(per_file)} class files")

    # Phase 3: partition into already-centralized vs. still-missing.
    print("\n3. Identifying slots to create...")
    already = {name for name in inline_defs if name in centralized}
    pending = {name: pair for name, pair in inline_defs.items() if name not in centralized}
    print(f" {len(already)} slots already exist (will remove inline, keep import)")
    print(f" {len(pending)} slots need to be created")
    if args.verbose:
        print("\n Slots to create:")
        for name in sorted(pending)[:20]:
            print(f" - {name}")
        if len(pending) > 20:
            print(f" ... and {len(pending) - 20} more")

    # Phase 4: write one centralized file per missing slot.
    print("\n4. Creating centralized slot files...")
    created = 0
    for name in sorted(pending):
        definition, origin = pending[name]
        target = SLOTS_DIR / f"{name}.yaml"
        if target.exists():
            if args.verbose:
                print(f" Skipping {name} (file exists)")
            continue
        content = generate_slot_file_content(name, definition, origin)
        if args.dry_run:
            print(f" [DRY-RUN] Would create: {target.name}")
        else:
            save_yaml(target, content)
            if args.verbose:
                print(f" Created: {target.name}")
        created += 1
    print(f" {'Would create' if args.dry_run else 'Created'} {created} slot files")

    # Phase 5: rewrite the class files that held the inline definitions.
    print("\n5. Updating class files to remove inline definitions...")
    updated = 0
    for class_file, names in sorted(per_file.items()):
        update_class_file(class_file, names, args.dry_run)
        updated += 1
    print(f" {'Would update' if args.dry_run else 'Updated'} {updated} class files")

    # Final report.
    print("\n" + banner)
    print("Summary")
    print(banner)
    print(f" Slots already centralized: {len(already)}")
    print(f" New slot files {'to create' if args.dry_run else 'created'}: {created}")
    print(f" Class files {'to update' if args.dry_run else 'updated'}: {updated}")
    if args.dry_run:
        print("\n Run without --dry-run to apply changes.")
    else:
        print("\n Done! Remember to:")
        print(" 1. Run linkml-validate to verify schema integrity")
        print(" 2. Update manifest.json")
        print(" 3. Commit changes")
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
main()