glam/scripts/migrate_class_metadata_imports.py
2026-01-08 15:56:28 +01:00

290 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""
Migrate class files from importing class_metadata_slots bundle to importing specific slots.

Per Rule 38: All LinkML slots MUST be centralized in modules/slots/ and classes should
import only the specific slots they need, not a bundle.

This script:
1. Finds all class files importing ../slots/class_metadata_slots
2. Analyzes which slots from that bundle are actually used
3. Replaces the bundle import with specific slot/class imports
4. Optionally validates the migrated files (--validate)

Usage:
    python scripts/migrate_class_metadata_imports.py [--dry-run] [--validate] [--file PATH]
"""
import os
import re
import sys
import yaml
import argparse
import subprocess
from pathlib import Path
from collections import defaultdict
from typing import Set, List, Dict, Optional
# Schema locations. All of them are relative paths; main() changes the working
# directory before they are used.
SCHEMA_DIR = Path("schemas") / "20251121" / "linkml"
_MODULES_DIR = SCHEMA_DIR / "modules"
CLASSES_DIR = _MODULES_DIR / "classes"
SLOTS_DIR = _MODULES_DIR / "slots"
ENUMS_DIR = _MODULES_DIR / "enums"
# Names of the metadata slots this migration knows about, grouped by theme.
# The order here is the iteration order of SLOT_FILES.
_METADATA_SLOT_NAMES = (
    # Custodian type slots
    "custodian_types",
    "custodian_types_rationale",
    "custodian_types_primary",
    # Wikidata slots
    "wikidata_entity_id",
    "wikidata_entity_label",
    "wikidata_mapping_type",
    "wikidata_mapping_rationale",
    "wikidata_alignment",
    # SKOS slots
    "skos_broader",
    "skos_broader_label",
    "skos_narrower",
    "skos_related",
    # Dual-class slots
    "dual_class_role",
    "linked_class_name",
    "link_rationale",
    "dual_class_link",
    # Specificity slots
    "specificity_score",
    "specificity_rationale",
    "specificity_timestamp",
    "specificity_agent",
    "specificity_annotation",
    "template_specificity",
    # Per-template score slots
    "archive_search_score",
    "museum_search_score",
    "library_search_score",
    "collection_discovery_score",
    "person_research_score",
    "location_browse_score",
    "identifier_lookup_score",
    "organizational_change_score",
    "digital_platform_score",
    "general_heritage_score",
    # RiC-O slots
    "rico_organizational_principle",
    "rico_organizational_principle_uri",
    "rico_has_or_had_holder",
    "rico_has_or_had_holder_note",
    "rico_note",
    # Scope slots
    "custodian_only",
    "organizational_level",
    "geographic_restriction",
    # Multilingual labels
    "label_de",
    "label_es",
    "label_fr",
    "label_nl",
    "label_it",
    "label_pt",
    # Notes
    "privacy_note",
    "preservation_note",
    "legal_note",
)

# Slot name -> import path of its slot file (relative to the classes dir).
# Every slot file is named after the slot it defines.
SLOT_FILES = {slot_name: f"../slots/{slot_name}" for slot_name in _METADATA_SLOT_NAMES}

# Slots that need their class files imported too: slot name -> class import path.
SLOT_TO_CLASS = dict(
    wikidata_alignment="./WikidataAlignment",
    dual_class_link="./DualClassLink",
    specificity_annotation="./SpecificityAnnotation",
    template_specificity="./TemplateSpecificityScores",
)

# Set of all metadata slot names, used to detect them inside class definitions.
ALL_METADATA_SLOTS = set(SLOT_FILES)
def find_used_slots(content: str, data: dict) -> Set[str]:
    """Find which metadata slots are actually used in a class file.

    A slot counts as used when its name is in ALL_METADATA_SLOTS and it
    appears either in a class's ``slots`` list or as a key of that class's
    ``slot_usage`` mapping.

    Args:
        content: Raw text of the class file. Not consulted; kept so the
            signature stays compatible with existing callers.
        data: The class file as parsed from YAML.

    Returns:
        The set of metadata slot names referenced by any class in ``data``.
        Empty when ``data`` has no usable ``classes`` mapping.
    """
    used: Set[str] = set()
    if not isinstance(data, dict):
        return used

    # A bare `classes:` key parses to None, so `.get('classes', {})` alone
    # would not protect the loop below.
    classes = data.get('classes') or {}
    if not isinstance(classes, dict):
        return used

    for class_def in classes.values():
        # Classes declared with an empty body parse to None.
        if not isinstance(class_def, dict):
            continue
        # `slots` and `slot_usage` may each be missing or explicitly null.
        referenced = list(class_def.get('slots') or [])
        referenced.extend(class_def.get('slot_usage') or {})
        used.update(
            name for name in referenced
            if isinstance(name, str) and name in ALL_METADATA_SLOTS
        )
    return used
def generate_new_imports(used_slots: Set[str], existing_imports: List[str]) -> List[str]:
    """Generate the new import list replacing class_metadata_slots with specific imports.

    Args:
        used_slots: Metadata slot names the class file actually uses.
        existing_imports: The file's current ``imports`` list. ``None`` (a
            missing or null ``imports`` key) is treated as an empty list.

    Returns:
        A new list: the existing imports in their original order without the
        ``../slots/class_metadata_slots`` bundle, followed by the slot file of
        each used slot in sorted slot-name order. A slot listed in
        SLOT_TO_CLASS is followed directly by its class import. Imports that
        are already present are not added a second time. The input list is
        not modified.
    """
    bundle_import = "../slots/class_metadata_slots"
    new_imports = [imp for imp in (existing_imports or []) if imp != bundle_import]

    def add_once(import_path: str) -> None:
        # The list itself is the dedupe record, so no separate "seen" set is
        # needed; import lists are short.
        if import_path not in new_imports:
            new_imports.append(import_path)

    # Sorted so the generated part of the list is deterministic.
    for slot in sorted(used_slots):
        if slot not in SLOT_FILES:
            continue
        add_once(SLOT_FILES[slot])
        if slot in SLOT_TO_CLASS:
            add_once(SLOT_TO_CLASS[slot])
    return new_imports
def _rewrite_imports_block(content: str, new_imports: List[str]) -> Optional[str]:
    """Return ``content`` with its top-level imports list replaced, or None if there is none."""
    # Anchored at column 0 so only the top-level `imports:` key is touched;
    # list items may be indented and the last one may lack a trailing newline.
    pattern = re.compile(
        r'^(imports:\s*\n)((?:[ \t]*- [^\n]+(?:\n|\Z))+)',
        re.MULTILINE,
    )
    match = pattern.search(content)
    if match is None:
        return None
    if new_imports:
        # Reuse the indentation of the first existing item.
        indent = re.match(r'[ \t]*', match.group(2)).group(0)
        block = match.group(1) + ''.join(f'{indent}- {imp}\n' for imp in new_imports)
    else:
        # Avoid leaving a dangling `imports:` key, which would parse as null.
        block = 'imports: []\n'
    return content[:match.start()] + block + content[match.end():]


def _parsed_imports(text: str) -> Optional[list]:
    """Return the ``imports`` list of YAML ``text`` ([] if absent), or None if it cannot be read."""
    try:
        parsed = yaml.safe_load(text)
    except yaml.YAMLError:
        return None
    if not isinstance(parsed, dict):
        return None
    return parsed.get('imports') or []


def migrate_file(file_path: Path, dry_run: bool = False) -> bool:
    """Migrate a single class file.

    The file's ``imports`` list is rewritten textually (not re-dumped) so the
    rest of the file keeps its comments and formatting.

    Args:
        file_path: Path of the class YAML file.
        dry_run: When true, print the imports the file would get and leave
            the file untouched.

    Returns:
        True if the file was rewritten (or would be, in a dry run). False if
        the file does not mention the class_metadata_slots bundle, cannot be
        parsed, is empty, or its imports list could not be rewritten safely;
        the last three cases print an ERROR line first.
    """
    bundle_import = "../slots/class_metadata_slots"

    # YAML is UTF-8 by convention; do not depend on the platform default encoding.
    with open(file_path, encoding="utf-8") as f:
        content = f.read()

    # Cheap textual pre-check before parsing.
    if bundle_import not in content:
        return False

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f" ERROR: YAML parse error: {e}")
        return False

    if not data:
        print(" ERROR: Empty YAML file")
        return False

    used_slots = find_used_slots(content, data)
    if not used_slots:
        # File imports bundle but doesn't use any slots - just remove the import
        print(" No metadata slots used - removing import")
    else:
        print(f" Uses slots: {sorted(used_slots)}")

    # `imports:` may be present but null, in which case .get() returns None.
    existing_imports = data.get('imports') or []
    new_imports = generate_new_imports(used_slots, existing_imports)

    new_content = _rewrite_imports_block(content, new_imports)
    if new_content is None:
        print(" ERROR: Could not find a top-level block-style imports list to rewrite")
        return False

    # Re-parse the result: a partially matched list (for example one that is
    # interrupted by a comment line) must not be written out silently.
    if _parsed_imports(new_content) != new_imports:
        print(" ERROR: Rewritten imports do not match the intended list - file left unchanged")
        return False

    if dry_run:
        print(f" Would update imports to: {new_imports}")
        return True

    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(new_content)
    return True
def validate_file(file_path: Path) -> bool:
    """Validate a migrated file using linkml-validate.

    Runs ``linkml-validate --schema <file_path>`` in the current working
    directory and captures its output.

    Args:
        file_path: Path of the file to validate.

    Returns:
        True if the command exits with status 0. False if it exits non-zero
        or could not be run at all; in both cases a ``VALIDATION ERROR`` line
        is printed.
    """
    command = ['linkml-validate', '--schema', str(file_path)]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError covers a missing or non-executable linkml-validate;
        # ValueError covers invalid arguments and undecodable output.
        print(f" VALIDATION ERROR: {e}")
        return False

    if result.returncode != 0:
        # Fall back to stdout so the message is not empty when the tool
        # reports its findings there instead of on stderr.
        details = result.stderr.strip() or result.stdout.strip()
        print(f" VALIDATION ERROR: {details}")
        return False
    return True
def main():
    """Command-line entry point: migrate class files and print a summary.

    Options:
        --dry-run   show the imports each file would get without writing
        --validate  run validate_file() on each file that was rewritten
        --file      migrate only this file instead of every CLASSES_DIR/*.yaml

    The working directory is changed to the parent of this script's directory
    before any file is processed, because CLASSES_DIR is a relative path.
    A relative --file path is looked up from the directory the script was
    started in first, and from the new working directory otherwise.
    """
    parser = argparse.ArgumentParser(description='Migrate class files to use specific slot imports')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without making changes')
    parser.add_argument('--validate', action='store_true', help='Validate files after migration')
    parser.add_argument('--file', type=str, help='Migrate a specific file only')
    args = parser.parse_args()

    # Remember where the user started: a relative --file path would otherwise
    # change meaning once we chdir.
    invocation_dir = Path.cwd()

    # resolve() first - with a relative __file__, parent.parent collapses to "."
    # and the chdir would go to the wrong place.
    os.chdir(Path(__file__).resolve().parent.parent)

    if args.file:
        requested = Path(args.file)
        # Joining with an absolute `requested` yields `requested` itself.
        from_invocation_dir = invocation_dir / requested
        files = [from_invocation_dir if from_invocation_dir.exists() else requested]
    else:
        files = sorted(CLASSES_DIR.glob("*.yaml"))

    migrated = 0
    skipped = 0
    errors = 0

    for file_path in files:
        print(f"\nProcessing: {file_path.name}")
        try:
            if migrate_file(file_path, dry_run=args.dry_run):
                migrated += 1
                # Nothing was written during a dry run, so there is nothing to validate.
                if args.validate and not args.dry_run:
                    if validate_file(file_path):
                        print(" VALIDATED OK")
                    else:
                        errors += 1
            else:
                skipped += 1
                print(" Skipped (no class_metadata_slots import)")
        except Exception as e:
            # Top-level boundary: one bad file must not abort the whole run.
            errors += 1
            print(f" ERROR: {e}")

    print(f"\n{'='*60}")
    print("Migration complete:")
    print(f" Migrated: {migrated}")
    print(f" Skipped: {skipped}")
    print(f" Errors: {errors}")
    if args.dry_run:
        print("\n(Dry run - no changes made)")


if __name__ == '__main__':
    main()