#!/usr/bin/env python3
"""
Fix archive class files to properly integrate with RecordSetTypes.

This script:
1. Removes duplicate 'slots:' sections
2. Adds the RecordSetType base class if missing
3. Fixes imports
"""

import os
import re
import textwrap
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional, Tuple

SCHEMA_BASE = Path("/Users/kempersc/apps/glam/schemas/20251121/linkml")
CLASSES_DIR = SCHEMA_BASE / "modules" / "classes"

# Archive classes that already have proper RecordSetType base class
ALREADY_PROPER = {"AcademicArchive"}

# Skip these - not custodian classes
SKIP_PATTERNS = ["RecordSetTypes", "Association", "Network", "OrganizationType"]

# Slot that links a custodian class to its record set types.
_LINK_SLOT = "holds_record_set_types"

# YAML for the generated RecordSetType class, written at column 0 and
# re-indented to the custodian class's depth before it is appended.
# NOTE(review): the reviewed copy of this script had its whitespace collapsed,
# so the indentation and line breaks inside this template (and the two-space
# prefixes of printed messages) are reconstructed from the usual two-space
# LinkML layout - confirm against an existing class file.
_RST_CLASS_TEMPLATE = """
# rico:RecordSetType base class for collection classification
{rst_type_name}:
  description: |
    A rico:RecordSetType for classifying collections held by {archive_name} custodians.

    **Dual-Class Pattern**:
    This class represents the COLLECTION type (rico:RecordSetType).
    For the custodian organization type, see `{archive_name}`.
  is_a: CollectionType
  class_uri: rico:RecordSetType
  slots:
    - type_scope
  see_also:
    - {archive_name}
    - rico:RecordSetType
  annotations:
    custodian_types: '["A"]'
    custodian_types_rationale: "{rst_type_name} classifies collections held by ARCHIVE (A) type custodians"
    linked_custodian_type: {archive_name}
    dual_class_pattern: collection_type
    specificity_score: 0.7
    specificity_rationale: Type taxonomy class.
    specificity_annotation_timestamp: '{timestamp}'
    specificity_annotation_agent: opencode-claude-sonnet-4
    template_specificity:
      archive_search: 0.2
      museum_search: 0.75
      library_search: 0.75
      collection_discovery: 0.75
      person_research: 0.75
      location_browse: 0.75
      identifier_lookup: 0.75
      organizational_change: 0.75
      digital_platform: 0.75
      general_heritage: 0.75
"""


def get_archive_classes(classes_dir: Optional[Path] = None) -> List[str]:
    """Get list of archive custodian class files.

    Args:
        classes_dir: Directory to scan; defaults to ``CLASSES_DIR``.

    Returns:
        Sorted stems of ``*Archive*.yaml`` files whose name contains none of
        ``SKIP_PATTERNS``.
    """
    base = CLASSES_DIR if classes_dir is None else Path(classes_dir)
    return sorted(
        path.stem
        for path in base.glob("*Archive*.yaml")
        if not any(pattern in path.stem for pattern in SKIP_PATTERNS)
    )


def _class_span(content: str, class_name: str) -> Optional[Tuple[str, int, int]]:
    """Return (header indent, body start, body end) of ``class_name``, or None.

    The body runs from the line after the ``<indent><class_name>:`` header up
    to the end of the last line indented deeper than the header; trailing
    blank/comment lines are left out so they stay with whatever follows.
    """
    header = re.search(
        rf"^([ \t]+){re.escape(class_name)}:[ \t]*(?:#[^\n]*)?\n",
        content,
        re.MULTILINE,
    )
    if header is None:
        return None
    indent = header.group(1)
    body_start = header.end()
    body_end = body_start
    cursor = body_start
    for line in content[body_start:].split("\n"):
        line_end = cursor + len(line) + 1
        text = line.strip()
        if text and not text.startswith("#"):
            if len(line) - len(line.lstrip(" \t")) <= len(indent):
                break  # next sibling class or a new top-level key
            body_end = line_end
        cursor = line_end
    return indent, body_start, min(body_end, len(content))


def _slot_name(item: str) -> str:
    """Return a slot list item without any trailing ``# comment``."""
    return re.split(r"\s+#", item, maxsplit=1)[0].strip()


def _slot_items(items_text: str) -> List[str]:
    """Return the text of every ``- item`` line in a slots list."""
    return [
        item.strip()
        for item in re.findall(r"^[ \t]+- ([^\n]+)", items_text, re.MULTILINE)
    ]


def _add_imports(content: str, archive_name: str, has_rst_type: bool) -> str:
    """Append the imports the custodian file is missing to its imports list."""
    wanted = []
    if f"./{archive_name}RecordSetTypes" not in content:
        wanted.append(f"- ./{archive_name}RecordSetTypes  # Imports concrete subclasses")
    if "../slots/holds_record_set_types" not in content:
        wanted.append("- ../slots/holds_record_set_types  # Links custodian to record set types")
    # These two are only required by the RecordSetType class that gets generated.
    if not has_rst_type and "./CollectionType" not in content:
        wanted.append("- ./CollectionType")
    if not has_rst_type and "../slots/type_scope" not in content:
        wanted.append("- ../slots/type_scope")
    if not wanted:
        return content

    section = re.search(
        r"^imports:[ \t]*\n((?:[ \t]*- [^\n]*\n)+)", content, re.MULTILINE
    )
    if section is None:
        print(f"  Warning: {archive_name}.yaml has no imports section - imports not added")
        return content
    # Reuse the indentation of the last existing entry.
    indent = re.findall(r"^([ \t]*)- ", section.group(1), re.MULTILINE)[-1]
    addition = "".join(f"{indent}{line}\n" for line in wanted)
    # Splice at the matched span so identical text elsewhere is left alone.
    return content[:section.end()] + addition + content[section.end():]


def _fix_slots(content: str, archive_name: str) -> str:
    """Give the custodian class exactly one slots list containing the link slot.

    Only slot sections inside the custodian class's own body are touched, so
    other classes in the same file (such as the generated RecordSetType class)
    keep their slots and re-running the script changes nothing.
    """
    span = _class_span(content, archive_name)
    if span is None:
        print(f"  Warning: class {archive_name} not found - slots left unchanged")
        return content
    class_indent, body_start, body_end = span

    # Indentation of the class's direct members, taken from its first member.
    first_member = re.compile(r"^([ \t]+)[^\s#]", re.MULTILINE).search(
        content, body_start, body_end
    )
    if first_member and len(first_member.group(1)) > len(class_indent):
        member_indent = first_member.group(1)
    else:
        member_indent = class_indent + "  "

    sections = list(
        re.compile(
            rf"^{re.escape(member_indent)}slots:[ \t]*\n((?:[ \t]+- [^\n]*\n)+)",
            re.MULTILINE,
        ).finditer(content, body_start, body_end)
    )

    if len(sections) == 1:
        section = sections[0]
        if _LINK_SLOT in {_slot_name(item) for item in _slot_items(section.group(1))}:
            return content
        item_indent = re.findall(r"^([ \t]+)- ", section.group(1), re.MULTILINE)[-1]
        return (
            content[:section.end()]
            + f"{item_indent}- {_LINK_SLOT}\n"
            + content[section.end():]
        )

    # Zero or several sections: build one consolidated list.
    slots = {}
    for section in sections:
        for item in _slot_items(section.group(1)):
            slots.setdefault(_slot_name(item), item)
    slots.setdefault(_LINK_SLOT, _LINK_SLOT)

    if sections:
        item_indent = re.match(r"[ \t]+", sections[0].group(1)).group(0)
        fallback = sections[0].start()
    else:
        item_indent = member_indent + member_indent[len(class_indent):]
        fallback = body_start

    # Remove back to front so earlier offsets stay valid.
    for section in reversed(sections):
        content = content[:section.start()] + content[section.end():]
    body_end -= sum(section.end() - section.start() for section in sections)

    # Preferred position is right after the class's own class_uri line; if it
    # has none, use where the first section was (or the top of the class body)
    # so the collected slots are never lost.
    class_uri = re.compile(
        rf"^{re.escape(member_indent)}class_uri:[^\n]*\n", re.MULTILINE
    ).search(content, body_start, body_end)
    insert_at = class_uri.end() if class_uri else fallback

    slots_text = (
        f"\n{member_indent}# Slots linking custodian to record set types\n"
        f"{member_indent}slots:\n"
        + "".join(f"{item_indent}- {slots[name]}\n" for name in sorted(slots))
    )
    return content[:insert_at] + slots_text + content[insert_at:]


def fix_archive_file(archive_name: str, classes_dir: Optional[Path] = None) -> bool:
    """Fix an archive class file.

    Adds missing imports, makes sure the custodian class has a single slots
    list that includes ``holds_record_set_types``, and appends an
    ``<archive_name>RecordSetType`` class when the file does not define one.
    The file is rewritten only if its content actually changed.

    Args:
        archive_name: Class name, also the stem of ``<archive_name>.yaml``.
        classes_dir: Directory holding the class files; defaults to
            ``CLASSES_DIR``.

    Returns:
        True if the file was rewritten, False if it was skipped (class file or
        its ``RecordSetTypes`` companion missing) or needed no changes.
    """
    base = CLASSES_DIR if classes_dir is None else Path(classes_dir)
    file_path = base / f"{archive_name}.yaml"
    rst_file = base / f"{archive_name}RecordSetTypes.yaml"

    if not file_path.exists():
        print(f"  Skipping {archive_name} - file not found")
        return False
    if not rst_file.exists():
        print(f"  Skipping {archive_name} - no RecordSetTypes file")
        return False

    original_content = file_path.read_text(encoding="utf-8")

    # Check if already has RecordSetType base class (a line that defines it,
    # not a mere mention inside a description).
    rst_type_name = f"{archive_name}RecordSetType"
    has_rst_type = (
        re.search(
            rf"^[ \t]*{re.escape(rst_type_name)}:", original_content, re.MULTILINE
        )
        is not None
    )

    # 1. Fix imports - add missing imports
    content = _add_imports(original_content, archive_name, has_rst_type)

    # 2. Fix duplicate / missing slots sections of the custodian class
    content = _fix_slots(content, archive_name)

    # 3. Add RecordSetType base class if not present
    if not has_rst_type:
        span = _class_span(content, archive_name)
        class_indent = span[0] if span else "  "
        # Timezone-aware UTC, so the trailing "Z" is truthful.
        timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
        rst_type_class = textwrap.indent(
            _RST_CLASS_TEMPLATE.format(
                rst_type_name=rst_type_name,
                archive_name=archive_name,
                timestamp=timestamp,
            ),
            class_indent,
        )
        content = content.rstrip() + "\n" + rst_type_class

    if content == original_content:
        print(f"  {archive_name}.yaml - no changes needed")
        return False

    file_path.write_text(content, encoding="utf-8")
    print(f"  Fixed {archive_name}.yaml")
    return True


def main():
    """Fix every archive class file in ``CLASSES_DIR`` except ``ALREADY_PROPER``."""
    print("=" * 70)
    print("Archive Class File Fixer")
    print("=" * 70)

    archives = get_archive_classes()
    print(f"\nProcessing {len(archives)} archive class files\n")

    for archive in archives:
        if archive in ALREADY_PROPER:
            print(f"  Skipping {archive} - already properly configured")
            continue
        fix_archive_file(archive)

    print("\nDone!")


if __name__ == "__main__":
    main()