# glam/scripts/fix_archive_class_files.py

#!/usr/bin/env python3
"""
Fix archive class files to properly integrate with RecordSetTypes.

This script:
1. Removes duplicate 'slots:' sections
2. Adds the RecordSetType base class if missing
3. Fixes imports
"""

import os
import re
from pathlib import Path
from datetime import datetime

# Root of the LinkML schema tree the script operates on.
# NOTE(review): this is an absolute, machine-specific path; the script only
# finds files when the checkout lives at exactly this location.
SCHEMA_BASE = Path("/Users/kempersc/apps/glam/schemas/20251121/linkml")
# Directory that is globbed for ``*Archive*.yaml`` class files.
CLASSES_DIR = SCHEMA_BASE / "modules" / "classes"

# Archive classes that already have proper RecordSetType base class
# (main() skips these by exact name).
ALREADY_PROPER = {"AcademicArchive"}

# Skip these - not custodian classes
# (matched as substrings of the file stem, not as whole names).
SKIP_PATTERNS = ["RecordSetTypes", "Association", "Network", "OrganizationType"]


def get_archive_classes() -> list:
    """Return the sorted stems of the archive custodian class files.

    Every ``*Archive*.yaml`` file in CLASSES_DIR is considered; a file is
    dropped when its stem contains any of the SKIP_PATTERNS substrings.
    """
    candidate_stems = (path.stem for path in CLASSES_DIR.glob("*Archive*.yaml"))
    return sorted(
        stem
        for stem in candidate_stems
        if not any(pattern in stem for pattern in SKIP_PATTERNS)
    )


_HOLDS_SLOT = "holds_record_set_types"

# A class-level ``slots:`` list: key at 4 spaces, items at 6 spaces.
_SLOTS_SECTION_RE = re.compile(r'^    slots:\n((?:      - [^\n]+\n)+)', re.MULTILINE)
_SLOT_ITEM_RE = re.compile(r'^      - ([^\n]+)', re.MULTILINE)
# First line after a class header that starts a new key at class level or above.
_NEXT_KEY_RE = re.compile(r'^ {0,2}[^\s#]', re.MULTILINE)


def _ensure_imports(content: str, archive_name: str, has_rst_type: bool) -> str:
    """Return *content* with any missing import entries appended to ``imports:``."""
    candidates = [
        (f"./{archive_name}RecordSetTypes", "  # Imports concrete subclasses", True),
        ("../slots/holds_record_set_types", "  # Links custodian to record set types", True),
        # These two are only needed by the RecordSetType class this script appends.
        ("./CollectionType", "", not has_rst_type),
        ("../slots/type_scope", "", not has_rst_type),
    ]
    missing = [
        f"  - {target}{note}"
        for target, note, wanted in candidates
        if wanted and target not in content
    ]
    if not missing:
        return content

    section = re.search(r'(imports:\n(?:  - [^\n]+\n)+)', content)
    if section is None:
        print(f"  Warning: {archive_name}.yaml has no imports section - imports not added")
        return content
    cut = section.end()
    return content[:cut] + '\n'.join(missing) + '\n' + content[cut:]


def _class_span(content: str, class_name: str):
    """Return ``(start, end)`` offsets of *class_name*'s definition, or None.

    The definition starts at the ``  <class_name>:`` header line and runs up
    to the next non-comment line indented by two spaces or fewer (or EOF).
    """
    header = re.search(
        rf'^  {re.escape(class_name)}:[ \t]*(?:#[^\n]*)?\n', content, re.MULTILINE
    )
    if header is None:
        return None
    following = _NEXT_KEY_RE.search(content, header.end())
    return header.start(), (following.start() if following else len(content))


def _ensure_holds_slot(content: str, archive_name: str) -> str:
    """Return *content* with one ``slots:`` list containing the holds slot.

    Only the custodian class's own definition is inspected, so ``slots:``
    lists of other classes in the same file (notably the appended
    RecordSetType class) are never merged into it or removed.
    """
    span = _class_span(content, archive_name)
    if span is None:
        print(f"  Warning: class {archive_name} not found - slots not updated")
        return content
    start, end = span
    body = content[start:end]
    sections = list(_SLOTS_SECTION_RE.finditer(body))

    if len(sections) == 1:
        only = sections[0]
        if _HOLDS_SLOT in only.group(0):
            return content
        cut = only.end()
        body = body[:cut] + f"      - {_HOLDS_SLOT}\n" + body[cut:]
        return content[:start] + body + content[end:]

    # Zero sections (create one) or several (merge duplicates into one).
    slot_names = {_HOLDS_SLOT}
    for section in sections:
        slot_names.update(
            name.rstrip() for name in _SLOT_ITEM_RE.findall(section.group(1))
        )

    stripped = body
    for section in reversed(sections):
        stripped = stripped[:section.start()] + stripped[section.end():]

    anchor = re.search(r'class_uri: [^\n]+\n', stripped)
    if anchor is None:
        # Without an insertion point, leave the existing slots untouched
        # instead of deleting them.
        print(f"  Warning: {archive_name} has no class_uri - slots not updated")
        return content

    slots_text = "\n    # Slots linking custodian to record set types\n    slots:\n"
    slots_text += "".join(f"      - {name}\n" for name in sorted(slot_names))
    cut = anchor.end()
    stripped = stripped[:cut] + slots_text + stripped[cut:]
    return content[:start] + stripped + content[end:]


def _render_record_set_type_class(archive_name: str, rst_type_name: str, timestamp: str) -> str:
    """Return the YAML text of the RecordSetType class appended to the file."""
    return f'''
  # rico:RecordSetType base class for collection classification
  {rst_type_name}:
    description: |
      A rico:RecordSetType for classifying collections held by {archive_name} custodians.

      **Dual-Class Pattern**:
      This class represents the COLLECTION type (rico:RecordSetType).
      For the custodian organization type, see `{archive_name}`.
    is_a: CollectionType
    class_uri: rico:RecordSetType
    slots:
      - type_scope
    see_also:
      - {archive_name}
      - rico:RecordSetType
    annotations:
      custodian_types: '["A"]'
      custodian_types_rationale: "{rst_type_name} classifies collections held by ARCHIVE (A) type custodians"
      linked_custodian_type: {archive_name}
      dual_class_pattern: collection_type
      specificity_score: 0.7
      specificity_rationale: Type taxonomy class.
      specificity_annotation_timestamp: '{timestamp}'
      specificity_annotation_agent: opencode-claude-sonnet-4
      template_specificity:
        archive_search: 0.2
        museum_search: 0.75
        library_search: 0.75
        collection_discovery: 0.75
        person_research: 0.75
        location_browse: 0.75
        identifier_lookup: 0.75
        organizational_change: 0.75
        digital_platform: 0.75
        general_heritage: 0.75
'''


def fix_archive_file(archive_name: str, classes_dir: "Path | None" = None) -> None:
    """Fix an archive class file in place.

    Three fixes are applied to ``<archive_name>.yaml``:

    1. missing entries are appended to the ``imports:`` list;
    2. the custodian class gets exactly one ``slots:`` list that contains
       ``holds_record_set_types`` (duplicate lists inside that class are
       merged and sorted);
    3. a ``<archive_name>RecordSetType`` class is appended when the file
       does not define one yet.

    The file is rewritten only if its content actually changed, and a
    second run on an already fixed file leaves it untouched.

    Args:
        archive_name: Stem of the class file, which is also the class name.
        classes_dir: Directory holding the class files; defaults to
            CLASSES_DIR.

    Returns:
        None. Progress is reported on stdout.
    """
    from datetime import timezone

    base_dir = CLASSES_DIR if classes_dir is None else Path(classes_dir)
    file_path = base_dir / f"{archive_name}.yaml"
    rst_file = base_dir / f"{archive_name}RecordSetTypes.yaml"

    if not file_path.exists():
        print(f"  Skipping {archive_name} - file not found")
        return

    if not rst_file.exists():
        print(f"  Skipping {archive_name} - no RecordSetTypes file")
        return

    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    original_content = file_path.read_text(encoding="utf-8")

    # Check if already has RecordSetType base class
    rst_type_name = f"{archive_name}RecordSetType"
    has_rst_type = f"{rst_type_name}:" in original_content

    content = _ensure_imports(original_content, archive_name, has_rst_type)
    content = _ensure_holds_slot(content, archive_name)

    if not has_rst_type:
        # The trailing "Z" claims UTC, so the clock must really be read in UTC.
        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        rst_type_class = _render_record_set_type_class(archive_name, rst_type_name, timestamp)
        content = content.rstrip() + '\n' + rst_type_class

    if content != original_content:
        file_path.write_text(content, encoding="utf-8")
        print(f"  Fixed {archive_name}.yaml")
    else:
        print(f"  {archive_name}.yaml - no changes needed")


def main():
    """Run the fixer over every discovered archive class file.

    Prints a banner, then processes each name returned by
    get_archive_classes(); names listed in ALREADY_PROPER are reported and
    skipped, all others are passed to fix_archive_file().
    """
    banner = "=" * 70
    print(banner)
    print("Archive Class File Fixer")
    print(banner)

    archives = get_archive_classes()
    print(f"\nProcessing {len(archives)} archive class files\n")

    for archive in archives:
        if archive not in ALREADY_PROPER:
            fix_archive_file(archive)
        else:
            print(f"  Skipping {archive} - already properly configured")

    print("\nDone!")


# Run the fixer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()