glam/scripts/migrate_annotations_to_slots.py
kempersc aa763dab25 Migrate 94 archive class annotations to ontology-aligned slots
- Add migration script: scripts/migrate_annotations_to_slots.py
- Convert custodian_types, wikidata, skos_broader, specificity_* annotations
- Replace with proper slots mapped to SKOS, PROV-O, RiC-O predicates
- Add ../slots/class_metadata_slots import to all migrated files
- Remove AcademicArchive_refactored.yaml (main file now migrated)
- Sync changes to frontend/public/schemas/

Migration converts:
  - custodian_types → hc:custodianTypes slot
  - wikidata/wikidata_label → wikidata_alignment structured slot
  - skos_broader → skos:broader slot
  - specificity_* → specificity_annotation structured slot
  - dual_class_pattern → dual_class_link structured slot
  - template_specificity → template_specificity slot

All 94 migrated schemas pass linkml-validate.
2026-01-06 11:25:37 +01:00

445 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Migrate LinkML class annotations to proper slots.
This script converts annotations like:
- custodian_types, custodian_types_rationale
- wikidata, wikidata_label
- skos_broader, skos_broader_label
- specificity_score, specificity_rationale, etc.
- dual_class_pattern, linked_collection_type, etc.
To proper slots that map to ontology predicates (SKOS, PROV-O, RiC-O, etc.)
Usage:
python scripts/migrate_annotations_to_slots.py [--dry-run] [--file <path>]
Examples:
# Dry run on all archive classes
python scripts/migrate_annotations_to_slots.py --dry-run
# Migrate a single file
python scripts/migrate_annotations_to_slots.py --file schemas/20251121/linkml/modules/classes/MunicipalArchive.yaml
# Migrate all archive classes
python scripts/migrate_annotations_to_slots.py
Author: OpenCode Claude
Date: 2026-01-06
"""
import argparse
import json
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
import yaml
# Preserve YAML formatting with custom representer
def str_representer(dumper, data):
    """Emit multi-line strings as YAML block scalars ('|'); others use the default style."""
    scalar_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=scalar_style)
yaml.add_representer(str, str_representer)
def parse_json_annotation(value: str) -> Any:
    """Decode a JSON-encoded annotation value.

    Non-string inputs and strings that are not valid JSON are returned
    unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        return value
def extract_annotations(class_def: dict) -> dict:
    """Return the 'annotations' mapping of a class definition ({} when absent)."""
    try:
        return class_def['annotations']
    except KeyError:
        return {}
def build_wikidata_alignment(annotations: dict) -> dict | None:
"""Build WikidataAlignment structured object from annotations."""
entity_id = annotations.get('wikidata')
if not entity_id:
return None
alignment = {
'entity_id': str(entity_id),
'entity_label': annotations.get('wikidata_label', ''),
}
mapping_type = annotations.get('wikidata_mapping_type', 'exact')
alignment['mapping_type'] = mapping_type
mapping_note = annotations.get('wikidata_mapping_note')
if mapping_note:
alignment['mapping_rationale'] = mapping_note
return alignment
def build_dual_class_link(annotations: dict) -> dict | None:
"""Build DualClassLink structured object from annotations."""
pattern = annotations.get('dual_class_pattern')
if not pattern:
return None
link = {
'role': pattern, # custodian_type or collection_type
}
# Determine linked class based on role
if pattern == 'custodian_type':
linked = annotations.get('linked_collection_type')
else:
linked = annotations.get('linked_custodian_type')
if linked:
link['linked_class'] = linked
note = annotations.get('dual_class_pattern_note')
if note:
link['rationale'] = note
return link
def build_specificity_annotation(annotations: dict) -> dict | None:
"""Build SpecificityAnnotation structured object from annotations."""
score = annotations.get('specificity_score')
if score is None:
return None
annotation = {
'score': float(score),
'rationale': annotations.get('specificity_rationale', ''),
}
timestamp = annotations.get('specificity_annotation_timestamp')
if timestamp:
annotation['timestamp'] = timestamp
agent = annotations.get('specificity_annotation_agent')
if agent:
annotation['agent'] = agent
return annotation
def build_template_specificity(annotations: dict) -> dict | None:
"""Build TemplateSpecificityScores from annotations."""
template_spec = annotations.get('template_specificity')
if not template_spec:
return None
if isinstance(template_spec, str):
try:
template_spec = json.loads(template_spec)
except json.JSONDecodeError:
return None
return template_spec
def migrate_class_annotations(class_def: dict, class_name: str) -> dict:
    """
    Migrate annotations to slots for a single class definition.

    Converts the legacy annotation keys (custodian_types*, wikidata*,
    skos_broader*, dual_class_pattern*, specificity_*, template_specificity)
    into entries in ``slots`` / ``slot_usage`` and strips the migrated
    annotations; any other annotations are preserved.

    Args:
        class_def: Class definition mapping from the LinkML schema.
        class_name: Name of the class (unused here; kept for API stability).

    Returns:
        A shallow copy of ``class_def`` with slots/slot_usage updated.
        Note the copy is shallow, so nested mappings may still be shared
        with the input.
    """
    annotations = extract_annotations(class_def)
    if not annotations:
        return class_def

    # Build slot usage section (handle None/empty slot_usage).
    slot_usage = class_def.get('slot_usage') or {}
    slots_to_add = set()

    # 1. Custodian types: annotation holds a JSON-encoded list.
    custodian_types = annotations.get('custodian_types')
    if custodian_types:
        parsed = parse_json_annotation(custodian_types)
        slots_to_add.add('custodian_types')
        slot_usage['custodian_types'] = {
            'equals_expression': json.dumps(parsed) if isinstance(parsed, list) else custodian_types
        }
        rationale = annotations.get('custodian_types_rationale')
        if rationale:
            slots_to_add.add('custodian_types_rationale')
            slot_usage['custodian_types_rationale'] = {'equals_string': rationale}

    # 2. Wikidata alignment -> structured WikidataAlignment slot.
    wikidata_alignment = build_wikidata_alignment(annotations)
    if wikidata_alignment:
        slots_to_add.add('wikidata_alignment')
        slot_usage['wikidata_alignment'] = {
            'range': 'WikidataAlignment',
            'inlined': True,
            'description': (
                f"Wikidata alignment: {wikidata_alignment.get('entity_id')} "
                f"({wikidata_alignment.get('entity_label', '')})"
            ),
        }

    # 3. SKOS broader: always emitted as a JSON list expression.
    skos_broader = annotations.get('skos_broader')
    if skos_broader:
        slots_to_add.add('skos_broader')
        if isinstance(skos_broader, list):
            expression = json.dumps(skos_broader)
        elif isinstance(skos_broader, str) and skos_broader.startswith('['):
            # Already a JSON list literal: pass through unchanged.  The old
            # code wrapped it again as f'["{value}"]', producing invalid
            # double-quoted JSON like '["["x"]"]'.
            expression = skos_broader
        else:
            # Single scalar -> one-element JSON list.
            expression = json.dumps([skos_broader])
        slot_usage['skos_broader'] = {'equals_expression': expression}
        skos_broader_label = annotations.get('skos_broader_label')
        if skos_broader_label:
            slots_to_add.add('skos_broader_label')
            slot_usage['skos_broader_label'] = {'equals_string': skos_broader_label}

    # 4. Dual-class pattern -> structured DualClassLink slot.
    if build_dual_class_link(annotations):
        slots_to_add.add('dual_class_link')
        slot_usage['dual_class_link'] = {'range': 'DualClassLink', 'inlined': True}

    # 5. Specificity annotation -> structured SpecificityAnnotation slot.
    if build_specificity_annotation(annotations):
        slots_to_add.add('specificity_annotation')
        slot_usage['specificity_annotation'] = {
            'range': 'SpecificityAnnotation',
            'inlined': True
        }

    # 6. Template specificity -> structured TemplateSpecificityScores slot.
    if build_template_specificity(annotations):
        slots_to_add.add('template_specificity')
        slot_usage['template_specificity'] = {
            'range': 'TemplateSpecificityScores',
            'inlined': True
        }

    new_class_def = class_def.copy()

    # Merge new slots with any already present; sort for stable output.
    merged_slots = set(new_class_def.get('slots', [])) | slots_to_add
    if merged_slots:
        new_class_def['slots'] = sorted(merged_slots)

    if slot_usage:
        new_class_def['slot_usage'] = slot_usage

    # Remove migrated annotations (keep non-migrated ones).
    migrated_keys = {
        'custodian_types', 'custodian_types_rationale',
        'wikidata', 'wikidata_label', 'wikidata_mapping_type', 'wikidata_mapping_note',
        'skos_broader', 'skos_broader_label',
        'dual_class_pattern', 'dual_class_pattern_note',
        'linked_collection_type', 'linked_custodian_type',
        'specificity_score', 'specificity_rationale',
        'specificity_annotation_timestamp', 'specificity_annotation_agent',
        'template_specificity',
    }
    remaining_annotations = {k: v for k, v in annotations.items() if k not in migrated_keys}
    if remaining_annotations:
        new_class_def['annotations'] = remaining_annotations
    elif 'annotations' in new_class_def:
        del new_class_def['annotations']
    return new_class_def
def add_class_metadata_import(schema: dict) -> dict:
    """Ensure the '../slots/class_metadata_slots' import is present.

    Mutates and returns *schema*.  A missing or null 'imports' key is
    treated as an empty list (matching the ``or {}`` pattern used for
    slot_usage elsewhere in this script); a scalar-string 'imports' value
    is normalized to a one-element list — previously this crashed on
    ``None`` and did a substring test instead of a membership test on a
    string value.
    """
    imports = schema.get('imports') or []
    if isinstance(imports, str):
        imports = [imports]  # normalize a single scalar import
    metadata_import = '../slots/class_metadata_slots'
    if metadata_import not in imports:
        imports.append(metadata_import)
    schema['imports'] = imports
    return schema
def migrate_file(file_path: Path, dry_run: bool = False) -> bool:
    """
    Migrate a single LinkML schema file.

    Reads the YAML schema, migrates legacy annotations on each class to
    slots, ensures the class_metadata_slots import, and (unless dry_run)
    rewrites the file in place.

    Args:
        file_path: YAML schema file to process.
        dry_run: When True, report what would change without writing.

    Returns:
        True if changes were made (or would be, in dry-run mode),
        False otherwise.
    """
    print(f"\n{'[DRY RUN] ' if dry_run else ''}Processing: {file_path.name}")
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    try:
        schema = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f" ERROR: Failed to parse YAML: {e}")
        return False
    if not schema:
        print(" SKIP: Empty file")  # dropped extraneous f-string prefix
        return False
    # Check if file has classes with annotations.
    classes = schema.get('classes', {})
    if not classes:
        print(" SKIP: No classes found")
        return False
    changes_made = False
    # Annotation keys that trigger migration; invariant, so hoisted out of
    # the per-class loop.
    migratable = {
        'custodian_types', 'wikidata', 'skos_broader',
        'specificity_score', 'dual_class_pattern'
    }
    # Process each class.
    for class_name, class_def in classes.items():
        if not isinstance(class_def, dict):
            continue
        annotations = class_def.get('annotations', {})
        if not annotations:
            continue
        if not any(key in annotations for key in migratable):
            continue
        print(f" Migrating class: {class_name}")
        print(f" Found annotations: {list(annotations.keys())}")
        migrated_class = migrate_class_annotations(class_def, class_name)
        classes[class_name] = migrated_class
        changes_made = True
        # Report changes.
        new_slots = migrated_class.get('slots', [])
        print(f" Added slots: {new_slots}")
        remaining = migrated_class.get('annotations', {})
        if remaining:
            print(f" Remaining annotations: {list(remaining.keys())}")
    if not changes_made:
        print(" SKIP: No migratable annotations found")
        return False
    # Add import for class_metadata_slots.
    schema = add_class_metadata_import(schema)
    schema['classes'] = classes
    if dry_run:
        print(f" [DRY RUN] Would write changes to {file_path}")
        return True
    # Write updated schema.
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(schema, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f" DONE: Updated {file_path}")
    return True
def find_archive_classes(base_path: Path) -> list[Path]:
    """Find all archive class files that need migration, sorted by path."""
    classes_dir = base_path / 'schemas/20251121/linkml/modules/classes'
    # Match *Archive*.yaml but skip _refactored variants and RecordSetTypes.
    return sorted(
        candidate
        for candidate in classes_dir.glob('*Archive*.yaml')
        if '_refactored' not in candidate.name
        and 'RecordSetTypes' not in candidate.name
    )
def main():
    """CLI entry point: parse arguments, select target files, run the migration.

    Exits normally even if individual files fail — errors are counted and
    reported in the summary rather than aborting the batch.
    """
    parser = argparse.ArgumentParser(
        description='Migrate LinkML class annotations to proper slots'
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--file', '-f',
        type=Path,
        help='Migrate a specific file instead of all archive classes'
    )
    parser.add_argument(
        '--all-classes',
        action='store_true',
        help='Migrate all class files, not just archive classes'
    )
    args = parser.parse_args()

    # Determine base path: this script lives in scripts/ under the project root.
    script_path = Path(__file__).resolve()
    base_path = script_path.parent.parent

    if args.file:
        files = [args.file]
    elif args.all_classes:
        classes_dir = base_path / 'schemas/20251121/linkml/modules/classes'
        # Exclude _refactored files.
        files = sorted(
            f for f in classes_dir.glob('*.yaml') if '_refactored' not in f.name
        )
    else:
        files = find_archive_classes(base_path)

    print(f"Found {len(files)} files to process")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print("=" * 60)

    migrated_count = 0
    skipped_count = 0
    error_count = 0
    for file_path in files:
        try:
            if migrate_file(file_path, args.dry_run):
                migrated_count += 1
            else:
                skipped_count += 1
        except Exception as e:
            # Keep going: one bad schema must not abort the whole batch.
            print(f" ERROR: {e}")
            error_count += 1

    print("\n" + "=" * 60)
    print("Summary:")  # dropped extraneous f-string prefix
    print(f" Migrated: {migrated_count}")
    print(f" Skipped: {skipped_count}")
    print(f" Errors: {error_count}")
    if args.dry_run:
        print("\nThis was a dry run. No files were modified.")
        print("Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()