# glam/scripts/migrate_provenance_phase1_5.py

#!/usr/bin/env python3
"""
Phase 1.5: Migrate ch_annotator sections with claude-conversation agent.

This script handles files created from CH-Annotator sources that have
`agent: claude-conversation` in:
1. ch_annotator.extraction_provenance.agent
2. ch_annotator.entity_claims[].provenance.agent

These files were correctly skipped by Phase 1 but need their agent identifiers
updated to comply with Rule 35 (dual timestamp requirement).

Strategy:
- Replace `agent: claude-conversation` with a valid agent identifier
- Since these files were created via `create_custodian_from_ch_annotator.py`,
  the correct agent is: `batch-script-create-custodian-from-ch-annotator`
- Add migration note to track the change
- Preserve the existing timestamp by moving it to `source_archived_at` (what it
  represents semantically) and record the migration time as `statement_created_at`
"""

import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Tuple

import yaml


# Dedicated dumper subclass used for writing migrated files; the custom string
# representer is registered on it via SafeDumper.add_representer.
class SafeDumper(yaml.SafeDumper):
    """yaml.SafeDumper subclass that carries this script's custom representers."""


def str_representer(dumper, data):
    """
    Represent a Python string as a YAML scalar.

    Strings containing a newline are emitted in literal block style (``|``);
    all other strings are emitted with the dumper's default scalar style.
    """
    # style=None tells the dumper to fall back to its default style.
    scalar_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=scalar_style)


# Route every `str` value through str_representer when dumping with SafeDumper.
SafeDumper.add_representer(str, str_representer)


# Constants
# Root directory that main() scans recursively for *.yaml files.
# NOTE(review): absolute path on one developer machine — confirm before running elsewhere.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Agent identifier that marks a provenance block as needing migration.
OLD_AGENT = "claude-conversation"
# Agent identifier written in place of OLD_AGENT.
NEW_AGENT = "batch-script-create-custodian-from-ch-annotator"
# UTC calendar date (YYYY-MM-DD), computed once at import time and embedded in
# each migration note.
MIGRATION_DATE = datetime.now(timezone.utc).strftime("%Y-%m-%d")


def migrate_provenance_block(prov: dict, context: str) -> bool:
    """
    Rewrite one provenance mapping in place if it is attributed to OLD_AGENT.

    When the block's ``agent`` equals OLD_AGENT:
    - ``agent`` is set to NEW_AGENT;
    - a lone ``timestamp`` (no ``source_archived_at`` present) is moved to
      ``source_archived_at`` and ``statement_created_at`` is set to the
      current UTC time in ISO 8601 form;
    - a ``migration_note`` naming ``context`` is added unless one exists.

    Returns True if the block was modified, False if it was left untouched.
    """
    # Guard clause: blocks owned by any other agent are not touched.
    if prov.get("agent") != OLD_AGENT:
        return False

    prov["agent"] = NEW_AGENT

    # Single-timestamp blocks become dual-timestamp blocks; the old value is
    # kept under its new key and the original key is removed.
    needs_dual_timestamps = "timestamp" in prov and "source_archived_at" not in prov
    if needs_dual_timestamps:
        prov["source_archived_at"] = prov.pop("timestamp")
        prov["statement_created_at"] = datetime.now(timezone.utc).isoformat()

    # Never overwrite a note left by an earlier migration.
    prov.setdefault(
        "migration_note",
        f"Migrated from agent:{OLD_AGENT} on {MIGRATION_DATE} ({context})",
    )

    return True


def migrate_file(filepath: Path, dry_run: bool = False) -> Tuple[bool, str]:
    """
    Migrate a single YAML file.

    Looks for provenance blocks in three places and passes each one to
    ``migrate_provenance_block``:
    1. ``ch_annotator.extraction_provenance``
    2. ``ch_annotator.entity_claims[].provenance``
    3. top-level ``extraction_provenance``

    Sections that are missing, null, or not of the expected mapping/list type
    are skipped rather than aborting the whole file, so one malformed claim
    does not block migration of the remaining blocks.

    Args:
        filepath: Path of the YAML file to migrate.
        dry_run: When True, compute the changes but do not write the file.

    Returns (changed, message) tuple. Failures are never raised; they are
    reported as ``(False, "Error: ...")`` so a batch run can continue.

    NOTE: the file is rewritten from the parsed data, so YAML comments and the
    original formatting are not preserved.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Quick check - skip parsing entirely if the old agent never appears
        if OLD_AGENT not in content:
            return False, "No migration needed"

        data = yaml.safe_load(content)
        if not data:
            return False, "Empty file"
        if not isinstance(data, dict):
            # Report clearly instead of failing later with an AttributeError.
            return False, f"Error: top-level YAML is {type(data).__name__}, expected a mapping"

        changes = []

        # Check ch_annotator.extraction_provenance
        ch_annotator = data.get("ch_annotator")
        if isinstance(ch_annotator, dict):
            extraction_prov = ch_annotator.get("extraction_provenance")
            if isinstance(extraction_prov, dict) and migrate_provenance_block(
                extraction_prov, "ch_annotator.extraction_provenance"
            ):
                changes.append("ch_annotator.extraction_provenance")

            # Check ch_annotator.entity_claims[].provenance
            entity_claims = ch_annotator.get("entity_claims")
            if isinstance(entity_claims, list):
                for i, claim in enumerate(entity_claims):
                    if not isinstance(claim, dict):
                        continue
                    claim_prov = claim.get("provenance")
                    if isinstance(claim_prov, dict) and migrate_provenance_block(
                        claim_prov, f"entity_claims[{i}].provenance"
                    ):
                        changes.append(f"entity_claims[{i}]")

        # Also check top-level extraction_provenance (in case Phase 1 missed any)
        top_extraction_prov = data.get("extraction_provenance")
        if isinstance(top_extraction_prov, dict) and migrate_provenance_block(
            top_extraction_prov, "extraction_provenance"
        ):
            changes.append("extraction_provenance")

        if not changes:
            return False, "No migration needed"

        if not dry_run:
            # Serialize fully BEFORE opening the file for writing: opening with
            # 'w' truncates it, so a failure during dumping must not be able to
            # leave a half-written or empty file behind.
            serialized = yaml.dump(data, Dumper=SafeDumper, default_flow_style=False,
                                   allow_unicode=True, sort_keys=False, width=120)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(serialized)

        return True, f"Migrated: {', '.join(changes)}"

    except Exception as e:
        # Per-file boundary: the caller counts and prints errors, so report
        # the failure in the message instead of stopping the batch.
        return False, f"Error: {e}"


def main():
    """
    Command-line entry point: find and migrate files in the custodian directory.

    Recursively scans ``*.yaml`` files for the literal text
    ``agent: claude-conversation``, runs ``migrate_file`` on each match, and
    prints per-file results followed by a summary.

    Options:
        --dry-run        report what would change without modifying files
        --verbose        also print skipped files (and migrated files outside dry-run)
        --limit N        process at most N matching files (0 processes none)
        --custodian-dir  directory to scan; defaults to CUSTODIAN_DIR
    """
    import argparse

    parser = argparse.ArgumentParser(description="Phase 1.5: Migrate CH-Annotator provenance")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without modifying files")
    parser.add_argument("--verbose", action="store_true", help="Show all files processed")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--custodian-dir", type=Path, default=CUSTODIAN_DIR,
                        help="Directory to scan recursively (default: %(default)s)")
    args = parser.parse_args()

    # A negative limit would slice from the end of the list, silently
    # processing "all but the last N" files.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    custodian_dir = args.custodian_dir
    # rglob on a missing directory yields nothing, which would look like a
    # successful run with zero files; fail loudly instead.
    if not custodian_dir.is_dir():
        parser.error(f"Directory not found: {custodian_dir}")

    print("Phase 1.5: Migrating CH-Annotator provenance")
    print(f"Directory: {custodian_dir}")
    print(f"Dry run: {args.dry_run}")
    print("-" * 60)

    # Find all files with agent: claude-conversation (not just migration notes)
    # Use rglob to include subdirectories (archive/, etc.)
    # Sorted so that --limit selects a reproducible subset across runs.
    files_to_check = []
    search_pattern = f"agent: {OLD_AGENT}"
    for f in sorted(custodian_dir.rglob("*.yaml")):
        try:
            with open(f, 'r', encoding='utf-8') as fp:
                if search_pattern in fp.read():
                    files_to_check.append(f)
        except Exception as e:
            print(f"Warning: Could not read {f}: {e}")

    print(f"Found {len(files_to_check)} files with '{OLD_AGENT}'")

    # `is not None` so that an explicit --limit 0 is honoured rather than
    # being treated as "no limit".
    if args.limit is not None:
        files_to_check = files_to_check[:args.limit]
        print(f"Processing first {args.limit} files")

    print("-" * 60)

    migrated = 0
    errors = 0
    skipped = 0

    for filepath in files_to_check:
        changed, message = migrate_file(filepath, dry_run=args.dry_run)

        # migrate_file reports failures as messages beginning with "Error".
        if message.startswith("Error"):
            errors += 1
            print(f"ERROR: {filepath.name}: {message}")
        elif changed:
            migrated += 1
            if args.verbose or args.dry_run:
                print(f"{'[DRY RUN] ' if args.dry_run else ''}MIGRATED: {filepath.name}: {message}")
        else:
            skipped += 1
            if args.verbose:
                print(f"SKIPPED: {filepath.name}: {message}")

    print("-" * 60)
    print("Summary:")
    print(f"  Migrated: {migrated}")
    print(f"  Skipped:  {skipped}")
    print(f"  Errors:   {errors}")
    print(f"  Total:    {len(files_to_check)}")

    if args.dry_run:
        print("\n[DRY RUN] No files were modified. Run without --dry-run to apply changes.")

# Run the migration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()