#!/usr/bin/env python3
|
|
"""
|
|
Migrate `agent: claude-conversation` provenance to Rule 35 compliant dual timestamps.
|
|
|
|
This script handles Phase 1: ISIL/CSV registry sources (~18,000 files).
|
|
These files have provenance paths like `/files/{country}_complete.yaml` and don't
|
|
require LLM processing - just timestamp restructuring.
|
|
|
|
Phase 2 (conversation sources) and Phase 3 (web sources) require GLM4.7 + web-reader
|
|
and are handled separately.
|
|
|
|
Migration Rules (from .opencode/PROVENANCE_TIMESTAMP_RULES.md):
|
|
1. Every provenance MUST have `statement_created_at` and `source_archived_at`
|
|
2. `agent: claude-conversation` is INVALID - replace with proper agent identifier
|
|
3. `source_archived_at` must be <= `statement_created_at`
|
|
|
|
Usage:
|
|
# Dry run (no changes)
|
|
python scripts/migrate_claude_conversation_provenance.py --dry-run
|
|
|
|
# Process specific file
|
|
python scripts/migrate_claude_conversation_provenance.py --file data/custodian/JP-01-TOM-L-H.yaml
|
|
|
|
# Process all files (with backup)
|
|
python scripts/migrate_claude_conversation_provenance.py --backup
|
|
|
|
# Limit processing
|
|
python scripts/migrate_claude_conversation_provenance.py --limit 100 --dry-run
|
|
|
|
Author: OpenCode/Claude
|
|
Created: 2025-12-30
|
|
Related: .opencode/PROVENANCE_TIMESTAMP_RULES.md, .opencode/CLAUDE_CONVERSATION_MIGRATION_SPEC.md
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import shutil
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
# YAML handling - use ruamel.yaml to preserve formatting.
# Preference order: ruamel.yaml (round-trip safe, keeps quoting/ordering) ->
# PyYAML (works, but may reformat files on write) -> hard exit.
HAS_RUAMEL = False
yaml_handler = None   # ruamel.yaml.YAML instance when available, else None
pyyaml_module = None  # PyYAML module when used as fallback, else None

try:
    from ruamel.yaml import YAML  # type: ignore[import-untyped]
    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping
    HAS_RUAMEL = True
except ImportError:
    try:
        import yaml as pyyaml_module  # type: ignore[import-untyped]
        print("Warning: ruamel.yaml not installed. Using PyYAML (may lose formatting).")
    except ImportError:
        # No YAML backend at all: the script cannot do anything useful.
        print("Error: No YAML library available. Install ruamel.yaml or PyYAML.")
        sys.exit(1)
|
|
|
|
# Constants
CUSTODIAN_DIR = Path("data/custodian")
BACKUP_DIR = Path("data/custodian.backup")
# Captured once at import time so every file touched in a single run carries
# the same migration timestamp.
MIGRATION_TIMESTAMP = datetime.now(timezone.utc).isoformat()

# Source type patterns for categorization.
# Category -> list of regexes tried against the provenance path; first match
# wins (see detect_source_category).
SOURCE_PATTERNS = {
    'isil_csv': [
        r'/files/.*_complete\.yaml',
        r'/files/.*_isil.*\.yaml',
        r'/files/.*_ch_annotator\.yaml',
        r'/instances/.*\.yaml',
    ],
    'conversation': [
        r'/conversations/[a-f0-9-]+',
    ],
    'web': [
        r'https?://',
    ],
}

# Agent mapping based on source file patterns.
# Keys are substrings matched case-insensitively against the provenance path;
# 'default' is the fallback agent when no substring matches.
AGENT_MAPPING = {
    'japan_complete': 'batch-script-create-custodian-from-ch-annotator',
    'austria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'switzerland_isil': 'batch-script-create-custodian-from-ch-annotator',
    'czech_unified': 'batch-script-create-custodian-from-ch-annotator',
    'bulgaria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'belgium_complete': 'batch-script-create-custodian-from-ch-annotator',
    'netherlands_complete': 'batch-script-create-custodian-from-ch-annotator',
    'norway': 'batch-script-create-custodian-from-ch-annotator',
    'default': 'batch-script-ch-annotator-extraction',
}
|
|
|
|
|
|
def detect_source_category(path: str, patterns: Optional[Dict[str, List[str]]] = None) -> str:
    """
    Detect the source category from the provenance path.

    Args:
        path: Provenance path or URL (e.g. '/files/japan_complete.yaml').
        patterns: Optional mapping of category name -> list of regex patterns.
            Defaults to the module-level SOURCE_PATTERNS.

    Returns:
        The first category whose patterns match (dict order defines
        precedence): 'isil_csv', 'conversation', 'web' with the default
        patterns — or 'unknown' when the path is empty or nothing matches.
    """
    if not path:
        return 'unknown'

    if patterns is None:
        patterns = SOURCE_PATTERNS

    for category, pattern_list in patterns.items():
        for pattern in pattern_list:
            if re.search(pattern, path):
                return category

    return 'unknown'
|
|
|
|
|
|
def get_agent_for_source(path: str, mapping: Optional[Dict[str, str]] = None) -> str:
    """
    Determine the appropriate agent identifier based on source path.

    Args:
        path: Provenance path; mapping keys are matched as case-insensitive
            substrings of it.
        mapping: Optional substring -> agent-identifier mapping containing a
            'default' key. Defaults to the module-level AGENT_MAPPING.

    Returns:
        The agent for the first key (dict order) found in the lowercased
        path, or mapping['default'] when the path is empty or nothing
        matches.
    """
    if mapping is None:
        mapping = AGENT_MAPPING

    if not path:
        return mapping['default']

    path_lower = path.lower()
    for key, agent in mapping.items():
        # 'default' is the fallback sentinel, not a path pattern — skip it so
        # a path containing the literal word 'default' can't match it early.
        if key == 'default':
            continue
        if key in path_lower:
            return agent

    return mapping['default']
|
|
|
|
|
|
def _to_iso_string(value: Any) -> Any:
    """Coerce a datetime to its ISO-8601 string; pass other values through.

    YAML loaders (ruamel.yaml and PyYAML) may parse timestamp scalars into
    datetime objects, while this module compares and stores timestamps as
    strings.
    """
    if isinstance(value, datetime):
        return value.isoformat()
    return value


def migrate_provenance_block(
    provenance: Dict[str, Any],
    annotation_date: Optional[str] = None,
    parent_source_archived_at: Optional[str] = None
) -> Tuple[Dict[str, Any], List[str]]:
    """
    Migrate a single provenance block to Rule 35 compliant format.

    Args:
        provenance: The provenance dict to migrate
        annotation_date: The annotation date from parent context (for statement_created_at)
        parent_source_archived_at: Inherited source_archived_at from parent

    Returns:
        Tuple of (migrated provenance dict, list of change tags). Returns the
        input unchanged with ['already_migrated'] when both dual timestamps
        exist and the agent is not the invalid placeholder.
    """
    changes = []

    # Check if already migrated: both Rule 35 timestamps present AND agent is
    # not the invalid 'claude-conversation' placeholder.
    if 'statement_created_at' in provenance and 'source_archived_at' in provenance:
        if provenance.get('agent') != 'claude-conversation':
            return provenance, ['already_migrated']

    # Normalize all timestamp inputs to ISO strings up front. YAML loaders
    # may have parsed them into datetime objects, on which the str-only
    # handling below (e.g. .replace('Z', ...)) would raise TypeError.
    existing_timestamp = _to_iso_string(provenance.get('timestamp'))
    annotation_date = _to_iso_string(annotation_date)
    parent_source_archived_at = _to_iso_string(parent_source_archived_at)

    # Determine source_archived_at: inherit from parent, fall back to the
    # block's own legacy timestamp, else default to the migration run time.
    if parent_source_archived_at:
        source_archived_at = parent_source_archived_at
    elif existing_timestamp:
        source_archived_at = existing_timestamp
    else:
        source_archived_at = MIGRATION_TIMESTAMP
        changes.append('source_archived_at_defaulted')

    # Determine statement_created_at
    if annotation_date:
        statement_created_at = annotation_date
    elif existing_timestamp:
        # If no annotation_date, use existing timestamp for both (simultaneous)
        statement_created_at = existing_timestamp
    else:
        statement_created_at = MIGRATION_TIMESTAMP
        changes.append('statement_created_at_defaulted')

    # Ensure temporal ordering: source_archived_at <= statement_created_at
    try:
        archived_dt = datetime.fromisoformat(source_archived_at.replace('Z', '+00:00'))
        created_dt = datetime.fromisoformat(statement_created_at.replace('Z', '+00:00'))
        if archived_dt > created_dt:
            # Swap if out of order
            source_archived_at, statement_created_at = statement_created_at, source_archived_at
            changes.append('timestamps_reordered')
    except (ValueError, AttributeError, TypeError):
        # Unparseable timestamps are kept as-is; flag for manual review.
        changes.append('timestamp_parse_warning')

    # Determine new agent: replace the invalid placeholder (or a missing
    # agent) with a path-derived identifier; keep any other existing agent.
    old_agent = provenance.get('agent', '')
    if old_agent == 'claude-conversation' or not old_agent:
        path = provenance.get('path', '')
        new_agent = get_agent_for_source(path)
        changes.append(f'agent_changed:{old_agent}->{new_agent}')
    else:
        new_agent = old_agent

    # Detect source type
    path = provenance.get('path', '')
    source_type = detect_source_category(path)

    # Build migrated provenance
    migrated = {
        # Preserve existing fields
        'namespace': provenance.get('namespace', 'glam'),
        'path': path,
        'context_convention': provenance.get('context_convention', 'ch_annotator-v1_7_0'),

        # NEW: Dual timestamps (Rule 35)
        'source_archived_at': source_archived_at,
        'statement_created_at': statement_created_at,

        # NEW: Valid agent identifier
        'agent': new_agent,

        # NEW: Source classification
        'source_type': source_type,

        # Migration tracking
        'migration_note': f'Migrated from agent:claude-conversation on {MIGRATION_TIMESTAMP[:10]}',
    }

    # The rebuilt dict intentionally omits the legacy single 'timestamp'
    # field (replaced by the dual timestamps); just record that it existed.
    if 'timestamp' in provenance:
        changes.append('timestamp_field_removed')

    return migrated, changes
|
|
|
|
|
|
def migrate_entity_claims(
    claims: List[Dict[str, Any]],
    parent_source_archived_at: str,
    parent_statement_created_at: str
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Bring every entity_claims provenance block up to Rule 35 compliance.

    Claims without a provenance block are left untouched, as are claims
    whose provenance already has both a valid agent and a
    statement_created_at. The claims list is updated in place.

    Returns:
        Tuple of (the same claims list, number of claims whose provenance
        was rewritten).
    """
    migrated_count = 0

    for claim in claims:
        if 'provenance' not in claim:
            continue

        prov = claim['provenance']

        # Migrate when the agent is the invalid placeholder or the dual
        # timestamps have not been added yet.
        needs_migration = (
            prov.get('agent') == 'claude-conversation'
            or 'statement_created_at' not in prov
        )
        if not needs_migration:
            continue

        claim['provenance'], _ = migrate_provenance_block(
            prov,
            annotation_date=parent_statement_created_at,
            parent_source_archived_at=parent_source_archived_at,
        )
        migrated_count += 1

    return claims, migrated_count
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = False) -> Dict[str, Any]:
    """
    Process a single custodian YAML file.

    Loads the file (ruamel.yaml preferred, PyYAML fallback), migrates the
    ch_annotator extraction_provenance and every entity_claims provenance to
    Rule 35 dual timestamps, then writes the file back unless dry_run.
    Only Phase 1 categories ('isil_csv' or 'unknown') are processed here;
    'conversation' and 'web' sources are skipped for Phases 2/3.

    Args:
        filepath: Path to the custodian YAML file.
        dry_run: When True, compute changes but do not write the file.

    Returns:
        Result dict with keys: file, status, changes, claims_migrated,
        category, error.
    """
    result = {
        'file': str(filepath),
        'status': 'unknown',
        'changes': [],
        'claims_migrated': 0,
        'category': 'unknown',
        'error': None,
    }

    try:
        # Read file with whichever YAML backend was selected at import time.
        if HAS_RUAMEL and yaml_handler is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml_handler.load(f)
        elif pyyaml_module is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = pyyaml_module.safe_load(f)
        else:
            # Should be unreachable (import section exits), but kept defensive.
            result['status'] = 'error'
            result['error'] = 'No YAML library available'
            return result

        if not data:
            result['status'] = 'empty_file'
            return result

        # Check if file has ch_annotator section
        ch_annotator = data.get('ch_annotator', {})
        if not ch_annotator:
            result['status'] = 'no_ch_annotator'
            return result

        # Check extraction_provenance
        extraction_prov = ch_annotator.get('extraction_provenance', {})
        if not extraction_prov:
            result['status'] = 'no_extraction_provenance'
            return result

        # Detect category
        path = extraction_prov.get('path', '')
        result['category'] = detect_source_category(path)

        # Skip non-ISIL/CSV sources (Phase 2 & 3)
        if result['category'] not in ['isil_csv', 'unknown']:
            result['status'] = f'skipped_phase2_{result["category"]}'
            return result

        # Check if already migrated: valid agent plus both dual timestamps
        if extraction_prov.get('agent') != 'claude-conversation':
            if 'statement_created_at' in extraction_prov and 'source_archived_at' in extraction_prov:
                result['status'] = 'already_migrated'
                return result

        # Get annotation date for statement_created_at
        annotation_prov = ch_annotator.get('annotation_provenance', {})
        annotation_date = annotation_prov.get('annotation_date')

        # Migrate extraction_provenance
        migrated_extraction_prov, changes = migrate_provenance_block(
            extraction_prov,
            annotation_date=annotation_date
        )
        result['changes'].extend(changes)

        # Update ch_annotator.extraction_provenance
        ch_annotator['extraction_provenance'] = migrated_extraction_prov

        # Get timestamps for entity claims inheritance (claims inherit the
        # file-level archived/created pair)
        source_archived_at = migrated_extraction_prov['source_archived_at']
        statement_created_at = migrated_extraction_prov['statement_created_at']

        # Migrate entity_claims
        entity_claims = ch_annotator.get('entity_claims', [])
        if entity_claims:
            migrated_claims, claims_count = migrate_entity_claims(
                entity_claims,
                source_archived_at,
                statement_created_at
            )
            ch_annotator['entity_claims'] = migrated_claims
            result['claims_migrated'] = claims_count

        # Update data
        data['ch_annotator'] = ch_annotator

        # Write file (unless dry run)
        if not dry_run:
            if HAS_RUAMEL and yaml_handler is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    yaml_handler.dump(data, f)
            elif pyyaml_module is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    pyyaml_module.safe_dump(data, f, default_flow_style=False, allow_unicode=True)

        result['status'] = 'migrated' if not dry_run else 'would_migrate'

    except Exception as e:
        # Broad catch is deliberate: one bad file must not abort the batch;
        # the error is surfaced in the per-file result instead.
        result['status'] = 'error'
        result['error'] = str(e)

    return result
|
|
|
|
|
|
def create_backup(backup_dir: Path, source_dir: Optional[Path] = None) -> bool:
    """
    Create a timestamped backup copy of the custodian directory.

    Args:
        backup_dir: Base path for the backup; the copy is written to a
            sibling named '<backup_dir.name>.<YYYYmmdd_HHMMSS>'.
        source_dir: Directory to back up. Defaults to CUSTODIAN_DIR.

    Returns:
        True if the backup was created; False if the source directory does
        not exist or the copy failed.
    """
    if source_dir is None:
        source_dir = CUSTODIAN_DIR

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = backup_dir.parent / f"{backup_dir.name}.{timestamp}"

    # Previously a missing source directory fell through to a silent False,
    # leaving the caller with only a generic "Backup failed." message.
    if not source_dir.exists():
        print(f"Error creating backup: source directory not found: {source_dir}")
        return False

    try:
        print(f"Creating backup at {backup_path}...")
        shutil.copytree(source_dir, backup_path)
        print(f"Backup created: {backup_path}")
        return True
    except Exception as e:
        print(f"Error creating backup: {e}")
        return False
|
|
|
|
|
|
def main():
    """Command-line entry point for the Phase 1 provenance migration.

    Parses arguments, optionally backs up the custodian directory, migrates
    each selected YAML file, prints a summary, and optionally writes a JSON
    report. Exits non-zero on backup failure or a missing --file target.
    """
    parser = argparse.ArgumentParser(
        description='Migrate agent:claude-conversation provenance to Rule 35 compliant format'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without modifying files')
    parser.add_argument('--file', type=Path,
                        help='Process a specific file only')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process (0 = no limit)')
    parser.add_argument('--backup', action='store_true',
                        help='Create backup before processing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed progress')
    parser.add_argument('--report', type=Path,
                        help='Write JSON report to file')

    args = parser.parse_args()

    print("=" * 70)
    print("Migration: agent:claude-conversation → Rule 35 Compliant Provenance")
    print("=" * 70)
    print(f"Phase 1: ISIL/CSV Registry Sources")
    print(f"Timestamp: {MIGRATION_TIMESTAMP}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Create backup if requested; skipped on dry runs since nothing changes.
    if args.backup and not args.dry_run:
        if not create_backup(BACKUP_DIR):
            print("Backup failed. Aborting.")
            sys.exit(1)

    # Collect files to process: either the single --file target or every
    # *.yaml in the custodian directory.
    if args.file:
        files = [args.file] if args.file.exists() else []
        if not files:
            print(f"Error: File not found: {args.file}")
            sys.exit(1)
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))

    if args.limit > 0:
        files = files[:args.limit]

    print(f"Files to process: {len(files)}")
    print()

    # Process files, accumulating per-status counters, per-category counts,
    # and (when needed) per-file details.
    results = {
        'migrated': 0,
        'would_migrate': 0,
        'already_migrated': 0,
        'skipped': 0,
        'errors': 0,
        'claims_migrated': 0,
        'by_category': {},
        'details': [],
    }

    for i, filepath in enumerate(files, 1):
        result = process_file(filepath, dry_run=args.dry_run)

        # Update counters
        status = result['status']
        if status == 'migrated':
            results['migrated'] += 1
        elif status == 'would_migrate':
            results['would_migrate'] += 1
        elif status == 'already_migrated':
            results['already_migrated'] += 1
        elif status == 'error':
            results['errors'] += 1
        else:
            # Any other status (no_ch_annotator, skipped_phase2_*, ...) is
            # counted as skipped.
            results['skipped'] += 1

        results['claims_migrated'] += result['claims_migrated']

        # Track by category
        cat = result['category']
        results['by_category'][cat] = results['by_category'].get(cat, 0) + 1

        # Store details only when they will actually be consumed (report or
        # verbose), to keep memory bounded over ~18k files.
        if args.report or args.verbose:
            results['details'].append(result)

        # Progress: per-file line when verbose or every 1000 files; a
        # carriage-return ticker every 100 files otherwise.
        if args.verbose or (i % 1000 == 0):
            print(f"[{i}/{len(files)}] {filepath.name}: {status}")
        elif i % 100 == 0:
            print(f"Processed {i}/{len(files)} files...", end='\r')

    print()
    print()
    print("=" * 70)
    print("RESULTS")
    print("=" * 70)

    if args.dry_run:
        print(f"Would migrate: {results['would_migrate']:,}")
    else:
        print(f"Migrated: {results['migrated']:,}")

    print(f"Already migrated: {results['already_migrated']:,}")
    print(f"Skipped (Phase 2): {results['skipped']:,}")
    print(f"Errors: {results['errors']:,}")
    print(f"Claims migrated: {results['claims_migrated']:,}")
    print()
    print("By category:")
    for cat, count in sorted(results['by_category'].items()):
        print(f" {cat}: {count:,}")

    # Write report
    if args.report:
        report_data = {
            'timestamp': MIGRATION_TIMESTAMP,
            'dry_run': args.dry_run,
            'summary': {
                'migrated': results['migrated'],
                'would_migrate': results['would_migrate'],
                'already_migrated': results['already_migrated'],
                'skipped': results['skipped'],
                'errors': results['errors'],
                'claims_migrated': results['claims_migrated'],
            },
            'by_category': results['by_category'],
            'details': results['details'],
        }
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
        print(f"\nReport written to: {args.report}")

    print()
    if args.dry_run:
        print("This was a DRY RUN. No files were modified.")
        print("Run without --dry-run to apply changes.")
|
|
|
|
# Script entry point: only run the migration when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|