#!/usr/bin/env python3
"""
Migrate `agent: claude-conversation` provenance to Rule 35 compliant dual timestamps.

This script handles Phase 1: ISIL/CSV registry sources (~18,000 files).
These files have provenance paths like `/files/{country}_complete.yaml` and
don't require LLM processing - just timestamp restructuring.

Phase 2 (conversation sources) and Phase 3 (web sources) require GLM4.7 +
web-reader and are handled separately.

Migration Rules (from .opencode/PROVENANCE_TIMESTAMP_RULES.md):
1. Every provenance MUST have `statement_created_at` and `source_archived_at`
2. `agent: claude-conversation` is INVALID - replace with proper agent identifier
3. `source_archived_at` must be <= `statement_created_at`

Usage:
    # Dry run (no changes)
    python scripts/migrate_claude_conversation_provenance.py --dry-run

    # Process specific file
    python scripts/migrate_claude_conversation_provenance.py --file data/custodian/JP-01-TOM-L-H.yaml

    # Process all files (with backup)
    python scripts/migrate_claude_conversation_provenance.py --backup

    # Limit processing
    python scripts/migrate_claude_conversation_provenance.py --limit 100 --dry-run

Author: OpenCode/Claude
Created: 2025-12-30
Related: .opencode/PROVENANCE_TIMESTAMP_RULES.md, .opencode/CLAUDE_CONVERSATION_MIGRATION_SPEC.md
"""

import argparse
import json
import os
import re
import shutil
import sys
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# YAML handling - use ruamel.yaml to preserve formatting.
# NOTE: the hard failure (no YAML library at all) is deferred to main() so that
# this module can be imported (e.g. by tests) without exiting the interpreter.
HAS_RUAMEL = False
yaml_handler = None
pyyaml_module = None
try:
    from ruamel.yaml import YAML  # type: ignore[import-untyped]
    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping
    HAS_RUAMEL = True
except ImportError:
    try:
        import yaml as pyyaml_module  # type: ignore[import-untyped]
        print("Warning: ruamel.yaml not installed. Using PyYAML (may lose formatting).")
    except ImportError:
        # Do NOT sys.exit() here: import must stay side-effect-safe.
        # main() re-checks and aborts before any processing.
        print("Warning: No YAML library available. Install ruamel.yaml or PyYAML.")

# Constants
CUSTODIAN_DIR = Path("data/custodian")
BACKUP_DIR = Path("data/custodian.backup")
# Single timestamp for the whole run, so every file migrated in one invocation
# carries the same migration marker.
MIGRATION_TIMESTAMP = datetime.now(timezone.utc).isoformat()

# Source type patterns for categorization (checked via re.search, first hit wins)
SOURCE_PATTERNS = {
    'isil_csv': [
        r'/files/.*_complete\.yaml',
        r'/files/.*_isil.*\.yaml',
        r'/files/.*_ch_annotator\.yaml',
        r'/instances/.*\.yaml',
    ],
    'conversation': [
        r'/conversations/[a-f0-9-]+',
    ],
    'web': [
        r'https?://',
    ],
}

# Agent mapping based on source file patterns (substring match on lowercased path)
AGENT_MAPPING = {
    'japan_complete': 'batch-script-create-custodian-from-ch-annotator',
    'austria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'switzerland_isil': 'batch-script-create-custodian-from-ch-annotator',
    'czech_unified': 'batch-script-create-custodian-from-ch-annotator',
    'bulgaria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'belgium_complete': 'batch-script-create-custodian-from-ch-annotator',
    'netherlands_complete': 'batch-script-create-custodian-from-ch-annotator',
    'norway': 'batch-script-create-custodian-from-ch-annotator',
    'default': 'batch-script-ch-annotator-extraction',
}


def _iso(value: Any) -> str:
    """Normalize a timestamp value to an ISO-8601 string.

    YAML parsers hand back `datetime`/`date` objects for unquoted timestamps;
    Rule 35 requires string timestamps, so coerce before comparing/storing.
    """
    if isinstance(value, (datetime, date)):
        return value.isoformat()
    return str(value)


def detect_source_category(path: str) -> str:
    """
    Detect the source category from the provenance path.

    Args:
        path: The `path` field of a provenance block (may be empty).

    Returns:
        'isil_csv', 'conversation', 'web', or 'unknown'
    """
    if not path:
        return 'unknown'
    for category, patterns in SOURCE_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, path):
                return category
    return 'unknown'


def get_agent_for_source(path: str) -> str:
    """
    Determine the appropriate agent identifier based on source path.

    Falls back to AGENT_MAPPING['default'] when no country-specific
    substring matches (case-insensitive).
    """
    if not path:
        return AGENT_MAPPING['default']
    path_lower = path.lower()
    for key, agent in AGENT_MAPPING.items():
        if key in path_lower:
            return agent
    return AGENT_MAPPING['default']


def migrate_provenance_block(
    provenance: Dict[str, Any],
    annotation_date: Optional[str] = None,
    parent_source_archived_at: Optional[str] = None
) -> Tuple[Dict[str, Any], List[str]]:
    """
    Migrate a single provenance block to Rule 35 compliant format.

    Args:
        provenance: The provenance dict to migrate
        annotation_date: The annotation date from parent context
            (used for statement_created_at; str or YAML date object)
        parent_source_archived_at: Inherited source_archived_at from parent

    Returns:
        Tuple of (migrated provenance dict, list of changes made).
        If the block is already migrated and has a valid agent, the
        original dict is returned untouched with changes == ['already_migrated'].
    """
    changes: List[str] = []

    # Check if already migrated (both dual timestamps present, valid agent)
    if 'statement_created_at' in provenance and 'source_archived_at' in provenance:
        if provenance.get('agent') != 'claude-conversation':
            return provenance, ['already_migrated']

    # Extract existing single timestamp (pre-Rule-35 field), normalized to str.
    existing_timestamp = provenance.get('timestamp')
    if existing_timestamp is not None:
        existing_timestamp = _iso(existing_timestamp)
    if annotation_date is not None:
        annotation_date = _iso(annotation_date)
    if parent_source_archived_at is not None:
        parent_source_archived_at = _iso(parent_source_archived_at)

    # Determine source_archived_at: inherited > existing timestamp > migration run
    if parent_source_archived_at:
        source_archived_at = parent_source_archived_at
    elif existing_timestamp:
        source_archived_at = existing_timestamp
    else:
        source_archived_at = MIGRATION_TIMESTAMP
        changes.append('source_archived_at_defaulted')

    # Determine statement_created_at: annotation date > existing timestamp > run
    if annotation_date:
        statement_created_at = annotation_date
    elif existing_timestamp:
        # If no annotation_date, use existing timestamp for both (simultaneous)
        statement_created_at = existing_timestamp
    else:
        statement_created_at = MIGRATION_TIMESTAMP
        changes.append('statement_created_at_defaulted')

    # Ensure temporal ordering: source_archived_at <= statement_created_at.
    # TypeError covers comparing naive vs. timezone-aware datetimes.
    try:
        archived_dt = datetime.fromisoformat(source_archived_at.replace('Z', '+00:00'))
        created_dt = datetime.fromisoformat(statement_created_at.replace('Z', '+00:00'))
        if archived_dt > created_dt:
            # Swap if out of order
            source_archived_at, statement_created_at = statement_created_at, source_archived_at
            changes.append('timestamps_reordered')
    except (ValueError, AttributeError, TypeError):
        changes.append('timestamp_parse_warning')

    # Determine new agent: replace invalid/missing agent with a path-derived one.
    old_agent = provenance.get('agent', '')
    if old_agent == 'claude-conversation' or not old_agent:
        path = provenance.get('path', '')
        new_agent = get_agent_for_source(path)
        changes.append(f'agent_changed:{old_agent}->{new_agent}')
    else:
        new_agent = old_agent

    # Detect source type
    path = provenance.get('path', '')
    source_type = detect_source_category(path)

    # Build migrated provenance
    migrated = {
        # Preserve existing fields
        'namespace': provenance.get('namespace', 'glam'),
        'path': path,
        'context_convention': provenance.get('context_convention', 'ch_annotator-v1_7_0'),
        # NEW: Dual timestamps (Rule 35)
        'source_archived_at': source_archived_at,
        'statement_created_at': statement_created_at,
        # NEW: Valid agent identifier
        'agent': new_agent,
        # NEW: Source classification
        'source_type': source_type,
        # Migration tracking
        'migration_note': f'Migrated from agent:claude-conversation on {MIGRATION_TIMESTAMP[:10]}',
    }

    # Old single `timestamp` field is implicitly dropped (replaced by the dual
    # timestamps above); record that it existed.
    if 'timestamp' in provenance:
        changes.append('timestamp_field_removed')

    return migrated, changes


def migrate_entity_claims(
    claims: List[Dict[str, Any]],
    parent_source_archived_at: str,
    parent_statement_created_at: str
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Migrate all entity_claims provenance blocks in place.

    Claims without a `provenance` key are skipped; claims that are already
    migrated (valid agent + statement_created_at present) are left untouched.

    Returns:
        Tuple of (migrated claims list, count of claims migrated)
    """
    migrated_count = 0
    for claim in claims:
        if 'provenance' not in claim:
            continue
        prov = claim['provenance']
        # Check if needs migration
        if prov.get('agent') == 'claude-conversation' or 'statement_created_at' not in prov:
            # Migrate the provenance block, inheriting the parent's timestamps
            migrated_prov, _ = migrate_provenance_block(
                prov,
                annotation_date=parent_statement_created_at,
                parent_source_archived_at=parent_source_archived_at
            )
            claim['provenance'] = migrated_prov
            migrated_count += 1
    return claims, migrated_count


def process_file(filepath: Path, dry_run: bool = False) -> Dict[str, Any]:
    """
    Process a single custodian YAML file.

    Reads the file, migrates `ch_annotator.extraction_provenance` and all
    `ch_annotator.entity_claims[*].provenance` blocks, and writes the file
    back (unless dry_run). Phase 2/3 categories are skipped.

    Returns:
        Dict with processing results:
        file, status, changes, claims_migrated, category, error
    """
    result: Dict[str, Any] = {
        'file': str(filepath),
        'status': 'unknown',
        'changes': [],
        'claims_migrated': 0,
        'category': 'unknown',
        'error': None,
    }

    try:
        # Read file (ruamel preserves formatting; PyYAML is the fallback)
        if HAS_RUAMEL and yaml_handler is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml_handler.load(f)
        elif pyyaml_module is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = pyyaml_module.safe_load(f)
        else:
            result['status'] = 'error'
            result['error'] = 'No YAML library available'
            return result

        if not data:
            result['status'] = 'empty_file'
            return result

        # Check if file has ch_annotator section
        ch_annotator = data.get('ch_annotator', {})
        if not ch_annotator:
            result['status'] = 'no_ch_annotator'
            return result

        # Check extraction_provenance
        extraction_prov = ch_annotator.get('extraction_provenance', {})
        if not extraction_prov:
            result['status'] = 'no_extraction_provenance'
            return result

        # Detect category
        path = extraction_prov.get('path', '')
        result['category'] = detect_source_category(path)

        # Skip non-ISIL/CSV sources (Phase 2 & 3)
        if result['category'] not in ['isil_csv', 'unknown']:
            result['status'] = f'skipped_phase2_{result["category"]}'
            return result

        # Check if already migrated (valid agent AND both dual timestamps)
        if extraction_prov.get('agent') != 'claude-conversation':
            if 'statement_created_at' in extraction_prov and \
                    'source_archived_at' in extraction_prov:
                result['status'] = 'already_migrated'
                return result

        # Get annotation date for statement_created_at
        annotation_prov = ch_annotator.get('annotation_provenance', {})
        annotation_date = annotation_prov.get('annotation_date')

        # Migrate extraction_provenance
        migrated_extraction_prov, changes = migrate_provenance_block(
            extraction_prov,
            annotation_date=annotation_date
        )
        result['changes'].extend(changes)

        # Update ch_annotator.extraction_provenance
        ch_annotator['extraction_provenance'] = migrated_extraction_prov

        # Get timestamps for entity claims inheritance
        source_archived_at = migrated_extraction_prov['source_archived_at']
        statement_created_at = migrated_extraction_prov['statement_created_at']

        # Migrate entity_claims
        entity_claims = ch_annotator.get('entity_claims', [])
        if entity_claims:
            migrated_claims, claims_count = migrate_entity_claims(
                entity_claims, source_archived_at, statement_created_at
            )
            ch_annotator['entity_claims'] = migrated_claims
            result['claims_migrated'] = claims_count

        # Update data
        data['ch_annotator'] = ch_annotator

        # Write file (unless dry run)
        if not dry_run:
            if HAS_RUAMEL and yaml_handler is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    yaml_handler.dump(data, f)
            elif pyyaml_module is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    pyyaml_module.safe_dump(data, f, default_flow_style=False,
                                            allow_unicode=True)

        result['status'] = 'migrated' if not dry_run else 'would_migrate'

    except Exception as e:
        # Broad catch is deliberate: one corrupt file must not abort the batch;
        # the error is surfaced in the per-file result and counted in main().
        result['status'] = 'error'
        result['error'] = str(e)

    return result


def create_backup(backup_dir: Path) -> bool:
    """
    Create a timestamped backup copy of the custodian directory.

    Args:
        backup_dir: Base path; the actual backup goes to
            `<backup_dir>.<YYYYmmdd_HHMMSS>`.

    Returns:
        True on success, False if CUSTODIAN_DIR is missing or the copy failed.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = backup_dir.parent / f"{backup_dir.name}.{timestamp}"
    try:
        if CUSTODIAN_DIR.exists():
            print(f"Creating backup at {backup_path}...")
            shutil.copytree(CUSTODIAN_DIR, backup_path)
            print(f"Backup created: {backup_path}")
            return True
    except Exception as e:
        print(f"Error creating backup: {e}")
        return False
    return False


def main():
    """CLI entry point: parse args, optionally back up, migrate, report."""
    parser = argparse.ArgumentParser(
        description='Migrate agent:claude-conversation provenance to Rule 35 compliant format'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without modifying files')
    parser.add_argument('--file', type=Path,
                        help='Process a specific file only')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process (0 = no limit)')
    parser.add_argument('--backup', action='store_true',
                        help='Create backup before processing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed progress')
    parser.add_argument('--report', type=Path,
                        help='Write JSON report to file')
    args = parser.parse_args()

    # Fail fast if no YAML library was importable (deferred from import time).
    if not HAS_RUAMEL and pyyaml_module is None:
        print("Error: No YAML library available. Install ruamel.yaml or PyYAML.")
        sys.exit(1)

    print("=" * 70)
    print("Migration: agent:claude-conversation → Rule 35 Compliant Provenance")
    print("=" * 70)
    print("Phase 1: ISIL/CSV Registry Sources")
    print(f"Timestamp: {MIGRATION_TIMESTAMP}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Create backup if requested (pointless on a dry run)
    if args.backup and not args.dry_run:
        if not create_backup(BACKUP_DIR):
            print("Backup failed. Aborting.")
            sys.exit(1)

    # Collect files to process
    if args.file:
        files = [args.file] if args.file.exists() else []
        if not files:
            print(f"Error: File not found: {args.file}")
            sys.exit(1)
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if args.limit > 0:
            files = files[:args.limit]

    print(f"Files to process: {len(files)}")
    print()

    # Process files
    results: Dict[str, Any] = {
        'migrated': 0,
        'would_migrate': 0,
        'already_migrated': 0,
        'skipped': 0,
        'errors': 0,
        'claims_migrated': 0,
        'by_category': {},
        'details': [],
    }

    for i, filepath in enumerate(files, 1):
        result = process_file(filepath, dry_run=args.dry_run)

        # Update counters
        status = result['status']
        if status == 'migrated':
            results['migrated'] += 1
        elif status == 'would_migrate':
            results['would_migrate'] += 1
        elif status == 'already_migrated':
            results['already_migrated'] += 1
        elif status == 'error':
            results['errors'] += 1
        else:
            results['skipped'] += 1

        results['claims_migrated'] += result['claims_migrated']

        # Track by category
        cat = result['category']
        results['by_category'][cat] = results['by_category'].get(cat, 0) + 1

        # Store details (only when they will actually be shown/written)
        if args.report or args.verbose:
            results['details'].append(result)

        # Progress
        if args.verbose or (i % 1000 == 0):
            print(f"[{i}/{len(files)}] {filepath.name}: {status}")
        elif i % 100 == 0:
            print(f"Processed {i}/{len(files)} files...", end='\r')

    print()
    print()
    print("=" * 70)
    print("RESULTS")
    print("=" * 70)
    if args.dry_run:
        print(f"Would migrate: {results['would_migrate']:,}")
    else:
        print(f"Migrated: {results['migrated']:,}")
    print(f"Already migrated: {results['already_migrated']:,}")
    print(f"Skipped (Phase 2): {results['skipped']:,}")
    print(f"Errors: {results['errors']:,}")
    print(f"Claims migrated: {results['claims_migrated']:,}")
    print()
    print("By category:")
    for cat, count in sorted(results['by_category'].items()):
        print(f"  {cat}: {count:,}")

    # Write report
    if args.report:
        report_data = {
            'timestamp': MIGRATION_TIMESTAMP,
            'dry_run': args.dry_run,
            'summary': {
                'migrated': results['migrated'],
                'would_migrate': results['would_migrate'],
                'already_migrated': results['already_migrated'],
                'skipped': results['skipped'],
                'errors': results['errors'],
                'claims_migrated': results['claims_migrated'],
            },
            'by_category': results['by_category'],
            'details': results['details'],
        }
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
        print(f"\nReport written to: {args.report}")

    print()
    if args.dry_run:
        print("This was a DRY RUN. No files were modified.")
        print("Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()