#!/usr/bin/env python3
"""
Migrate `agent: claude-conversation` provenance to Rule 35 compliant dual timestamps.

This script handles Phase 1: ISIL/CSV registry sources (~18,000 files).
These files have provenance paths like `/files/{country}_complete.yaml` and
don't require LLM processing - just timestamp restructuring.

Phase 2 (conversation sources) and Phase 3 (web sources) require GLM4.7 +
web-reader and are handled separately.

Migration Rules (from .opencode/PROVENANCE_TIMESTAMP_RULES.md):
1. Every provenance MUST have `statement_created_at` and `source_archived_at`
2. `agent: claude-conversation` is INVALID - replace with proper agent identifier
3. `source_archived_at` must be <= `statement_created_at`

Usage:
    # Dry run (no changes)
    python scripts/migrate_claude_conversation_provenance.py --dry-run

    # Process specific file
    python scripts/migrate_claude_conversation_provenance.py --file data/custodian/JP-01-TOM-L-H.yaml

    # Process all files (with backup)
    python scripts/migrate_claude_conversation_provenance.py --backup

    # Limit processing
    python scripts/migrate_claude_conversation_provenance.py --limit 100 --dry-run

Author: OpenCode/Claude
Created: 2025-12-30
Related: .opencode/PROVENANCE_TIMESTAMP_RULES.md, .opencode/CLAUDE_CONVERSATION_MIGRATION_SPEC.md
"""

import argparse
import json
import os
import re
import shutil
import sys
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# YAML handling - use ruamel.yaml to preserve formatting.
# NOTE: the hard failure (no YAML library at all) is deferred to main() so that
# this module can be imported (e.g. by tests) without exiting the interpreter.
HAS_RUAMEL = False
yaml_handler = None
pyyaml_module = None
try:
    from ruamel.yaml import YAML  # type: ignore[import-untyped]
    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping
    HAS_RUAMEL = True
except ImportError:
    try:
        import yaml as pyyaml_module  # type: ignore[import-untyped]
        print("Warning: ruamel.yaml not installed. Using PyYAML (may lose formatting).")
    except ImportError:
        # Do NOT sys.exit() here: import must stay side-effect-safe.
        # main() re-checks and aborts before any processing.
        print("Warning: No YAML library available. Install ruamel.yaml or PyYAML.")

# Constants
CUSTODIAN_DIR = Path("data/custodian")
BACKUP_DIR = Path("data/custodian.backup")
# Single timestamp for the whole run, so every file migrated in one invocation
# carries the same migration marker.
MIGRATION_TIMESTAMP = datetime.now(timezone.utc).isoformat()

# Source type patterns for categorization (checked via re.search, first hit wins)
SOURCE_PATTERNS = {
    'isil_csv': [
        r'/files/.*_complete\.yaml',
        r'/files/.*_isil.*\.yaml',
        r'/files/.*_ch_annotator\.yaml',
        r'/instances/.*\.yaml',
    ],
    'conversation': [
        r'/conversations/[a-f0-9-]+',
    ],
    'web': [
        r'https?://',
    ],
}

# Agent mapping based on source file patterns (substring match on lowercased path)
AGENT_MAPPING = {
    'japan_complete': 'batch-script-create-custodian-from-ch-annotator',
    'austria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'switzerland_isil': 'batch-script-create-custodian-from-ch-annotator',
    'czech_unified': 'batch-script-create-custodian-from-ch-annotator',
    'bulgaria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'belgium_complete': 'batch-script-create-custodian-from-ch-annotator',
    'netherlands_complete': 'batch-script-create-custodian-from-ch-annotator',
    'norway': 'batch-script-create-custodian-from-ch-annotator',
    'default': 'batch-script-ch-annotator-extraction',
}


def _iso(value: Any) -> str:
    """Normalize a timestamp value to an ISO-8601 string.

    YAML parsers hand back `datetime`/`date` objects for unquoted timestamps;
    Rule 35 requires string timestamps, so coerce before comparing/storing.
    """
    if isinstance(value, (datetime, date)):
        return value.isoformat()
    return str(value)


def detect_source_category(path: str) -> str:
    """
    Detect the source category from the provenance path.

    Args:
        path: The `path` field of a provenance block (may be empty).

    Returns:
        'isil_csv', 'conversation', 'web', or 'unknown'
    """
    if not path:
        return 'unknown'
    for category, patterns in SOURCE_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, path):
                return category
    return 'unknown'


def get_agent_for_source(path: str) -> str:
    """
    Determine the appropriate agent identifier based on source path.

    Falls back to AGENT_MAPPING['default'] when no country-specific
    substring matches (case-insensitive).
    """
    if not path:
        return AGENT_MAPPING['default']
    path_lower = path.lower()
    for key, agent in AGENT_MAPPING.items():
        if key in path_lower:
            return agent
    return AGENT_MAPPING['default']


def migrate_provenance_block(
    provenance: Dict[str, Any],
    annotation_date: Optional[str] = None,
    parent_source_archived_at: Optional[str] = None
) -> Tuple[Dict[str, Any], List[str]]:
    """
    Migrate a single provenance block to Rule 35 compliant format.

    Args:
        provenance: The provenance dict to migrate
        annotation_date: The annotation date from parent context
            (used for statement_created_at; str or YAML date object)
        parent_source_archived_at: Inherited source_archived_at from parent

    Returns:
        Tuple of (migrated provenance dict, list of changes made).
        If the block is already migrated and has a valid agent, the
        original dict is returned untouched with changes == ['already_migrated'].
    """
    changes: List[str] = []

    # Check if already migrated (both dual timestamps present, valid agent)
    if 'statement_created_at' in provenance and 'source_archived_at' in provenance:
        if provenance.get('agent') != 'claude-conversation':
            return provenance, ['already_migrated']

    # Extract existing single timestamp (pre-Rule-35 field), normalized to str.
    existing_timestamp = provenance.get('timestamp')
    if existing_timestamp is not None:
        existing_timestamp = _iso(existing_timestamp)
    if annotation_date is not None:
        annotation_date = _iso(annotation_date)
    if parent_source_archived_at is not None:
        parent_source_archived_at = _iso(parent_source_archived_at)

    # Determine source_archived_at: inherited > existing timestamp > migration run
    if parent_source_archived_at:
        source_archived_at = parent_source_archived_at
    elif existing_timestamp:
        source_archived_at = existing_timestamp
    else:
        source_archived_at = MIGRATION_TIMESTAMP
        changes.append('source_archived_at_defaulted')

    # Determine statement_created_at: annotation date > existing timestamp > run
    if annotation_date:
        statement_created_at = annotation_date
    elif existing_timestamp:
        # If no annotation_date, use existing timestamp for both (simultaneous)
        statement_created_at = existing_timestamp
    else:
        statement_created_at = MIGRATION_TIMESTAMP
        changes.append('statement_created_at_defaulted')

    # Ensure temporal ordering: source_archived_at <= statement_created_at.
    # TypeError covers comparing naive vs. timezone-aware datetimes.
    try:
        archived_dt = datetime.fromisoformat(source_archived_at.replace('Z', '+00:00'))
        created_dt = datetime.fromisoformat(statement_created_at.replace('Z', '+00:00'))
        if archived_dt > created_dt:
            # Swap if out of order
            source_archived_at, statement_created_at = statement_created_at, source_archived_at
            changes.append('timestamps_reordered')
    except (ValueError, AttributeError, TypeError):
        changes.append('timestamp_parse_warning')

    # Determine new agent: replace invalid/missing agent with a path-derived one.
    old_agent = provenance.get('agent', '')
    if old_agent == 'claude-conversation' or not old_agent:
        path = provenance.get('path', '')
        new_agent = get_agent_for_source(path)
        changes.append(f'agent_changed:{old_agent}->{new_agent}')
    else:
        new_agent = old_agent

    # Detect source type
    path = provenance.get('path', '')
    source_type = detect_source_category(path)

    # Build migrated provenance
    migrated = {
        # Preserve existing fields
        'namespace': provenance.get('namespace', 'glam'),
        'path': path,
        'context_convention': provenance.get('context_convention', 'ch_annotator-v1_7_0'),
        # NEW: Dual timestamps (Rule 35)
        'source_archived_at': source_archived_at,
        'statement_created_at': statement_created_at,
        # NEW: Valid agent identifier
        'agent': new_agent,
        # NEW: Source classification
        'source_type': source_type,
        # Migration tracking
        'migration_note': f'Migrated from agent:claude-conversation on {MIGRATION_TIMESTAMP[:10]}',
    }

    # Old single `timestamp` field is implicitly dropped (replaced by the dual
    # timestamps above); record that it existed.
    if 'timestamp' in provenance:
        changes.append('timestamp_field_removed')

    return migrated, changes


def migrate_entity_claims(
    claims: List[Dict[str, Any]],
    parent_source_archived_at: str,
    parent_statement_created_at: str
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Migrate all entity_claims provenance blocks in place.

    Claims without a `provenance` key are skipped; claims that are already
    migrated (valid agent + statement_created_at present) are left untouched.

    Returns:
        Tuple of (migrated claims list, count of claims migrated)
    """
    migrated_count = 0
    for claim in claims:
        if 'provenance' not in claim:
            continue
        prov = claim['provenance']
        # Check if needs migration
        if prov.get('agent') == 'claude-conversation' or 'statement_created_at' not in prov:
            # Migrate the provenance block, inheriting the parent's timestamps
            migrated_prov, _ = migrate_provenance_block(
                prov,
                annotation_date=parent_statement_created_at,
                parent_source_archived_at=parent_source_archived_at
            )
            claim['provenance'] = migrated_prov
            migrated_count += 1
    return claims, migrated_count


def process_file(filepath: Path, dry_run: bool = False) -> Dict[str, Any]:
    """
    Process a single custodian YAML file.

    Reads the file, migrates `ch_annotator.extraction_provenance` and all
    `ch_annotator.entity_claims[*].provenance` blocks, and writes the file
    back (unless dry_run). Phase 2/3 categories are skipped.

    Returns:
        Dict with processing results:
        file, status, changes, claims_migrated, category, error
    """
    result: Dict[str, Any] = {
        'file': str(filepath),
        'status': 'unknown',
        'changes': [],
        'claims_migrated': 0,
        'category': 'unknown',
        'error': None,
    }

    try:
        # Read file (ruamel preserves formatting; PyYAML is the fallback)
        if HAS_RUAMEL and yaml_handler is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml_handler.load(f)
        elif pyyaml_module is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = pyyaml_module.safe_load(f)
        else:
            result['status'] = 'error'
            result['error'] = 'No YAML library available'
            return result

        if not data:
            result['status'] = 'empty_file'
            return result

        # Check if file has ch_annotator section
        ch_annotator = data.get('ch_annotator', {})
        if not ch_annotator:
            result['status'] = 'no_ch_annotator'
            return result

        # Check extraction_provenance
        extraction_prov = ch_annotator.get('extraction_provenance', {})
        if not extraction_prov:
            result['status'] = 'no_extraction_provenance'
            return result

        # Detect category
        path = extraction_prov.get('path', '')
        result['category'] = detect_source_category(path)

        # Skip non-ISIL/CSV sources (Phase 2 & 3)
        if result['category'] not in ['isil_csv', 'unknown']:
            result['status'] = f'skipped_phase2_{result["category"]}'
            return result

        # Check if already migrated (valid agent AND both dual timestamps)
        if extraction_prov.get('agent') != 'claude-conversation':
            if 'statement_created_at' in extraction_prov and \
                    'source_archived_at' in extraction_prov:
                result['status'] = 'already_migrated'
                return result

        # Get annotation date for statement_created_at
        annotation_prov = ch_annotator.get('annotation_provenance', {})
        annotation_date = annotation_prov.get('annotation_date')

        # Migrate extraction_provenance
        migrated_extraction_prov, changes = migrate_provenance_block(
            extraction_prov,
            annotation_date=annotation_date
        )
        result['changes'].extend(changes)

        # Update ch_annotator.extraction_provenance
        ch_annotator['extraction_provenance'] = migrated_extraction_prov

        # Get timestamps for entity claims inheritance
        source_archived_at = migrated_extraction_prov['source_archived_at']
        statement_created_at = migrated_extraction_prov['statement_created_at']

        # Migrate entity_claims
        entity_claims = ch_annotator.get('entity_claims', [])
        if entity_claims:
            migrated_claims, claims_count = migrate_entity_claims(
                entity_claims, source_archived_at, statement_created_at
            )
            ch_annotator['entity_claims'] = migrated_claims
            result['claims_migrated'] = claims_count

        # Update data
        data['ch_annotator'] = ch_annotator

        # Write file (unless dry run)
        if not dry_run:
            if HAS_RUAMEL and yaml_handler is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    yaml_handler.dump(data, f)
            elif pyyaml_module is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    pyyaml_module.safe_dump(data, f, default_flow_style=False,
                                            allow_unicode=True)

        result['status'] = 'migrated' if not dry_run else 'would_migrate'

    except Exception as e:
        # Broad catch is deliberate: one corrupt file must not abort the batch;
        # the error is surfaced in the per-file result and counted in main().
        result['status'] = 'error'
        result['error'] = str(e)

    return result


def create_backup(backup_dir: Path) -> bool:
    """
    Create a timestamped backup copy of the custodian directory.

    Args:
        backup_dir: Base path; the actual backup goes to
            `<backup_dir>.<YYYYmmdd_HHMMSS>`.

    Returns:
        True on success, False if CUSTODIAN_DIR is missing or the copy failed.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = backup_dir.parent / f"{backup_dir.name}.{timestamp}"
    try:
        if CUSTODIAN_DIR.exists():
            print(f"Creating backup at {backup_path}...")
            shutil.copytree(CUSTODIAN_DIR, backup_path)
            print(f"Backup created: {backup_path}")
            return True
    except Exception as e:
        print(f"Error creating backup: {e}")
        return False
    return False


def main():
    """CLI entry point: parse args, optionally back up, migrate, report."""
    parser = argparse.ArgumentParser(
        description='Migrate agent:claude-conversation provenance to Rule 35 compliant format'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without modifying files')
    parser.add_argument('--file', type=Path,
                        help='Process a specific file only')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process (0 = no limit)')
    parser.add_argument('--backup', action='store_true',
                        help='Create backup before processing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed progress')
    parser.add_argument('--report', type=Path,
                        help='Write JSON report to file')
    args = parser.parse_args()

    # Fail fast if no YAML library was importable (deferred from import time).
    if not HAS_RUAMEL and pyyaml_module is None:
        print("Error: No YAML library available. Install ruamel.yaml or PyYAML.")
        sys.exit(1)

    print("=" * 70)
    print("Migration: agent:claude-conversation → Rule 35 Compliant Provenance")
    print("=" * 70)
    print("Phase 1: ISIL/CSV Registry Sources")
    print(f"Timestamp: {MIGRATION_TIMESTAMP}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Create backup if requested (pointless on a dry run)
    if args.backup and not args.dry_run:
        if not create_backup(BACKUP_DIR):
            print("Backup failed. Aborting.")
            sys.exit(1)

    # Collect files to process
    if args.file:
        files = [args.file] if args.file.exists() else []
        if not files:
            print(f"Error: File not found: {args.file}")
            sys.exit(1)
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if args.limit > 0:
            files = files[:args.limit]

    print(f"Files to process: {len(files)}")
    print()

    # Process files
    results: Dict[str, Any] = {
        'migrated': 0,
        'would_migrate': 0,
        'already_migrated': 0,
        'skipped': 0,
        'errors': 0,
        'claims_migrated': 0,
        'by_category': {},
        'details': [],
    }

    for i, filepath in enumerate(files, 1):
        result = process_file(filepath, dry_run=args.dry_run)

        # Update counters
        status = result['status']
        if status == 'migrated':
            results['migrated'] += 1
        elif status == 'would_migrate':
            results['would_migrate'] += 1
        elif status == 'already_migrated':
            results['already_migrated'] += 1
        elif status == 'error':
            results['errors'] += 1
        else:
            results['skipped'] += 1

        results['claims_migrated'] += result['claims_migrated']

        # Track by category
        cat = result['category']
        results['by_category'][cat] = results['by_category'].get(cat, 0) + 1

        # Store details (only when they will actually be shown/written)
        if args.report or args.verbose:
            results['details'].append(result)

        # Progress
        if args.verbose or (i % 1000 == 0):
            print(f"[{i}/{len(files)}] {filepath.name}: {status}")
        elif i % 100 == 0:
            print(f"Processed {i}/{len(files)} files...", end='\r')

    print()
    print()
    print("=" * 70)
    print("RESULTS")
    print("=" * 70)
    if args.dry_run:
        print(f"Would migrate: {results['would_migrate']:,}")
    else:
        print(f"Migrated: {results['migrated']:,}")
    print(f"Already migrated: {results['already_migrated']:,}")
    print(f"Skipped (Phase 2): {results['skipped']:,}")
    print(f"Errors: {results['errors']:,}")
    print(f"Claims migrated: {results['claims_migrated']:,}")
    print()
    print("By category:")
    for cat, count in sorted(results['by_category'].items()):
        print(f"  {cat}: {count:,}")

    # Write report
    if args.report:
        report_data = {
            'timestamp': MIGRATION_TIMESTAMP,
            'dry_run': args.dry_run,
            'summary': {
                'migrated': results['migrated'],
                'would_migrate': results['would_migrate'],
                'already_migrated': results['already_migrated'],
                'skipped': results['skipped'],
                'errors': results['errors'],
                'claims_migrated': results['claims_migrated'],
            },
            'by_category': results['by_category'],
            'details': results['details'],
        }
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
        print(f"\nReport written to: {args.report}")

    print()
    if args.dry_run:
        print("This was a DRY RUN. No files were modified.")
        print("Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()