#!/usr/bin/env python3
"""
Phase 1.5: Migrate ch_annotator sections with claude-conversation agent.

This script handles files created from CH-Annotator sources that have
`agent: claude-conversation` in:
1. ch_annotator.extraction_provenance.agent
2. ch_annotator.entity_claims[].provenance.agent

These files were correctly skipped by Phase 1 but need their agent identifiers
updated to comply with Rule 35 (dual timestamp requirement).

Strategy:
- Replace `agent: claude-conversation` with a valid agent identifier
- Since these files were created via `create_custodian_from_ch_annotator.py`,
  the correct agent is: `batch-script-create-custodian-from-ch-annotator`
- Add migration note to track the change
- Preserve the existing timestamp (it represents source_archived_at semantically)
"""

import argparse
import os
import shutil
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Tuple

import yaml


# Custom dumper so that multi-line strings are written as literal block scalars.
# NOTE: a PyYAML load/dump round trip does not keep comments from the input file.
class SafeDumper(yaml.SafeDumper):
    """Local SafeDumper subclass that carries this script's string representer."""


def str_representer(dumper, data):
    """Represent a string, using literal block style ('|') when it contains a newline."""
    style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=style)


SafeDumper.add_representer(str, str_representer)

# Constants
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
OLD_AGENT = "claude-conversation"
NEW_AGENT = "batch-script-create-custodian-from-ch-annotator"
MIGRATION_DATE = datetime.now(timezone.utc).strftime("%Y-%m-%d")


def migrate_provenance_block(prov: dict, context: str) -> bool:
    """
    Migrate a single provenance block in place.

    The block is only touched when its ``agent`` equals ``OLD_AGENT``. In that
    case the agent is replaced by ``NEW_AGENT``; a lone ``timestamp`` is moved
    to ``source_archived_at`` and a fresh ``statement_created_at`` (current UTC
    time) is added; and a ``migration_note`` is recorded unless one exists.

    Args:
        prov: The provenance mapping to update. Anything that is not a dict
            (e.g. ``None`` or a scalar from malformed YAML) is left alone.
        context: Location label embedded in the migration note.

    Returns:
        True if changes were made, False otherwise.
    """
    if not isinstance(prov, dict) or prov.get("agent") != OLD_AGENT:
        return False

    prov["agent"] = NEW_AGENT

    # If there's a single timestamp, convert to dual timestamps. An existing
    # source_archived_at is never overwritten.
    if "timestamp" in prov and "source_archived_at" not in prov:
        prov["source_archived_at"] = prov.pop("timestamp")
        prov["statement_created_at"] = datetime.now(timezone.utc).isoformat()

    # Add migration note if not already present
    if "migration_note" not in prov:
        prov["migration_note"] = f"Migrated from agent:{OLD_AGENT} on {MIGRATION_DATE} ({context})"

    return True


def _write_yaml_atomically(filepath: Path, data: dict) -> None:
    """
    Serialize ``data`` as YAML and replace ``filepath`` with the result.

    The YAML is written to a temporary file in the same directory and then
    moved over the target with ``os.replace``, so a failure or interruption
    while dumping cannot leave a truncated data file behind.
    """
    # Resolve symlinks so the link itself is kept and its target is replaced.
    target = os.path.realpath(filepath)
    # The ".tmp" suffix keeps the temporary file out of the "*.yaml" scan.
    fd, tmp_name = tempfile.mkstemp(
        dir=os.path.dirname(target),
        prefix=os.path.basename(target) + ".",
        suffix=".tmp",
    )
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=SafeDumper, default_flow_style=False,
                      allow_unicode=True, sort_keys=False, width=120)
        # mkstemp creates the file with owner-only permissions; restore the
        # permission bits of the file being replaced.
        shutil.copymode(target, tmp_name)
        os.replace(tmp_name, target)
    finally:
        # After a successful replace the temporary name no longer exists.
        if os.path.exists(tmp_name):
            os.unlink(tmp_name)


def migrate_file(filepath: Path, dry_run: bool = False) -> Tuple[bool, str]:
    """
    Migrate a single YAML file.

    Checks ``ch_annotator.extraction_provenance``, every
    ``ch_annotator.entity_claims[].provenance`` and the top-level
    ``extraction_provenance``. Sections with an unexpected shape (missing,
    null, or not a mapping/list) are skipped instead of failing the file.

    Args:
        filepath: YAML file to migrate.
        dry_run: When True, report what would change without writing.

    Returns:
        (changed, message) tuple. Failures never raise; they are reported as
        ``(False, "Error: ...")``.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Quick check - skip if no claude-conversation
        if OLD_AGENT not in content:
            return False, "No migration needed"

        data = yaml.safe_load(content)
        if not data:
            return False, "Empty file"
        if not isinstance(data, dict):
            return False, "Error: top-level YAML node is not a mapping"

        changes = []

        ch_annotator = data.get("ch_annotator")
        if isinstance(ch_annotator, dict):
            # Check ch_annotator.extraction_provenance
            if migrate_provenance_block(ch_annotator.get("extraction_provenance"),
                                        "ch_annotator.extraction_provenance"):
                changes.append("ch_annotator.extraction_provenance")

            # Check ch_annotator.entity_claims[].provenance
            entity_claims = ch_annotator.get("entity_claims")
            if isinstance(entity_claims, list):
                for i, claim in enumerate(entity_claims):
                    if not isinstance(claim, dict):
                        continue
                    if migrate_provenance_block(claim.get("provenance"),
                                                f"entity_claims[{i}].provenance"):
                        changes.append(f"entity_claims[{i}]")

        # Also check top-level extraction_provenance (in case Phase 1 missed any)
        if migrate_provenance_block(data.get("extraction_provenance"), "extraction_provenance"):
            changes.append("extraction_provenance")

        if not changes:
            return False, "No migration needed"

        if not dry_run:
            _write_yaml_atomically(filepath, data)

        return True, f"Migrated: {', '.join(changes)}"

    except Exception as e:
        # Per-file boundary: one bad file must not abort the whole batch, so
        # the failure is reported to the caller as an "Error: ..." message.
        return False, f"Error: {e}"


def main():
    """Parse command-line options, find candidate files, migrate them and print a summary."""
    parser = argparse.ArgumentParser(description="Phase 1.5: Migrate CH-Annotator provenance")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be changed without modifying files")
    parser.add_argument("--verbose", action="store_true", help="Show all files processed")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--custodian-dir", type=Path, default=CUSTODIAN_DIR,
                        help="Directory to scan for custodian YAML files (default: %(default)s)")
    args = parser.parse_args()

    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        parser.error(f"Directory not found: {custodian_dir}")

    print("Phase 1.5: Migrating CH-Annotator provenance")
    print(f"Directory: {custodian_dir}")
    print(f"Dry run: {args.dry_run}")
    print("-" * 60)

    # Find all files with agent: claude-conversation (not just migration notes)
    # Use rglob to include subdirectories (archive/, etc.)
    # Sorted so that --limit selects the same files on every run.
    files_to_check = []
    search_pattern = f"agent: {OLD_AGENT}"
    for candidate in sorted(custodian_dir.rglob("*.yaml")):
        try:
            text = candidate.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f"Warning: Could not read {candidate}: {e}")
            continue
        if search_pattern in text:
            files_to_check.append(candidate)

    print(f"Found {len(files_to_check)} files with '{OLD_AGENT}'")

    # "is not None" so that an explicit --limit 0 is honoured.
    if args.limit is not None:
        files_to_check = files_to_check[:args.limit]
        print(f"Processing first {args.limit} files")

    print("-" * 60)

    migrated = 0
    errors = 0
    skipped = 0

    for filepath in files_to_check:
        changed, message = migrate_file(filepath, dry_run=args.dry_run)

        # migrate_file reports failures with an "Error:" prefix.
        if message.startswith("Error:"):
            errors += 1
            print(f"ERROR: {filepath.name}: {message}")
        elif changed:
            migrated += 1
            if args.verbose or args.dry_run:
                print(f"{'[DRY RUN] ' if args.dry_run else ''}MIGRATED: {filepath.name}: {message}")
        else:
            skipped += 1
            if args.verbose:
                print(f"SKIPPED: {filepath.name}: {message}")

    print("-" * 60)
    print("Summary:")
    print(f" Migrated: {migrated}")
    print(f" Skipped: {skipped}")
    print(f" Errors: {errors}")
    print(f" Total: {len(files_to_check)}")

    if args.dry_run:
        print("\n[DRY RUN] No files were modified. Run without --dry-run to apply changes.")


if __name__ == "__main__":
    main()