197 lines
6.8 KiB
Python
197 lines
6.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Phase 1.5: Migrate ch_annotator sections with claude-conversation agent.

This script handles files created from CH-Annotator sources that have
`agent: claude-conversation` in:
1. ch_annotator.extraction_provenance.agent
2. ch_annotator.entity_claims[].provenance.agent

These files were correctly skipped by Phase 1 but still need their agent
identifiers and timestamps updated to comply with Rule 35 (dual timestamp
requirement).

Strategy:
- Replace `agent: claude-conversation` with a valid agent identifier.
- Since these files were created via `create_custodian_from_ch_annotator.py`,
  the correct agent is: `batch-script-create-custodian-from-ch-annotator`.
- Add a migration note to track the change.
- Preserve the existing timestamp value by moving it to `source_archived_at`
  (which is what it represents semantically) and record the migration time
  as `statement_created_at`.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Tuple
|
|
|
|
import yaml
|
|
|
|
|
|
# Configure YAML to preserve formatting
|
|
class SafeDumper(yaml.SafeDumper):
    """Script-local YAML dumper onto which custom representers are registered."""
|
|
|
|
|
|
def str_representer(dumper, data):
    """
    Represent a Python ``str`` as a YAML string scalar.

    Strings that contain a newline are emitted with the literal block
    style (``|``); all other strings use the dumper's default style.
    """
    str_tag = 'tag:yaml.org,2002:str'

    # Single-line text: let the dumper pick the scalar style.
    if '\n' not in data:
        return dumper.represent_scalar(str_tag, data)

    # Multi-line text: request literal block style.
    return dumper.represent_scalar(str_tag, data, style='|')
|
|
|
|
|
|
# Register str_representer for ``str`` values on SafeDumper so that strings
# containing newlines are dumped in literal block style ('|').
SafeDumper.add_representer(str, str_representer)
|
|
|
|
|
|
# Constants
|
|
# Root directory that is searched recursively for custodian ``*.yaml`` files.
CUSTODIAN_DIR: Path = Path("/Users/kempersc/apps/glam/data/custodian")

# Agent identifier being replaced, and the identifier that replaces it.
OLD_AGENT: str = "claude-conversation"
NEW_AGENT: str = "batch-script-create-custodian-from-ch-annotator"

# UTC calendar date (YYYY-MM-DD), captured once at import time and embedded
# in every migration note written during this run.
MIGRATION_DATE: str = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d")
|
|
|
|
|
|
def migrate_provenance_block(prov: dict, context: str) -> bool:
    """
    Migrate a single provenance block in place.

    A block is migrated only when its ``agent`` equals ``OLD_AGENT``. In that
    case the agent is replaced with ``NEW_AGENT``, a lone ``timestamp`` is
    converted to the ``source_archived_at`` / ``statement_created_at`` pair,
    and a ``migration_note`` is added unless one already exists.

    Args:
        prov: The provenance mapping to update. Values that are not a dict
            (e.g. ``None``, a string or a list from malformed YAML) are
            left untouched.
        context: Human-readable location of the block, recorded in the
            migration note.

    Returns True if changes were made.
    """
    # Malformed input can put a scalar/list/None where a mapping is expected;
    # there is nothing to migrate in that case, and calling .get() would raise.
    if not isinstance(prov, dict):
        return False

    if prov.get("agent") != OLD_AGENT:
        return False

    prov["agent"] = NEW_AGENT

    # If there's a single timestamp, convert to dual timestamps: the original
    # value becomes source_archived_at, and the migration time (UTC, ISO 8601)
    # becomes statement_created_at.
    if "timestamp" in prov and "source_archived_at" not in prov:
        prov["source_archived_at"] = prov.pop("timestamp")
        prov["statement_created_at"] = datetime.now(timezone.utc).isoformat()

    # Add migration note if not already present
    if "migration_note" not in prov:
        prov["migration_note"] = f"Migrated from agent:{OLD_AGENT} on {MIGRATION_DATE} ({context})"

    return True
|
|
|
|
|
|
def migrate_file(filepath: Path, dry_run: bool = False) -> Tuple[bool, str]:
    """
    Migrate a single YAML file.

    The file is parsed, the provenance blocks at
    ``ch_annotator.extraction_provenance``,
    ``ch_annotator.entity_claims[].provenance`` and top-level
    ``extraction_provenance`` are passed to ``migrate_provenance_block``, and
    the file is rewritten if anything changed.

    Args:
        filepath: Path of the YAML file to migrate.
        dry_run: When True, report what would change without writing.

    Returns (changed, message) tuple. Failures are never raised; they are
    reported as ``(False, "Error: ...")``.

    NOTE: the file is round-tripped through yaml.safe_load / yaml.dump, so
    YAML comments and original formatting are not preserved on rewrite.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Quick check - skip if no claude-conversation
        if OLD_AGENT not in content:
            return False, "No migration needed"

        data = yaml.safe_load(content)
        if not data:
            return False, "Empty file"
        if not isinstance(data, dict):
            # Every path inspected below assumes a top-level mapping.
            return False, "Error: top-level YAML node is not a mapping"

        changes = []

        # Sections are type-checked rather than truth-tested so that a null or
        # malformed section is skipped instead of failing the whole file.
        ch_annotator = data.get("ch_annotator")
        if isinstance(ch_annotator, dict):
            # Check ch_annotator.extraction_provenance
            extraction_prov = ch_annotator.get("extraction_provenance")
            if isinstance(extraction_prov, dict) and migrate_provenance_block(
                extraction_prov, "ch_annotator.extraction_provenance"
            ):
                changes.append("ch_annotator.extraction_provenance")

            # Check ch_annotator.entity_claims[].provenance
            entity_claims = ch_annotator.get("entity_claims")
            if isinstance(entity_claims, list):
                for i, claim in enumerate(entity_claims):
                    claim_prov = claim.get("provenance") if isinstance(claim, dict) else None
                    if isinstance(claim_prov, dict) and migrate_provenance_block(
                        claim_prov, f"entity_claims[{i}].provenance"
                    ):
                        changes.append(f"entity_claims[{i}]")

        # Also check top-level extraction_provenance (in case Phase 1 missed any)
        top_extraction_prov = data.get("extraction_provenance")
        if isinstance(top_extraction_prov, dict) and migrate_provenance_block(
            top_extraction_prov, "extraction_provenance"
        ):
            changes.append("extraction_provenance")

        if not changes:
            return False, "No migration needed"

        if not dry_run:
            # Serialize fully BEFORE opening the file for writing: opening
            # with 'w' truncates, so a dump failure mid-write would otherwise
            # destroy the original content.
            serialized = yaml.dump(data, Dumper=SafeDumper, default_flow_style=False,
                                   allow_unicode=True, sort_keys=False, width=120)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(serialized)

        return True, f"Migrated: {', '.join(changes)}"

    except Exception as e:
        # Per-file boundary: any I/O or parse failure is reported to the
        # caller as a message so one bad file does not abort the batch.
        return False, f"Error: {e}"
|
|
|
|
|
|
def main():
    """
    Command-line entry point for the Phase 1.5 migration.

    Recursively scans the custodian directory for ``*.yaml`` files that
    contain ``agent: <OLD_AGENT>``, runs ``migrate_file`` on each of them and
    prints a per-file report (depending on flags) followed by a summary.

    Options:
        --dry-run        Show what would change without modifying files.
        --verbose        Report every processed file, including skipped ones.
        --limit N        Process at most N files (N >= 0).
        --custodian-dir  Directory to scan; defaults to CUSTODIAN_DIR.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Phase 1.5: Migrate CH-Annotator provenance")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without modifying files")
    parser.add_argument("--verbose", action="store_true", help="Show all files processed")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--custodian-dir", type=Path, default=CUSTODIAN_DIR,
                        help="Directory to scan for custodian YAML files (default: %(default)s)")
    args = parser.parse_args()

    # A negative limit would silently slice files off the END of the list.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be >= 0")

    custodian_dir = args.custodian_dir

    print("Phase 1.5: Migrating CH-Annotator provenance")
    print(f"Directory: {custodian_dir}")
    print(f"Dry run: {args.dry_run}")
    print("-" * 60)

    # Find all files with agent: claude-conversation (not just migration notes)
    # Use rglob to include subdirectories (archive/, etc.)
    # Sorted so that --limit selects the same files on every run (e.g. a dry
    # run followed by the real run); rglob order itself is not guaranteed.
    files_to_check = []
    search_pattern = f"agent: {OLD_AGENT}"
    for f in sorted(custodian_dir.rglob("*.yaml")):
        try:
            with open(f, 'r', encoding='utf-8') as fp:
                if search_pattern in fp.read():
                    files_to_check.append(f)
        except (OSError, UnicodeError) as e:
            print(f"Warning: Could not read {f}: {e}")

    print(f"Found {len(files_to_check)} files with '{OLD_AGENT}'")

    # Compare against None so that an explicit "--limit 0" is honoured.
    if args.limit is not None:
        files_to_check = files_to_check[:args.limit]
        print(f"Processing first {args.limit} files")

    print("-" * 60)

    migrated = 0
    errors = 0
    skipped = 0

    for filepath in files_to_check:
        changed, message = migrate_file(filepath, dry_run=args.dry_run)

        # migrate_file reports failures as messages beginning with "Error";
        # a prefix test avoids misclassifying messages that merely contain it.
        if message.startswith("Error"):
            errors += 1
            print(f"ERROR: {filepath.name}: {message}")
        elif changed:
            migrated += 1
            if args.verbose or args.dry_run:
                print(f"{'[DRY RUN] ' if args.dry_run else ''}MIGRATED: {filepath.name}: {message}")
        else:
            skipped += 1
            if args.verbose:
                print(f"SKIPPED: {filepath.name}: {message}")

    print("-" * 60)
    print("Summary:")
    print(f" Migrated: {migrated}")
    print(f" Skipped: {skipped}")
    print(f" Errors: {errors}")
    print(f" Total: {len(files_to_check)}")

    if args.dry_run:
        print("\n[DRY RUN] No files were modified. Run without --dry-run to apply changes.")
|
|
|
|
|
|
# Run the migration only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|