glam/scripts/migrate_claude_conversation_provenance.py
2026-01-02 02:10:18 +01:00

530 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Migrate `agent: claude-conversation` provenance to Rule 35 compliant dual timestamps.
This script handles Phase 1: ISIL/CSV registry sources (~18,000 files).
These files have provenance paths like `/files/{country}_complete.yaml` and don't
require LLM processing - just timestamp restructuring.
Phase 2 (conversation sources) and Phase 3 (web sources) require GLM4.7 + web-reader
and are handled separately.
Migration Rules (from .opencode/PROVENANCE_TIMESTAMP_RULES.md):
1. Every provenance MUST have `statement_created_at` and `source_archived_at`
2. `agent: claude-conversation` is INVALID - replace with proper agent identifier
3. `source_archived_at` must be <= `statement_created_at`
Usage:
# Dry run (no changes)
python scripts/migrate_claude_conversation_provenance.py --dry-run
# Process specific file
python scripts/migrate_claude_conversation_provenance.py --file data/custodian/JP-01-TOM-L-H.yaml
# Process all files (with backup)
python scripts/migrate_claude_conversation_provenance.py --backup
# Limit processing
python scripts/migrate_claude_conversation_provenance.py --limit 100 --dry-run
Author: OpenCode/Claude
Created: 2025-12-30
Related: .opencode/PROVENANCE_TIMESTAMP_RULES.md, .opencode/CLAUDE_CONVERSATION_MIGRATION_SPEC.md
"""
import argparse
import json
import os
import re
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# YAML handling - use ruamel.yaml to preserve formatting.
# ruamel round-trips quotes/comments on load+dump; PyYAML is a lossy fallback
# that will normalize formatting of every file it rewrites.
HAS_RUAMEL = False
yaml_handler = None
pyyaml_module = None
try:
    from ruamel.yaml import YAML  # type: ignore[import-untyped]
    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping
    HAS_RUAMEL = True
except ImportError:
    try:
        import yaml as pyyaml_module  # type: ignore[import-untyped]
        print("Warning: ruamel.yaml not installed. Using PyYAML (may lose formatting).")
    except ImportError:
        # Hard requirement: the script cannot read or write files without a YAML parser.
        print("Error: No YAML library available. Install ruamel.yaml or PyYAML.")
        sys.exit(1)
# Constants
CUSTODIAN_DIR = Path("data/custodian")       # directory of custodian YAML files to migrate
BACKUP_DIR = Path("data/custodian.backup")   # base path for timestamped backups
# One timestamp for the entire run, so every migrated file carries the same marker.
MIGRATION_TIMESTAMP = datetime.now(timezone.utc).isoformat()

# Source type patterns for categorization.
# Each regex is applied with re.search() against the provenance `path` field;
# the first category with a matching pattern wins.
SOURCE_PATTERNS = {
    'isil_csv': [
        r'/files/.*_complete\.yaml',
        r'/files/.*_isil.*\.yaml',
        r'/files/.*_ch_annotator\.yaml',
        r'/instances/.*\.yaml',
    ],
    'conversation': [
        r'/conversations/[a-f0-9-]+',
    ],
    'web': [
        r'https?://',
    ],
}

# Agent mapping based on source file patterns.
# Keys are substrings matched (case-insensitively) against the provenance path;
# 'default' is the fallback agent when no key matches.
AGENT_MAPPING = {
    'japan_complete': 'batch-script-create-custodian-from-ch-annotator',
    'austria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'switzerland_isil': 'batch-script-create-custodian-from-ch-annotator',
    'czech_unified': 'batch-script-create-custodian-from-ch-annotator',
    'bulgaria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'belgium_complete': 'batch-script-create-custodian-from-ch-annotator',
    'netherlands_complete': 'batch-script-create-custodian-from-ch-annotator',
    'norway': 'batch-script-create-custodian-from-ch-annotator',
    'default': 'batch-script-ch-annotator-extraction',
}
def detect_source_category(path: str) -> str:
    """
    Classify a provenance path into one of the known source buckets.

    The first category in SOURCE_PATTERNS whose regex list matches wins.

    Returns: 'isil_csv', 'conversation', 'web', or 'unknown'
    """
    if not path:
        return 'unknown'
    for category, pattern_list in SOURCE_PATTERNS.items():
        if any(re.search(pattern, path) for pattern in pattern_list):
            return category
    return 'unknown'
def get_agent_for_source(path: str) -> str:
    """
    Determine the appropriate agent identifier based on source path.

    Performs a case-insensitive substring match against the keys of
    AGENT_MAPPING; falls back to the 'default' entry when nothing matches.
    """
    if not path:
        return AGENT_MAPPING['default']
    lowered = path.lower()
    return next(
        (agent for key, agent in AGENT_MAPPING.items() if key in lowered),
        AGENT_MAPPING['default'],
    )
def migrate_provenance_block(
    provenance: Dict[str, Any],
    annotation_date: Optional[str] = None,
    parent_source_archived_at: Optional[str] = None
) -> Tuple[Dict[str, Any], List[str]]:
    """
    Migrate a single provenance block to Rule 35 compliant format.

    Args:
        provenance: The provenance dict to migrate (not mutated; a migrated
            copy is returned)
        annotation_date: The annotation date from parent context (for statement_created_at)
        parent_source_archived_at: Inherited source_archived_at from parent

    Returns:
        Tuple of (migrated provenance dict, list of changes made)
    """
    changes: List[str] = []
    # Check if already migrated: both dual timestamps present AND a valid agent
    if 'statement_created_at' in provenance and 'source_archived_at' in provenance:
        if provenance.get('agent') != 'claude-conversation':
            return provenance, ['already_migrated']
    # Extract legacy single timestamp (pre-Rule-35 format)
    existing_timestamp = provenance.get('timestamp')
    # Determine source_archived_at.
    # Fix: prefer an already-present dual timestamp (previously it was
    # discarded and recomputed when only the agent needed replacing),
    # then the inherited parent value, then the legacy timestamp.
    source_archived_at = (
        provenance.get('source_archived_at')
        or parent_source_archived_at
        or existing_timestamp
    )
    if not source_archived_at:
        source_archived_at = MIGRATION_TIMESTAMP
        changes.append('source_archived_at_defaulted')
    # Determine statement_created_at with the same "preserve existing" rule.
    # If no annotation_date, the legacy timestamp is used for both
    # (extraction and statement treated as simultaneous).
    statement_created_at = (
        provenance.get('statement_created_at')
        or annotation_date
        or existing_timestamp
    )
    if not statement_created_at:
        statement_created_at = MIGRATION_TIMESTAMP
        changes.append('statement_created_at_defaulted')
    # Ensure temporal ordering: source_archived_at <= statement_created_at (Rule 35.3)
    try:
        archived_dt = datetime.fromisoformat(source_archived_at.replace('Z', '+00:00'))
        created_dt = datetime.fromisoformat(statement_created_at.replace('Z', '+00:00'))
        if archived_dt > created_dt:
            # Swap rather than clamp, so neither timestamp is fabricated
            source_archived_at, statement_created_at = statement_created_at, source_archived_at
            changes.append('timestamps_reordered')
    except (ValueError, AttributeError):
        # Non-ISO or non-string timestamp values: keep them, flag for review
        changes.append('timestamp_parse_warning')
    # Determine new agent: replace the invalid claude-conversation marker
    # (or a missing agent) with an identifier derived from the source path.
    old_agent = provenance.get('agent', '')
    path = provenance.get('path', '')
    if old_agent == 'claude-conversation' or not old_agent:
        new_agent = get_agent_for_source(path)
        changes.append(f'agent_changed:{old_agent}->{new_agent}')
    else:
        new_agent = old_agent
    # Build migrated provenance.
    # Fix: start from a copy of the original so fields this script does not
    # know about (claim ids, model info, ...) survive the migration instead
    # of being silently dropped.
    migrated = dict(provenance)
    # Remove old single timestamp field (replaced by dual timestamps)
    if 'timestamp' in migrated:
        del migrated['timestamp']
        changes.append('timestamp_field_removed')
    migrated.update({
        # Preserve existing fields (with defaults for the required ones)
        'namespace': provenance.get('namespace', 'glam'),
        'path': path,
        'context_convention': provenance.get('context_convention', 'ch_annotator-v1_7_0'),
        # NEW: Dual timestamps (Rule 35)
        'source_archived_at': source_archived_at,
        'statement_created_at': statement_created_at,
        # NEW: Valid agent identifier
        'agent': new_agent,
        # NEW: Source classification
        'source_type': detect_source_category(path),
        # Migration tracking (date-only prefix of the run timestamp)
        'migration_note': f'Migrated from agent:claude-conversation on {MIGRATION_TIMESTAMP[:10]}',
    })
    return migrated, changes
def migrate_entity_claims(
    claims: List[Dict[str, Any]],
    parent_source_archived_at: str,
    parent_statement_created_at: str
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Migrate all entity_claims provenance blocks (in place).

    Claims without a 'provenance' key, or whose provenance already carries a
    valid agent and dual timestamps, are left untouched.

    Returns:
        Tuple of (migrated claims list, count of claims migrated)
    """
    migrated_total = 0
    for entry in claims:
        if 'provenance' not in entry:
            continue
        prov = entry['provenance']
        needs_migration = (
            prov.get('agent') == 'claude-conversation'
            or 'statement_created_at' not in prov
        )
        if not needs_migration:
            continue
        # Inherit the parent's timestamps: the claim was stated when the
        # parent annotation was created, from the same archived source.
        new_prov, _changes = migrate_provenance_block(
            prov,
            annotation_date=parent_statement_created_at,
            parent_source_archived_at=parent_source_archived_at,
        )
        entry['provenance'] = new_prov
        migrated_total += 1
    return claims, migrated_total
def process_file(filepath: Path, dry_run: bool = False) -> Dict[str, Any]:
    """
    Process a single custodian YAML file.

    Loads the file, migrates ch_annotator.extraction_provenance and every
    entity_claims provenance block, and writes the file back (unless dry_run).
    Files from non-ISIL/CSV sources are skipped for later phases.

    Args:
        filepath: Path to the custodian YAML file.
        dry_run: When True, compute changes but never write to disk.

    Returns:
        Dict with keys: 'file', 'status', 'changes' (list of change tags),
        'claims_migrated' (int), 'category', 'error' (str or None).
    """
    result = {
        'file': str(filepath),
        'status': 'unknown',
        'changes': [],
        'claims_migrated': 0,
        'category': 'unknown',
        'error': None,
    }
    try:
        # Read file with whichever YAML backend was selected at import time
        if HAS_RUAMEL and yaml_handler is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml_handler.load(f)
        elif pyyaml_module is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = pyyaml_module.safe_load(f)
        else:
            # Should be unreachable (module import exits without a YAML lib),
            # but keep the guard so a partial import cannot corrupt files.
            result['status'] = 'error'
            result['error'] = 'No YAML library available'
            return result
        if not data:
            result['status'] = 'empty_file'
            return result
        # Check if file has ch_annotator section
        ch_annotator = data.get('ch_annotator', {})
        if not ch_annotator:
            result['status'] = 'no_ch_annotator'
            return result
        # Check extraction_provenance
        extraction_prov = ch_annotator.get('extraction_provenance', {})
        if not extraction_prov:
            result['status'] = 'no_extraction_provenance'
            return result
        # Detect category from the extraction provenance path
        path = extraction_prov.get('path', '')
        result['category'] = detect_source_category(path)
        # Skip non-ISIL/CSV sources (Phase 2 & 3 handled by other tooling)
        if result['category'] not in ['isil_csv', 'unknown']:
            result['status'] = f'skipped_phase2_{result["category"]}'
            return result
        # Check if already migrated (valid agent AND both dual timestamps)
        if extraction_prov.get('agent') != 'claude-conversation':
            if 'statement_created_at' in extraction_prov and 'source_archived_at' in extraction_prov:
                result['status'] = 'already_migrated'
                return result
        # Get annotation date for statement_created_at (may be None)
        annotation_prov = ch_annotator.get('annotation_provenance', {})
        annotation_date = annotation_prov.get('annotation_date')
        # Migrate extraction_provenance
        migrated_extraction_prov, changes = migrate_provenance_block(
            extraction_prov,
            annotation_date=annotation_date
        )
        result['changes'].extend(changes)
        # Update ch_annotator.extraction_provenance
        ch_annotator['extraction_provenance'] = migrated_extraction_prov
        # Get timestamps for entity claims inheritance
        source_archived_at = migrated_extraction_prov['source_archived_at']
        statement_created_at = migrated_extraction_prov['statement_created_at']
        # Migrate entity_claims (they inherit the extraction timestamps)
        entity_claims = ch_annotator.get('entity_claims', [])
        if entity_claims:
            migrated_claims, claims_count = migrate_entity_claims(
                entity_claims,
                source_archived_at,
                statement_created_at
            )
            ch_annotator['entity_claims'] = migrated_claims
            result['claims_migrated'] = claims_count
        # Update data
        data['ch_annotator'] = ch_annotator
        # Write file (unless dry run)
        if not dry_run:
            if HAS_RUAMEL and yaml_handler is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    yaml_handler.dump(data, f)
            elif pyyaml_module is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    pyyaml_module.safe_dump(data, f, default_flow_style=False, allow_unicode=True)
        result['status'] = 'migrated' if not dry_run else 'would_migrate'
    except Exception as e:
        # Never let one bad file abort the batch run; record and continue
        result['status'] = 'error'
        result['error'] = str(e)
    return result
def create_backup(backup_dir: Path) -> bool:
    """
    Create a timestamped backup copy of the custodian directory.

    Args:
        backup_dir: Base path whose parent and name determine the backup
            location; a `.YYYYMMDD_HHMMSS` suffix is appended.

    Returns:
        True if the backup was created, False otherwise (the reason is
        printed to stdout).
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = backup_dir.parent / f"{backup_dir.name}.{timestamp}"
    # Fix: previously a missing source directory fell through to a silent
    # `return False`, so the caller aborted with no explanation.
    if not CUSTODIAN_DIR.exists():
        print(f"Error creating backup: source directory {CUSTODIAN_DIR} does not exist")
        return False
    try:
        print(f"Creating backup at {backup_path}...")
        shutil.copytree(CUSTODIAN_DIR, backup_path)
        print(f"Backup created: {backup_path}")
        return True
    except Exception as e:
        print(f"Error creating backup: {e}")
        return False
def main() -> None:
    """
    CLI entry point: parse arguments, optionally back up, then migrate files.

    Prints a summary of results to stdout and optionally writes a JSON
    report. Exits with status 1 on backup failure or a missing --file.
    """
    parser = argparse.ArgumentParser(
        description='Migrate agent:claude-conversation provenance to Rule 35 compliant format'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without modifying files')
    parser.add_argument('--file', type=Path,
                        help='Process a specific file only')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process (0 = no limit)')
    parser.add_argument('--backup', action='store_true',
                        help='Create backup before processing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed progress')
    parser.add_argument('--report', type=Path,
                        help='Write JSON report to file')
    args = parser.parse_args()
    print("=" * 70)
    print("Migration: agent:claude-conversation → Rule 35 Compliant Provenance")
    print("=" * 70)
    print(f"Phase 1: ISIL/CSV Registry Sources")
    print(f"Timestamp: {MIGRATION_TIMESTAMP}")
    print(f"Dry run: {args.dry_run}")
    print()
    # Create backup if requested (skipped in dry-run since nothing is written)
    if args.backup and not args.dry_run:
        if not create_backup(BACKUP_DIR):
            print("Backup failed. Aborting.")
            sys.exit(1)
    # Collect files to process: either a single explicit file or the
    # whole custodian directory (sorted for deterministic progress/limits)
    if args.file:
        files = [args.file] if args.file.exists() else []
        if not files:
            print(f"Error: File not found: {args.file}")
            sys.exit(1)
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if args.limit > 0:
            files = files[:args.limit]
    print(f"Files to process: {len(files)}")
    print()
    # Process files, accumulating counters keyed by per-file status
    results = {
        'migrated': 0,
        'would_migrate': 0,
        'already_migrated': 0,
        'skipped': 0,
        'errors': 0,
        'claims_migrated': 0,
        'by_category': {},
        'details': [],
    }
    for i, filepath in enumerate(files, 1):
        result = process_file(filepath, dry_run=args.dry_run)
        # Update counters
        status = result['status']
        if status == 'migrated':
            results['migrated'] += 1
        elif status == 'would_migrate':
            results['would_migrate'] += 1
        elif status == 'already_migrated':
            results['already_migrated'] += 1
        elif status == 'error':
            results['errors'] += 1
        else:
            # Everything else (no_ch_annotator, empty_file, skipped_phase2_*)
            results['skipped'] += 1
        results['claims_migrated'] += result['claims_migrated']
        # Track by category
        cat = result['category']
        results['by_category'][cat] = results['by_category'].get(cat, 0) + 1
        # Store details only when they will be used (report or verbose),
        # to keep memory bounded on ~18k-file runs
        if args.report or args.verbose:
            results['details'].append(result)
        # Progress: per-file when verbose, milestone lines every 1000 files,
        # otherwise an in-place counter every 100 files
        if args.verbose or (i % 1000 == 0):
            print(f"[{i}/{len(files)}] {filepath.name}: {status}")
        elif i % 100 == 0:
            print(f"Processed {i}/{len(files)} files...", end='\r')
    print()
    print()
    print("=" * 70)
    print("RESULTS")
    print("=" * 70)
    if args.dry_run:
        print(f"Would migrate: {results['would_migrate']:,}")
    else:
        print(f"Migrated: {results['migrated']:,}")
    print(f"Already migrated: {results['already_migrated']:,}")
    print(f"Skipped (Phase 2): {results['skipped']:,}")
    print(f"Errors: {results['errors']:,}")
    print(f"Claims migrated: {results['claims_migrated']:,}")
    print()
    print("By category:")
    for cat, count in sorted(results['by_category'].items()):
        print(f" {cat}: {count:,}")
    # Write report
    if args.report:
        report_data = {
            'timestamp': MIGRATION_TIMESTAMP,
            'dry_run': args.dry_run,
            'summary': {
                'migrated': results['migrated'],
                'would_migrate': results['would_migrate'],
                'already_migrated': results['already_migrated'],
                'skipped': results['skipped'],
                'errors': results['errors'],
                'claims_migrated': results['claims_migrated'],
            },
            'by_category': results['by_category'],
            'details': results['details'],
        }
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
        print(f"\nReport written to: {args.report}")
    print()
    if args.dry_run:
        print("This was a DRY RUN. No files were modified.")
        print("Run without --dry-run to apply changes.")
if __name__ == '__main__':
    main()