#!/usr/bin/env python3
|
|
"""
|
|
Migrate `agent: claude-conversation` provenance to Rule 35 compliant dual timestamps.
|
|
|
|
This script handles Phase 1: ISIL/CSV registry sources (~18,000 files).
|
|
These files have provenance paths like `/files/{country}_complete.yaml` and don't
|
|
require LLM processing - just timestamp restructuring.
|
|
|
|
Phase 2 (conversation sources) and Phase 3 (web sources) require GLM4.7 + web-reader
|
|
and are handled separately.
|
|
|
|
Migration Rules (from .opencode/PROVENANCE_TIMESTAMP_RULES.md):
|
|
1. Every provenance MUST have `statement_created_at` and `source_archived_at`
|
|
2. `agent: claude-conversation` is INVALID - replace with proper agent identifier
|
|
3. `source_archived_at` must be <= `statement_created_at`
|
|
|
|
Usage:
|
|
# Dry run (no changes)
|
|
python scripts/migrate_claude_conversation_provenance.py --dry-run
|
|
|
|
# Process specific file
|
|
python scripts/migrate_claude_conversation_provenance.py --file data/custodian/JP-01-TOM-L-H.yaml
|
|
|
|
# Process all files (with backup)
|
|
python scripts/migrate_claude_conversation_provenance.py --backup
|
|
|
|
# Limit processing
|
|
python scripts/migrate_claude_conversation_provenance.py --limit 100 --dry-run
|
|
|
|
Author: OpenCode/Claude
|
|
Created: 2025-12-30
|
|
Related: .opencode/PROVENANCE_TIMESTAMP_RULES.md, .opencode/CLAUDE_CONVERSATION_MIGRATION_SPEC.md
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import shutil
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
# YAML handling - use ruamel.yaml to preserve formatting.
# Preference order: ruamel.yaml (round-trip safe, keeps quoting/ordering) ->
# PyYAML (works, but may reformat files on write) -> hard exit.
HAS_RUAMEL = False
yaml_handler = None   # ruamel.yaml.YAML instance when available, else None
pyyaml_module = None  # PyYAML module when used as fallback, else None

try:
    from ruamel.yaml import YAML  # type: ignore[import-untyped]
    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping
    HAS_RUAMEL = True
except ImportError:
    try:
        import yaml as pyyaml_module  # type: ignore[import-untyped]
        print("Warning: ruamel.yaml not installed. Using PyYAML (may lose formatting).")
    except ImportError:
        # No YAML backend at all: the script cannot do anything useful.
        print("Error: No YAML library available. Install ruamel.yaml or PyYAML.")
        sys.exit(1)
|
|
|
|
# Constants
CUSTODIAN_DIR = Path("data/custodian")
BACKUP_DIR = Path("data/custodian.backup")
# Captured once at import time so every file touched in a single run carries
# the same migration timestamp.
MIGRATION_TIMESTAMP = datetime.now(timezone.utc).isoformat()

# Source type patterns for categorization.
# Category -> list of regexes tried against the provenance path; first match
# wins (see detect_source_category).
SOURCE_PATTERNS = {
    'isil_csv': [
        r'/files/.*_complete\.yaml',
        r'/files/.*_isil.*\.yaml',
        r'/files/.*_ch_annotator\.yaml',
        r'/instances/.*\.yaml',
    ],
    'conversation': [
        r'/conversations/[a-f0-9-]+',
    ],
    'web': [
        r'https?://',
    ],
}

# Agent mapping based on source file patterns.
# Keys are substrings matched case-insensitively against the provenance path;
# 'default' is the fallback agent when no substring matches.
AGENT_MAPPING = {
    'japan_complete': 'batch-script-create-custodian-from-ch-annotator',
    'austria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'switzerland_isil': 'batch-script-create-custodian-from-ch-annotator',
    'czech_unified': 'batch-script-create-custodian-from-ch-annotator',
    'bulgaria_complete': 'batch-script-create-custodian-from-ch-annotator',
    'belgium_complete': 'batch-script-create-custodian-from-ch-annotator',
    'netherlands_complete': 'batch-script-create-custodian-from-ch-annotator',
    'norway': 'batch-script-create-custodian-from-ch-annotator',
    'default': 'batch-script-ch-annotator-extraction',
}
|
|
|
|
|
|
def detect_source_category(path: str, patterns: Optional[Dict[str, List[str]]] = None) -> str:
    """
    Detect the source category from the provenance path.

    Args:
        path: Provenance path or URL (e.g. '/files/japan_complete.yaml').
        patterns: Optional mapping of category name -> list of regex patterns.
            Defaults to the module-level SOURCE_PATTERNS.

    Returns:
        The first category whose patterns match (dict order defines
        precedence): 'isil_csv', 'conversation', 'web' with the default
        patterns — or 'unknown' when the path is empty or nothing matches.
    """
    if not path:
        return 'unknown'

    if patterns is None:
        patterns = SOURCE_PATTERNS

    for category, pattern_list in patterns.items():
        for pattern in pattern_list:
            if re.search(pattern, path):
                return category

    return 'unknown'
|
|
|
|
|
|
def get_agent_for_source(path: str, mapping: Optional[Dict[str, str]] = None) -> str:
    """
    Determine the appropriate agent identifier based on source path.

    Args:
        path: Provenance path; mapping keys are matched as case-insensitive
            substrings of it.
        mapping: Optional substring -> agent-identifier mapping containing a
            'default' key. Defaults to the module-level AGENT_MAPPING.

    Returns:
        The agent for the first key (dict order) found in the lowercased
        path, or mapping['default'] when the path is empty or nothing
        matches.
    """
    if mapping is None:
        mapping = AGENT_MAPPING

    if not path:
        return mapping['default']

    path_lower = path.lower()
    for key, agent in mapping.items():
        # 'default' is the fallback sentinel, not a path pattern — skip it so
        # a path containing the literal word 'default' can't match it early.
        if key == 'default':
            continue
        if key in path_lower:
            return agent

    return mapping['default']
|
|
|
|
|
|
def _to_iso_string(value: Any) -> Any:
    """Coerce a datetime to its ISO-8601 string; pass other values through.

    YAML loaders (ruamel.yaml and PyYAML) may parse timestamp scalars into
    datetime objects, while this module compares and stores timestamps as
    strings.
    """
    if isinstance(value, datetime):
        return value.isoformat()
    return value


def migrate_provenance_block(
    provenance: Dict[str, Any],
    annotation_date: Optional[str] = None,
    parent_source_archived_at: Optional[str] = None
) -> Tuple[Dict[str, Any], List[str]]:
    """
    Migrate a single provenance block to Rule 35 compliant format.

    Args:
        provenance: The provenance dict to migrate
        annotation_date: The annotation date from parent context (for statement_created_at)
        parent_source_archived_at: Inherited source_archived_at from parent

    Returns:
        Tuple of (migrated provenance dict, list of change tags). Returns the
        input unchanged with ['already_migrated'] when both dual timestamps
        exist and the agent is not the invalid placeholder.
    """
    changes = []

    # Check if already migrated: both Rule 35 timestamps present AND agent is
    # not the invalid 'claude-conversation' placeholder.
    if 'statement_created_at' in provenance and 'source_archived_at' in provenance:
        if provenance.get('agent') != 'claude-conversation':
            return provenance, ['already_migrated']

    # Normalize all timestamp inputs to ISO strings up front. YAML loaders
    # may have parsed them into datetime objects, on which the str-only
    # handling below (e.g. .replace('Z', ...)) would raise TypeError.
    existing_timestamp = _to_iso_string(provenance.get('timestamp'))
    annotation_date = _to_iso_string(annotation_date)
    parent_source_archived_at = _to_iso_string(parent_source_archived_at)

    # Determine source_archived_at: inherit from parent, fall back to the
    # block's own legacy timestamp, else default to the migration run time.
    if parent_source_archived_at:
        source_archived_at = parent_source_archived_at
    elif existing_timestamp:
        source_archived_at = existing_timestamp
    else:
        source_archived_at = MIGRATION_TIMESTAMP
        changes.append('source_archived_at_defaulted')

    # Determine statement_created_at
    if annotation_date:
        statement_created_at = annotation_date
    elif existing_timestamp:
        # If no annotation_date, use existing timestamp for both (simultaneous)
        statement_created_at = existing_timestamp
    else:
        statement_created_at = MIGRATION_TIMESTAMP
        changes.append('statement_created_at_defaulted')

    # Ensure temporal ordering: source_archived_at <= statement_created_at
    try:
        archived_dt = datetime.fromisoformat(source_archived_at.replace('Z', '+00:00'))
        created_dt = datetime.fromisoformat(statement_created_at.replace('Z', '+00:00'))
        if archived_dt > created_dt:
            # Swap if out of order
            source_archived_at, statement_created_at = statement_created_at, source_archived_at
            changes.append('timestamps_reordered')
    except (ValueError, AttributeError, TypeError):
        # Unparseable timestamps are kept as-is; flag for manual review.
        changes.append('timestamp_parse_warning')

    # Determine new agent: replace the invalid placeholder (or a missing
    # agent) with a path-derived identifier; keep any other existing agent.
    old_agent = provenance.get('agent', '')
    if old_agent == 'claude-conversation' or not old_agent:
        path = provenance.get('path', '')
        new_agent = get_agent_for_source(path)
        changes.append(f'agent_changed:{old_agent}->{new_agent}')
    else:
        new_agent = old_agent

    # Detect source type
    path = provenance.get('path', '')
    source_type = detect_source_category(path)

    # Build migrated provenance
    migrated = {
        # Preserve existing fields
        'namespace': provenance.get('namespace', 'glam'),
        'path': path,
        'context_convention': provenance.get('context_convention', 'ch_annotator-v1_7_0'),

        # NEW: Dual timestamps (Rule 35)
        'source_archived_at': source_archived_at,
        'statement_created_at': statement_created_at,

        # NEW: Valid agent identifier
        'agent': new_agent,

        # NEW: Source classification
        'source_type': source_type,

        # Migration tracking
        'migration_note': f'Migrated from agent:claude-conversation on {MIGRATION_TIMESTAMP[:10]}',
    }

    # The rebuilt dict intentionally omits the legacy single 'timestamp'
    # field (replaced by the dual timestamps); just record that it existed.
    if 'timestamp' in provenance:
        changes.append('timestamp_field_removed')

    return migrated, changes
|
|
|
|
|
|
def migrate_entity_claims(
    claims: List[Dict[str, Any]],
    parent_source_archived_at: str,
    parent_statement_created_at: str
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Bring every entity_claims provenance block up to Rule 35 compliance.

    Claims without a provenance block are left untouched, as are claims
    whose provenance already has both a valid agent and a
    statement_created_at. The claims list is updated in place.

    Returns:
        Tuple of (the same claims list, number of claims whose provenance
        was rewritten).
    """
    migrated_count = 0

    for claim in claims:
        if 'provenance' not in claim:
            continue

        prov = claim['provenance']

        # Migrate when the agent is the invalid placeholder or the dual
        # timestamps have not been added yet.
        needs_migration = (
            prov.get('agent') == 'claude-conversation'
            or 'statement_created_at' not in prov
        )
        if not needs_migration:
            continue

        claim['provenance'], _ = migrate_provenance_block(
            prov,
            annotation_date=parent_statement_created_at,
            parent_source_archived_at=parent_source_archived_at,
        )
        migrated_count += 1

    return claims, migrated_count
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = False) -> Dict[str, Any]:
    """
    Process a single custodian YAML file.

    Loads the file (ruamel.yaml preferred, PyYAML fallback), migrates the
    ch_annotator extraction_provenance and every entity_claims provenance to
    Rule 35 dual timestamps, then writes the file back unless dry_run.
    Only Phase 1 categories ('isil_csv' or 'unknown') are processed here;
    'conversation' and 'web' sources are skipped for Phases 2/3.

    Args:
        filepath: Path to the custodian YAML file.
        dry_run: When True, compute changes but do not write the file.

    Returns:
        Result dict with keys: file, status, changes, claims_migrated,
        category, error.
    """
    result = {
        'file': str(filepath),
        'status': 'unknown',
        'changes': [],
        'claims_migrated': 0,
        'category': 'unknown',
        'error': None,
    }

    try:
        # Read file with whichever YAML backend was selected at import time.
        if HAS_RUAMEL and yaml_handler is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml_handler.load(f)
        elif pyyaml_module is not None:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = pyyaml_module.safe_load(f)
        else:
            # Should be unreachable (import section exits), but kept defensive.
            result['status'] = 'error'
            result['error'] = 'No YAML library available'
            return result

        if not data:
            result['status'] = 'empty_file'
            return result

        # Check if file has ch_annotator section
        ch_annotator = data.get('ch_annotator', {})
        if not ch_annotator:
            result['status'] = 'no_ch_annotator'
            return result

        # Check extraction_provenance
        extraction_prov = ch_annotator.get('extraction_provenance', {})
        if not extraction_prov:
            result['status'] = 'no_extraction_provenance'
            return result

        # Detect category
        path = extraction_prov.get('path', '')
        result['category'] = detect_source_category(path)

        # Skip non-ISIL/CSV sources (Phase 2 & 3)
        if result['category'] not in ['isil_csv', 'unknown']:
            result['status'] = f'skipped_phase2_{result["category"]}'
            return result

        # Check if already migrated: valid agent plus both dual timestamps
        if extraction_prov.get('agent') != 'claude-conversation':
            if 'statement_created_at' in extraction_prov and 'source_archived_at' in extraction_prov:
                result['status'] = 'already_migrated'
                return result

        # Get annotation date for statement_created_at
        annotation_prov = ch_annotator.get('annotation_provenance', {})
        annotation_date = annotation_prov.get('annotation_date')

        # Migrate extraction_provenance
        migrated_extraction_prov, changes = migrate_provenance_block(
            extraction_prov,
            annotation_date=annotation_date
        )
        result['changes'].extend(changes)

        # Update ch_annotator.extraction_provenance
        ch_annotator['extraction_provenance'] = migrated_extraction_prov

        # Get timestamps for entity claims inheritance (claims inherit the
        # file-level archived/created pair)
        source_archived_at = migrated_extraction_prov['source_archived_at']
        statement_created_at = migrated_extraction_prov['statement_created_at']

        # Migrate entity_claims
        entity_claims = ch_annotator.get('entity_claims', [])
        if entity_claims:
            migrated_claims, claims_count = migrate_entity_claims(
                entity_claims,
                source_archived_at,
                statement_created_at
            )
            ch_annotator['entity_claims'] = migrated_claims
            result['claims_migrated'] = claims_count

        # Update data
        data['ch_annotator'] = ch_annotator

        # Write file (unless dry run)
        if not dry_run:
            if HAS_RUAMEL and yaml_handler is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    yaml_handler.dump(data, f)
            elif pyyaml_module is not None:
                with open(filepath, 'w', encoding='utf-8') as f:
                    pyyaml_module.safe_dump(data, f, default_flow_style=False, allow_unicode=True)

        result['status'] = 'migrated' if not dry_run else 'would_migrate'

    except Exception as e:
        # Broad catch is deliberate: one bad file must not abort the batch;
        # the error is surfaced in the per-file result instead.
        result['status'] = 'error'
        result['error'] = str(e)

    return result
|
|
|
|
|
|
def create_backup(backup_dir: Path, source_dir: Optional[Path] = None) -> bool:
    """
    Create a timestamped backup copy of the custodian directory.

    Args:
        backup_dir: Base path for the backup; the copy is written to a
            sibling named '<backup_dir.name>.<YYYYmmdd_HHMMSS>'.
        source_dir: Directory to back up. Defaults to CUSTODIAN_DIR.

    Returns:
        True if the backup was created; False if the source directory does
        not exist or the copy failed.
    """
    if source_dir is None:
        source_dir = CUSTODIAN_DIR

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = backup_dir.parent / f"{backup_dir.name}.{timestamp}"

    # Previously a missing source directory fell through to a silent False,
    # leaving the caller with only a generic "Backup failed." message.
    if not source_dir.exists():
        print(f"Error creating backup: source directory not found: {source_dir}")
        return False

    try:
        print(f"Creating backup at {backup_path}...")
        shutil.copytree(source_dir, backup_path)
        print(f"Backup created: {backup_path}")
        return True
    except Exception as e:
        print(f"Error creating backup: {e}")
        return False
|
|
|
|
|
|
def main():
    """Command-line entry point for the Phase 1 provenance migration.

    Parses arguments, optionally backs up the custodian directory, migrates
    each selected YAML file, prints a summary, and optionally writes a JSON
    report. Exits non-zero on backup failure or a missing --file target.
    """
    parser = argparse.ArgumentParser(
        description='Migrate agent:claude-conversation provenance to Rule 35 compliant format'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without modifying files')
    parser.add_argument('--file', type=Path,
                        help='Process a specific file only')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process (0 = no limit)')
    parser.add_argument('--backup', action='store_true',
                        help='Create backup before processing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed progress')
    parser.add_argument('--report', type=Path,
                        help='Write JSON report to file')

    args = parser.parse_args()

    print("=" * 70)
    print("Migration: agent:claude-conversation → Rule 35 Compliant Provenance")
    print("=" * 70)
    print(f"Phase 1: ISIL/CSV Registry Sources")
    print(f"Timestamp: {MIGRATION_TIMESTAMP}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Create backup if requested; skipped on dry runs since nothing changes.
    if args.backup and not args.dry_run:
        if not create_backup(BACKUP_DIR):
            print("Backup failed. Aborting.")
            sys.exit(1)

    # Collect files to process: either the single --file target or every
    # *.yaml in the custodian directory.
    if args.file:
        files = [args.file] if args.file.exists() else []
        if not files:
            print(f"Error: File not found: {args.file}")
            sys.exit(1)
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))

    if args.limit > 0:
        files = files[:args.limit]

    print(f"Files to process: {len(files)}")
    print()

    # Process files, accumulating per-status counters, per-category counts,
    # and (when needed) per-file details.
    results = {
        'migrated': 0,
        'would_migrate': 0,
        'already_migrated': 0,
        'skipped': 0,
        'errors': 0,
        'claims_migrated': 0,
        'by_category': {},
        'details': [],
    }

    for i, filepath in enumerate(files, 1):
        result = process_file(filepath, dry_run=args.dry_run)

        # Update counters
        status = result['status']
        if status == 'migrated':
            results['migrated'] += 1
        elif status == 'would_migrate':
            results['would_migrate'] += 1
        elif status == 'already_migrated':
            results['already_migrated'] += 1
        elif status == 'error':
            results['errors'] += 1
        else:
            # Any other status (no_ch_annotator, skipped_phase2_*, ...) is
            # counted as skipped.
            results['skipped'] += 1

        results['claims_migrated'] += result['claims_migrated']

        # Track by category
        cat = result['category']
        results['by_category'][cat] = results['by_category'].get(cat, 0) + 1

        # Store details only when they will actually be consumed (report or
        # verbose), to keep memory bounded over ~18k files.
        if args.report or args.verbose:
            results['details'].append(result)

        # Progress: per-file line when verbose or every 1000 files; a
        # carriage-return ticker every 100 files otherwise.
        if args.verbose or (i % 1000 == 0):
            print(f"[{i}/{len(files)}] {filepath.name}: {status}")
        elif i % 100 == 0:
            print(f"Processed {i}/{len(files)} files...", end='\r')

    print()
    print()
    print("=" * 70)
    print("RESULTS")
    print("=" * 70)

    if args.dry_run:
        print(f"Would migrate: {results['would_migrate']:,}")
    else:
        print(f"Migrated: {results['migrated']:,}")

    print(f"Already migrated: {results['already_migrated']:,}")
    print(f"Skipped (Phase 2): {results['skipped']:,}")
    print(f"Errors: {results['errors']:,}")
    print(f"Claims migrated: {results['claims_migrated']:,}")
    print()
    print("By category:")
    for cat, count in sorted(results['by_category'].items()):
        print(f" {cat}: {count:,}")

    # Write report
    if args.report:
        report_data = {
            'timestamp': MIGRATION_TIMESTAMP,
            'dry_run': args.dry_run,
            'summary': {
                'migrated': results['migrated'],
                'would_migrate': results['would_migrate'],
                'already_migrated': results['already_migrated'],
                'skipped': results['skipped'],
                'errors': results['errors'],
                'claims_migrated': results['claims_migrated'],
            },
            'by_category': results['by_category'],
            'details': results['details'],
        }
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
        print(f"\nReport written to: {args.report}")

    print()
    if args.dry_run:
        print("This was a DRY RUN. No files were modified.")
        print("Run without --dry-run to apply changes.")
|
|
|
|
# Script entry point: only run the migration when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|