#!/usr/bin/env python3
"""
Fast cleanup script for custodian YAML files.

This script:
1. DELETES `web_claims` sections - they contain noisy, incorrectly classified extractions
2. RENAMES `validated_entity_claims` to `web-enrichments`

Optimized for speed by only processing files that need changes.
"""
import sys
from pathlib import Path
from datetime import datetime
import subprocess

# Default location of the custodian YAML corpus; overridable via --dir.
DEFAULT_CUSTODIAN_DIR = '/Users/kempersc/apps/glam/data/custodian'


def remove_yaml_block(content: str, key: str) -> tuple[str, bool]:
    """Remove a top-level YAML block identified by ``key`` from raw text.

    Works line-by-line on the raw string (deliberately no YAML parser, so
    every untouched line is preserved byte-for-byte). A block starts at a
    column-0 ``key:`` line and ends at the next top-level key: a non-empty
    line with no leading whitespace that is neither a comment (``#``) nor a
    list item (``-``). Indented lines and blank lines inside the block are
    dropped with it.

    NOTE: top-level ``#`` comment lines that directly follow the removed
    block are also dropped — they do not terminate the skip.

    Args:
        key: Top-level key whose block should be deleted (trusted literal).

    Returns:
        (modified_content, was_modified)
    """
    new_lines = []
    skipping = False
    removed = False
    for line in content.split('\n'):
        # A column-0 "key:" line starts (or restarts) a block to delete.
        # startswith(f'{key}:') already implies no leading whitespace.
        if line.startswith(f'{key}:'):
            skipping = True
            removed = True
            continue
        if skipping:
            # Block ends at the next top-level key; keep that line.
            if line and not line.startswith((' ', '#', '-')):
                skipping = False
                new_lines.append(line)
            # Everything else (indented / blank / comment) stays dropped.
            continue
        new_lines.append(line)
    return '\n'.join(new_lines), removed


def rename_yaml_key(content: str, old_key: str, new_key: str) -> tuple[str, bool]:
    """Rename every top-level occurrence of ``old_key`` to ``new_key``.

    Only the key token is rewritten; the remainder of each matching line
    (value, inline comment) is left untouched, as is all other content.

    Returns:
        (modified_content, was_modified)
    """
    renamed = False
    out = []
    prefix = f'{old_key}:'
    for line in content.split('\n'):
        if line.startswith(prefix):
            out.append(f'{new_key}:' + line[len(prefix):])
            renamed = True
        else:
            out.append(line)
    return '\n'.join(out), renamed


def process_file(filepath: str, dry_run: bool = False) -> dict:
    """Apply both transformations to a single file.

    Reads the file as UTF-8, removes the ``web_claims`` block, renames
    ``validated_entity_claims`` to ``web-enrichments``, and writes the file
    back only if something actually changed (and not in dry-run mode).

    Returns a stats dict:
        web_claims_removed / validated_entity_claims_renamed: bool flags
        error: str message on failure, else None (errors never propagate)
    """
    stats = {
        'web_claims_removed': False,
        'validated_entity_claims_renamed': False,
        'error': None,
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        original_content = content

        content, removed = remove_yaml_block(content, 'web_claims')
        if removed:
            stats['web_claims_removed'] = True

        content, renamed = rename_yaml_key(
            content, 'validated_entity_claims', 'web-enrichments')
        if renamed:
            stats['validated_entity_claims_renamed'] = True

        # Avoid touching mtimes of files that need no change.
        if content != original_content and not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
    except Exception as e:
        # Best-effort batch tool: record the failure, keep going.
        stats['error'] = str(e)
    return stats


def _grep_files_with_key(directory: Path, key: str) -> set[str]:
    """Return absolute paths of ``*.yaml`` files in ``directory`` containing a
    top-level ``key:`` line.

    Uses shell grep with a glob expanded *inside* ``directory`` (via ``cwd``)
    so Python never builds a huge argv. ``|| true`` keeps a no-match exit
    status from looking like an error. SECURITY: ``key`` is interpolated into
    a shell command — callers must pass trusted literals only.
    """
    result = subprocess.run(
        f'grep -l "^{key}:" *.yaml 2>/dev/null || true',
        shell=True,
        capture_output=True,
        text=True,
        cwd=directory,
    )
    names = result.stdout.strip()
    if not names:
        return set()
    return {str(directory / name) for name in names.split('\n') if name}


def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='Clean up web_claims from custodian YAML files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Print each file processed')
    parser.add_argument('--dir', default=DEFAULT_CUSTODIAN_DIR,
                        help='Directory containing the custodian YAML files')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Finding files to process...")
    print(f"Started at: {datetime.now().isoformat()}")

    # Pre-filter with grep (fast) so we only open files that need changes.
    print("  Scanning for web_claims...")
    files_with_web_claims = _grep_files_with_key(custodian_dir, 'web_claims')
    print(f"  Found {len(files_with_web_claims)} files with web_claims")

    print("  Scanning for validated_entity_claims...")
    files_with_vec = _grep_files_with_key(custodian_dir, 'validated_entity_claims')
    print(f"  Found {len(files_with_vec)} files with validated_entity_claims")

    files_to_process = files_with_web_claims | files_with_vec
    print(f"  Total unique files to process: {len(files_to_process)}")
    print()

    total_stats = {
        'files_processed': 0,
        'web_claims_removed': 0,
        'validated_entity_claims_renamed': 0,
        'errors': 0,
    }

    for filepath in sorted(files_to_process):
        if not filepath:
            continue
        stats = process_file(filepath, dry_run=args.dry_run)
        total_stats['files_processed'] += 1
        if stats['web_claims_removed']:
            total_stats['web_claims_removed'] += 1
            if args.verbose:
                print(f"  Removed web_claims: {Path(filepath).name}")
        if stats['validated_entity_claims_renamed']:
            total_stats['validated_entity_claims_renamed'] += 1
            if args.verbose:
                print(f"  Renamed validated_entity_claims: {Path(filepath).name}")
        if stats['error']:
            total_stats['errors'] += 1
            print(f"  ERROR in {Path(filepath).name}: {stats['error']}")

        # Coarse progress indicator for large corpora.
        if total_stats['files_processed'] % 500 == 0:
            print(f"  ... processed {total_stats['files_processed']} files")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"web_claims sections removed: {total_stats['web_claims_removed']}")
    print(f"validated_entity_claims renamed: {total_stats['validated_entity_claims_renamed']}")
    print(f"Errors: {total_stats['errors']}")
    print(f"Completed at: {datetime.now().isoformat()}")
    print()
    if args.dry_run:
        print("[DRY RUN] No changes were made. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()