#!/usr/bin/env python3
"""
Fast cleanup script for custodian YAML files.

This script:
1. DELETES `web_claims` sections - they contain noisy, incorrectly classified extractions
2. RENAMES `validated_entity_claims` to `web-enrichments`

Optimized for speed by only processing files that need changes.
"""
import sys
from pathlib import Path
from datetime import datetime
import subprocess

# Default location of the custodian YAML corpus; overridable via --dir.
DEFAULT_CUSTODIAN_DIR = '/Users/kempersc/apps/glam/data/custodian'


def remove_yaml_block(content: str, key: str) -> tuple[str, bool]:
    """Remove a top-level YAML block identified by ``key`` from raw text.

    Works line-by-line on the raw string (deliberately no YAML parser, so
    every untouched line is preserved byte-for-byte). A block starts at a
    column-0 ``key:`` line and ends at the next top-level key: a non-empty
    line with no leading whitespace that is neither a comment (``#``) nor a
    list item (``-``). Indented lines and blank lines inside the block are
    dropped with it.

    NOTE: top-level ``#`` comment lines that directly follow the removed
    block are also dropped — they do not terminate the skip.

    Args:
        key: Top-level key whose block should be deleted (trusted literal).

    Returns:
        (modified_content, was_modified)
    """
    new_lines = []
    skipping = False
    removed = False
    for line in content.split('\n'):
        # A column-0 "key:" line starts (or restarts) a block to delete.
        # startswith(f'{key}:') already implies no leading whitespace.
        if line.startswith(f'{key}:'):
            skipping = True
            removed = True
            continue
        if skipping:
            # Block ends at the next top-level key; keep that line.
            if line and not line.startswith((' ', '#', '-')):
                skipping = False
                new_lines.append(line)
            # Everything else (indented / blank / comment) stays dropped.
            continue
        new_lines.append(line)
    return '\n'.join(new_lines), removed


def rename_yaml_key(content: str, old_key: str, new_key: str) -> tuple[str, bool]:
    """Rename every top-level occurrence of ``old_key`` to ``new_key``.

    Only the key token is rewritten; the remainder of each matching line
    (value, inline comment) is left untouched, as is all other content.

    Returns:
        (modified_content, was_modified)
    """
    renamed = False
    out = []
    prefix = f'{old_key}:'
    for line in content.split('\n'):
        if line.startswith(prefix):
            out.append(f'{new_key}:' + line[len(prefix):])
            renamed = True
        else:
            out.append(line)
    return '\n'.join(out), renamed


def process_file(filepath: str, dry_run: bool = False) -> dict:
    """Apply both transformations to a single file.

    Reads the file as UTF-8, removes the ``web_claims`` block, renames
    ``validated_entity_claims`` to ``web-enrichments``, and writes the file
    back only if something actually changed (and not in dry-run mode).

    Returns a stats dict:
        web_claims_removed / validated_entity_claims_renamed: bool flags
        error: str message on failure, else None (errors never propagate)
    """
    stats = {
        'web_claims_removed': False,
        'validated_entity_claims_renamed': False,
        'error': None,
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        original_content = content

        content, removed = remove_yaml_block(content, 'web_claims')
        if removed:
            stats['web_claims_removed'] = True

        content, renamed = rename_yaml_key(
            content, 'validated_entity_claims', 'web-enrichments')
        if renamed:
            stats['validated_entity_claims_renamed'] = True

        # Avoid touching mtimes of files that need no change.
        if content != original_content and not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
    except Exception as e:
        # Best-effort batch tool: record the failure, keep going.
        stats['error'] = str(e)
    return stats


def _grep_files_with_key(directory: Path, key: str) -> set[str]:
    """Return absolute paths of ``*.yaml`` files in ``directory`` containing a
    top-level ``key:`` line.

    Uses shell grep with a glob expanded *inside* ``directory`` (via ``cwd``)
    so Python never builds a huge argv. ``|| true`` keeps a no-match exit
    status from looking like an error. SECURITY: ``key`` is interpolated into
    a shell command — callers must pass trusted literals only.
    """
    result = subprocess.run(
        f'grep -l "^{key}:" *.yaml 2>/dev/null || true',
        shell=True,
        capture_output=True,
        text=True,
        cwd=directory,
    )
    names = result.stdout.strip()
    if not names:
        return set()
    return {str(directory / name) for name in names.split('\n') if name}


def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='Clean up web_claims from custodian YAML files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Print each file processed')
    parser.add_argument('--dir', default=DEFAULT_CUSTODIAN_DIR,
                        help='Directory containing the custodian YAML files')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Finding files to process...")
    print(f"Started at: {datetime.now().isoformat()}")

    # Pre-filter with grep (fast) so we only open files that need changes.
    print("  Scanning for web_claims...")
    files_with_web_claims = _grep_files_with_key(custodian_dir, 'web_claims')
    print(f"  Found {len(files_with_web_claims)} files with web_claims")

    print("  Scanning for validated_entity_claims...")
    files_with_vec = _grep_files_with_key(custodian_dir, 'validated_entity_claims')
    print(f"  Found {len(files_with_vec)} files with validated_entity_claims")

    files_to_process = files_with_web_claims | files_with_vec
    print(f"  Total unique files to process: {len(files_to_process)}")
    print()

    total_stats = {
        'files_processed': 0,
        'web_claims_removed': 0,
        'validated_entity_claims_renamed': 0,
        'errors': 0,
    }

    for filepath in sorted(files_to_process):
        if not filepath:
            continue
        stats = process_file(filepath, dry_run=args.dry_run)
        total_stats['files_processed'] += 1
        if stats['web_claims_removed']:
            total_stats['web_claims_removed'] += 1
            if args.verbose:
                print(f"  Removed web_claims: {Path(filepath).name}")
        if stats['validated_entity_claims_renamed']:
            total_stats['validated_entity_claims_renamed'] += 1
            if args.verbose:
                print(f"  Renamed validated_entity_claims: {Path(filepath).name}")
        if stats['error']:
            total_stats['errors'] += 1
            print(f"  ERROR in {Path(filepath).name}: {stats['error']}")

        # Coarse progress indicator for large corpora.
        if total_stats['files_processed'] % 500 == 0:
            print(f"  ... processed {total_stats['files_processed']} files")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"web_claims sections removed: {total_stats['web_claims_removed']}")
    print(f"validated_entity_claims renamed: {total_stats['validated_entity_claims_renamed']}")
    print(f"Errors: {total_stats['errors']}")
    print(f"Completed at: {datetime.now().isoformat()}")
    print()
    if args.dry_run:
        print("[DRY RUN] No changes were made. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()