224 lines
7.3 KiB
Python
224 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fast cleanup script for custodian YAML files.
|
|
|
|
This script:
|
|
1. DELETES `web_claims` sections - they contain noisy, incorrectly classified extractions
|
|
2. RENAMES `validated_entity_claims` to `web-enrichments`
|
|
|
|
Optimized for speed by only processing files that need changes.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import subprocess
|
|
|
|
|
|
def remove_yaml_block(content: str, key: str) -> tuple[str, bool]:
    """
    Remove every top-level YAML block named `key` from `content`.

    The scan is purely line-based (no YAML parsing). A block starts at a
    column-0 line beginning with ``key:`` and runs up to, but not including,
    the next line that opens a new top-level key. While inside a block,
    indented lines, blank lines, column-0 comments (``#``) and column-0
    sequence items (``-``) are all treated as part of the block and dropped.

    Args:
        content: Full text of the YAML document.
        key: Name of the top-level key whose block should be removed.

    Returns:
        (modified_content, was_modified). When nothing is removed the content
        is returned unchanged. A trailing newline present in the input is
        preserved even if the removed block was the last one in the file.
    """
    header = f'{key}:'
    kept = []
    skipping = False
    removed = False

    for line in content.split('\n'):
        # Start of a block to drop (top-level only, so no leading whitespace).
        if line.startswith(header) and not line.startswith(' '):
            skipping = True
            removed = True
            continue

        if skipping:
            # Still inside the removed block: indented content, blank lines,
            # comments and unindented list items all belong to it.
            if not line or line.startswith((' ', '#', '-')):
                continue
            # Anything else at column 0 is the next top-level key.
            skipping = False

        kept.append(line)

    result = '\n'.join(kept)

    # If the removed block was last in the file, the empty element produced by
    # the final '\n' was swallowed along with it; restore the newline so the
    # file still ends with one.
    if removed and result and content.endswith('\n') and not result.endswith('\n'):
        result += '\n'

    return result, removed
|
|
|
|
|
|
def rename_yaml_key(content: str, old_key: str, new_key: str) -> tuple[str, bool]:
    """
    Rename a top-level YAML key.

    Every line that begins at column 0 with ``old_key:`` has that prefix
    swapped for ``new_key:``; the rest of the line and all other lines are
    left untouched. The match is line-based, no YAML parsing is involved.

    Returns (modified_content, was_modified).
    """
    old_prefix = f'{old_key}:'
    new_prefix = f'{new_key}:'

    renamed = False
    rewritten = []

    for raw in content.split('\n'):
        # Only unindented lines can be top-level keys.
        is_target = not raw.startswith(' ') and raw.startswith(old_prefix)
        if is_target:
            # The prefix sits at index 0, so slicing it off is the same as
            # replacing its first occurrence.
            raw = new_prefix + raw[len(old_prefix):]
            renamed = True
        rewritten.append(raw)

    return '\n'.join(rewritten), renamed
|
|
|
|
|
|
def process_file(filepath: str, dry_run: bool = False) -> dict:
    """
    Clean up a single custodian YAML file.

    Drops the top-level `web_claims` block and renames the top-level
    `validated_entity_claims` key to `web-enrichments`. The file is rewritten
    only when its content actually changed and `dry_run` is false.

    Returns a dict with the boolean flags 'web_claims_removed' and
    'validated_entity_claims_renamed', plus 'error', which is None on success
    or the message of whatever exception was raised while processing.
    """
    outcome = {
        'web_claims_removed': False,
        'validated_entity_claims_renamed': False,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as src:
            before = src.read()

        # Step 1: drop the web_claims block.
        after, dropped = remove_yaml_block(before, 'web_claims')
        outcome['web_claims_removed'] = bool(dropped)

        # Step 2: rename validated_entity_claims to web-enrichments.
        after, relabelled = rename_yaml_key(after, 'validated_entity_claims', 'web-enrichments')
        outcome['validated_entity_claims_renamed'] = bool(relabelled)

        # Step 3: persist, unless this is a dry run or nothing changed.
        needs_write = not dry_run and after != before
        if needs_write:
            with open(filepath, 'w', encoding='utf-8') as dst:
                dst.write(after)
    except Exception as exc:
        # Failures are reported per file so one bad file does not stop the run.
        outcome['error'] = str(exc)

    return outcome
|
|
|
|
|
|
# Default location of the custodian YAML files; override with --custodian-dir.
_DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# Number of file names handed to a single grep invocation. Batching keeps each
# command line well below the OS argument-length limit however many files exist.
_GREP_BATCH_SIZE = 500


def _find_files_with_key(directory: Path, filenames: list[str], key: str) -> set[str]:
    """
    Return the paths of the files that contain `key` as a top-level YAML key.

    Runs ``grep -l`` from inside `directory` over `filenames` in batches, with
    no shell involved. A file matches when some line starts with ``key:``.

    Args:
        directory: Directory the file names are relative to.
        filenames: Bare file names (no directory part) to scan.
        key: Top-level key to look for.

    Returns:
        Set of ``str(directory / name)`` for every matching file.

    Raises:
        OSError: if the `grep` executable cannot be started.
    """
    matches = set()
    for start in range(0, len(filenames), _GREP_BATCH_SIZE):
        batch = filenames[start:start + _GREP_BATCH_SIZE]
        result = subprocess.run(
            # '--' stops option parsing so names starting with '-' are safe.
            ['grep', '-l', '-e', f'^{key}:', '--', *batch],
            capture_output=True,
            text=True,
            cwd=directory,
        )
        # grep exits 0 on a match and 1 on no match; anything higher is a real
        # error (e.g. an unreadable file) and must not be silently dropped.
        if result.returncode > 1:
            print(f"  WARNING: grep reported errors: {result.stderr.strip()}", file=sys.stderr)
        matches.update(str(directory / name) for name in result.stdout.splitlines() if name)
    return matches


def main():
    """
    Command-line entry point.

    Finds the custodian YAML files that contain a top-level `web_claims` or
    `validated_entity_claims` key, runs `process_file` on each of them and
    prints a summary.

    Options:
        --dry-run        Report what would change without writing any file.
        --verbose, -v    Print each file that was changed.
        --custodian-dir  Directory to scan (defaults to the project data dir).

    Exits with status 1 when the directory does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Clean up web_claims from custodian YAML files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--verbose', '-v', action='store_true', help='Print each file processed')
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=_DEFAULT_CUSTODIAN_DIR,
        help='Directory containing the custodian YAML files (default: %(default)s)',
    )
    args = parser.parse_args()

    custodian_dir = args.custodian_dir

    if not custodian_dir.is_dir():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Finding files to process...")
    print(f"Started at: {datetime.now().isoformat()}")

    # List the candidates once. Hidden files are skipped to mirror what a
    # shell `*.yaml` glob would match.
    yaml_names = sorted(
        p.name for p in custodian_dir.glob('*.yaml') if not p.name.startswith('.')
    )

    # Pre-filter with grep (fast) so only files needing changes are opened.
    print("  Scanning for web_claims...")
    files_with_web_claims = _find_files_with_key(custodian_dir, yaml_names, 'web_claims')
    print(f"  Found {len(files_with_web_claims)} files with web_claims")

    print("  Scanning for validated_entity_claims...")
    files_with_vec = _find_files_with_key(custodian_dir, yaml_names, 'validated_entity_claims')
    print(f"  Found {len(files_with_vec)} files with validated_entity_claims")

    # Combine files to process
    files_to_process = files_with_web_claims | files_with_vec
    print(f"  Total unique files to process: {len(files_to_process)}")
    print()

    total_stats = {
        'files_processed': 0,
        'web_claims_removed': 0,
        'validated_entity_claims_renamed': 0,
        'errors': 0,
    }

    for filepath in sorted(files_to_process):
        stats = process_file(filepath, dry_run=args.dry_run)
        total_stats['files_processed'] += 1

        if stats['web_claims_removed']:
            total_stats['web_claims_removed'] += 1
            if args.verbose:
                print(f"  Removed web_claims: {Path(filepath).name}")

        if stats['validated_entity_claims_renamed']:
            total_stats['validated_entity_claims_renamed'] += 1
            if args.verbose:
                print(f"  Renamed validated_entity_claims: {Path(filepath).name}")

        if stats['error']:
            total_stats['errors'] += 1
            print(f"  ERROR in {Path(filepath).name}: {stats['error']}")

        # Progress indicator
        if total_stats['files_processed'] % 500 == 0:
            print(f"  ... processed {total_stats['files_processed']} files")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"web_claims sections removed: {total_stats['web_claims_removed']}")
    print(f"validated_entity_claims renamed: {total_stats['validated_entity_claims_renamed']}")
    print(f"Errors: {total_stats['errors']}")
    print(f"Completed at: {datetime.now().isoformat()}")
    print()

    if args.dry_run:
        print("[DRY RUN] No changes were made. Run without --dry-run to apply changes.")
|
|
|
|
|
|
# Script entry point: run the cleanup only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|