#!/usr/bin/env python3
"""
Cleanup script for custodian YAML files.

This script:
1. DELETES `web_claims` sections - they contain noisy, incorrectly classified extractions
2. RENAMES `validated_entity_claims` to `web-enrichments`

Per project decision: web_claims contain page titles, navigation items, and other
non-entity data incorrectly classified as organization metadata.

NOTE: modified files are round-tripped through PyYAML (load + dump), so YAML
comments and the original quoting/layout are not preserved in rewritten files.
Files that need no change are never rewritten.
"""

import argparse
import os
import sys
from datetime import datetime
from pathlib import Path

import yaml

# Default location of the custodian YAML files; override with --custodian-dir.
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')


def str_representer(dumper, data):
    """Represent multi-line strings as literal block scalars (``|``).

    Single-line strings fall through to PyYAML's default scalar style.
    """
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


# Registered on PyYAML's default Dumper, which is the one yaml.dump() uses below.
yaml.add_representer(str, str_representer)


def cleanup_custodian_file(filepath: Path, dry_run: bool = False) -> dict:
    """
    Clean up a single custodian YAML file.

    Removes the top-level ``web_claims`` key and renames the top-level
    ``validated_entity_claims`` key to ``web-enrichments``. Nested occurrences
    (e.g. inside ``digital_platform_v2``) are deliberately left alone.

    Args:
        filepath: Path of the YAML file to clean.
        dry_run: When True, report what would change without writing the file.

    Returns:
        Dict with modification stats:
        ``web_claims_removed`` (bool), ``validated_entity_claims_renamed``
        (bool) and ``error`` (``None`` or the exception message). Exceptions
        are never propagated; they are reported through ``error``.
    """
    stats = {
        'web_claims_removed': False,
        'validated_entity_claims_renamed': False,
        'error': None
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Empty documents (None) and non-mapping documents (lists, scalars)
        # have no top-level keys to clean up; leave them untouched.
        if not isinstance(data, dict):
            return stats

        # 1. Remove web_claims section (top-level only)
        if 'web_claims' in data:
            del data['web_claims']
            stats['web_claims_removed'] = True

        # 2. Rename validated_entity_claims to web-enrichments.
        # The renamed key is appended at the end of the mapping, and any
        # pre-existing 'web-enrichments' value is replaced.
        if 'validated_entity_claims' in data:
            data['web-enrichments'] = data.pop('validated_entity_claims')
            stats['validated_entity_claims_renamed'] = True

        modified = stats['web_claims_removed'] or stats['validated_entity_claims_renamed']

        # Write back if modified
        if modified and not dry_run:
            # Serialize BEFORE opening the file for writing: opening with 'w'
            # truncates immediately, so a failure inside yaml.dump would
            # otherwise leave an empty or half-written file behind.
            rendered = yaml.dump(data, default_flow_style=False, allow_unicode=True,
                                 sort_keys=False, width=120)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(rendered)

    except Exception as e:
        # Per-file boundary: record the failure so the batch can continue.
        stats['error'] = str(e)

    return stats


def main():
    """Command-line entry point: clean every ``*.yaml`` file in the custodian directory.

    Prints per-file details (with ``--verbose``), errors, a progress line every
    500 files, and a final summary. Exits with status 1 if the directory does
    not exist.
    """
    parser = argparse.ArgumentParser(description='Clean up web_claims from custodian YAML files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Print each file processed')
    parser.add_argument('--custodian-dir', type=Path, default=DEFAULT_CUSTODIAN_DIR,
                        help='Directory containing custodian YAML files (default: %(default)s)')
    args = parser.parse_args()

    # A negative limit would silently slice files off the END of the list.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    custodian_dir = args.custodian_dir

    if not custodian_dir.is_dir():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    # Sorted so that --limit selects the same files on every run (glob order
    # is arbitrary), making a dry run comparable with the real run.
    yaml_files = sorted(custodian_dir.glob('*.yaml'))

    # `is not None` so that --limit 0 means "no files" rather than "all files".
    if args.limit is not None:
        yaml_files = yaml_files[:args.limit]

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing {len(yaml_files)} YAML files...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    total_stats = {
        'files_processed': 0,
        'web_claims_removed': 0,
        'validated_entity_claims_renamed': 0,
        'errors': 0
    }

    for filepath in yaml_files:
        stats = cleanup_custodian_file(filepath, dry_run=args.dry_run)

        total_stats['files_processed'] += 1

        if stats['web_claims_removed']:
            total_stats['web_claims_removed'] += 1
            if args.verbose:
                print(f"  Removed web_claims: {filepath.name}")

        if stats['validated_entity_claims_renamed']:
            total_stats['validated_entity_claims_renamed'] += 1
            if args.verbose:
                print(f"  Renamed validated_entity_claims: {filepath.name}")

        if stats['error']:
            total_stats['errors'] += 1
            print(f"  ERROR in {filepath.name}: {stats['error']}")

        # Progress indicator
        if total_stats['files_processed'] % 500 == 0:
            print(f"  ... processed {total_stats['files_processed']} files")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"web_claims sections removed: {total_stats['web_claims_removed']}")
    print(f"validated_entity_claims renamed: {total_stats['validated_entity_claims_renamed']}")
    print(f"Errors: {total_stats['errors']}")
    print()

    if args.dry_run:
        print("[DRY RUN] No changes were made. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()