146 lines
4.8 KiB
Python
146 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
Cleanup script for custodian YAML files.

This script:
1. DELETES top-level `web_claims` sections - they contain noisy, incorrectly classified extractions
2. RENAMES `validated_entity_claims` to `web-enrichments`

Per project decision: web_claims contain page titles, navigation items, and other
non-entity data incorrectly classified as organization metadata.
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
import yaml
|
|
from datetime import datetime
|
|
|
|
# Configure YAML to preserve formatting as much as possible
|
|
def str_representer(dumper, data):
    """Represent a string as a YAML scalar, using literal block style for multi-line text.

    A string containing at least one newline is emitted with the ``|`` style;
    any other string is emitted with the dumper's default scalar style.
    """
    # Only pass ``style`` when a block scalar is wanted, so single-line
    # strings keep whatever default the dumper applies.
    style_kwargs = {'style': '|'} if '\n' in data else {}
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, **style_kwargs)
|
|
|
|
# Register the representer for ``str`` on PyYAML's default Dumper (no Dumper
# argument is given), which is the dumper the plain ``yaml.dump`` call in this
# file uses when writing files back.
yaml.add_representer(str, str_representer)
|
|
|
|
|
|
def cleanup_custodian_file(filepath: Path, dry_run: bool = False) -> dict:
    """
    Clean up a single custodian YAML file.

    Removes the top-level ``web_claims`` section and renames the top-level
    ``validated_entity_claims`` section to ``web-enrichments``. The renamed
    section keeps its original position in the file. ``web_claims`` entries
    nested inside other structures (e.g. ``digital_platform_v2``) are left
    alone, as they may be valid.

    Args:
        filepath: Path of the YAML file to clean.
        dry_run: When True, report what would change but never write.

    Returns:
        Dict with modification stats: ``web_claims_removed`` (bool),
        ``validated_entity_claims_renamed`` (bool) and ``error`` (``None`` or
        the message of the exception that stopped processing). Exceptions are
        never propagated; they are reported through ``error``.
    """
    stats = {
        'web_claims_removed': False,
        'validated_entity_claims_renamed': False,
        'error': None
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)

        # Empty documents (None) and non-mapping documents (list, scalar)
        # have no top-level keys, so there is nothing to clean up.
        if not isinstance(data, dict):
            return stats

        cleaned, removed, renamed = _clean_custodian_mapping(data)
        stats['web_claims_removed'] = removed
        stats['validated_entity_claims_renamed'] = renamed

        # Write back if modified
        if (removed or renamed) and not dry_run:
            # Serialize fully BEFORE opening the file for writing: opening
            # with 'w' truncates it, so a failure inside yaml.dump would
            # otherwise leave an empty or half-written file behind.
            serialized = yaml.dump(
                cleaned,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120,
            )
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(serialized)

    except Exception as e:
        # Per-file boundary: report the failure to the caller instead of
        # aborting the whole batch run.
        stats['error'] = str(e)

    return stats


def _clean_custodian_mapping(data: dict) -> tuple:
    """Return ``(cleaned, web_claims_removed, renamed)`` for a top-level mapping.

    ``cleaned`` is a new dict with the same key order as ``data``, minus the
    ``web_claims`` key, and with ``validated_entity_claims`` renamed in place
    to ``web-enrichments``. ``data`` itself is not modified.

    Raises:
        ValueError: if both ``validated_entity_claims`` and ``web-enrichments``
            are present, because renaming would silently overwrite the
            existing ``web-enrichments`` section.
    """
    removed = 'web_claims' in data
    renamed = 'validated_entity_claims' in data

    if renamed and 'web-enrichments' in data:
        raise ValueError(
            "cannot rename 'validated_entity_claims': "
            "'web-enrichments' already exists in this file"
        )

    cleaned = {}
    for key, value in data.items():
        if key == 'web_claims':
            continue
        new_key = 'web-enrichments' if key == 'validated_entity_claims' else key
        cleaned[new_key] = value

    return cleaned, removed, renamed
|
|
|
|
|
|
def main(argv=None):
    """Command-line entry point: clean every ``*.yaml`` file in the custodian directory.

    Runs ``cleanup_custodian_file`` on each file, prints per-file messages
    (with ``--verbose``), a progress line every 500 files, and a summary.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) makes argparse read ``sys.argv``.

    Exits with status 1 when the custodian directory does not exist or when
    at least one file could not be processed; returns ``None`` otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Clean up web_claims from custodian YAML files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Print each file processed')
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing the custodian YAML files (default: %(default)s)',
    )
    args = parser.parse_args(argv)

    # A negative limit would slice from the end of the list, which is never
    # what "process at most N files" means.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    custodian_dir = args.custodian_dir

    # is_dir() rather than exists(): a regular file at this path would pass
    # exists() and then silently yield zero YAML files.
    if not custodian_dir.is_dir():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    # glob() order is unspecified; sort so that --limit selects the same
    # files on every run (a dry run and the real run must see the same set).
    yaml_files = sorted(custodian_dir.glob('*.yaml'))

    # "is not None" so that an explicit --limit 0 processes zero files
    # instead of being treated as "no limit".
    if args.limit is not None:
        yaml_files = yaml_files[:args.limit]

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing {len(yaml_files)} YAML files...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    total_stats = {
        'files_processed': 0,
        'web_claims_removed': 0,
        'validated_entity_claims_renamed': 0,
        'errors': 0
    }

    for filepath in yaml_files:
        stats = cleanup_custodian_file(filepath, dry_run=args.dry_run)
        total_stats['files_processed'] += 1

        if stats['web_claims_removed']:
            total_stats['web_claims_removed'] += 1
            if args.verbose:
                print(f"  Removed web_claims: {filepath.name}")

        if stats['validated_entity_claims_renamed']:
            total_stats['validated_entity_claims_renamed'] += 1
            if args.verbose:
                print(f"  Renamed validated_entity_claims: {filepath.name}")

        if stats['error']:
            total_stats['errors'] += 1
            print(f"  ERROR in {filepath.name}: {stats['error']}")

        # Progress indicator
        if total_stats['files_processed'] % 500 == 0:
            print(f"  ... processed {total_stats['files_processed']} files")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"web_claims sections removed: {total_stats['web_claims_removed']}")
    print(f"validated_entity_claims renamed: {total_stats['validated_entity_claims_renamed']}")
    print(f"Errors: {total_stats['errors']}")
    print()

    if args.dry_run:
        print("[DRY RUN] No changes were made. Run without --dry-run to apply changes.")

    # Surface per-file failures to the calling shell; all files have still
    # been attempted by this point.
    if total_stats['errors']:
        sys.exit(1)
|
|
|
|
|
|
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|