glam/scripts/cleanup_web_claims.py
2025-12-21 00:01:54 +01:00

146 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""
Cleanup script for custodian YAML files.
This script:
1. DELETES `web_claims` sections - they contain noisy, incorrectly classified extractions
2. RENAMES `validated_entity_claims` to `web-enrichments`
Per project decision: web_claims contain page titles, navigation items, and other
non-entity data incorrectly classified as organization metadata.
"""
import os
import sys
from pathlib import Path
import yaml
from datetime import datetime
# Configure YAML to preserve formatting as much as possible
def str_representer(dumper, data):
    """Represent a string as a YAML scalar, keeping multi-line text readable.

    Strings containing a newline are emitted in literal block style (``|``);
    every other string is emitted with the dumper's default scalar style.

    Args:
        dumper: Object exposing ``represent_scalar(tag, value, style=...)``.
        data: The string to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the string.
    """
    str_tag = 'tag:yaml.org,2002:str'
    # Single-line strings need no special style; let the dumper decide.
    if '\n' not in data:
        return dumper.represent_scalar(str_tag, data)
    # Multi-line text: request literal block style so line breaks stay visible.
    return dumper.represent_scalar(str_tag, data, style='|')
# Register str_representer with PyYAML as the representer for `str` values.
yaml.add_representer(str, str_representer)
def cleanup_custodian_file(filepath: Path, dry_run: bool = False) -> dict:
    """
    Clean up a single custodian YAML file.

    Two top-level keys are handled:

    1. ``web_claims`` is deleted.
    2. ``validated_entity_claims`` is renamed to ``web-enrichments``.

    Only top-level keys are touched; ``web_claims`` entries nested inside
    other structures (such as ``digital_platform_v2``) are preserved.

    The file is rewritten only when something changed and ``dry_run`` is
    false. The new content is written to a temporary sibling file and then
    swapped in with ``os.replace`` so a failed dump cannot leave the original
    file truncated.

    Args:
        filepath: Path of the YAML file to clean.
        dry_run: When true, report what would change without writing.

    Returns:
        Dict with keys ``web_claims_removed`` (bool),
        ``validated_entity_claims_renamed`` (bool) and ``error`` (``None`` or
        the message of the exception that stopped processing). The two flags
        are set only when the change was applied (or, in a dry run, would be
        applied); on error they stay ``False`` and the file is left as it was.
        Exceptions derived from ``Exception`` are never propagated.
    """
    stats = {
        'web_claims_removed': False,
        'validated_entity_claims_renamed': False,
        'error': None
    }
    # Accept plain string paths as well; the temp-file logic needs a Path.
    filepath = Path(filepath)

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Empty documents and non-mapping roots (lists, scalars) have no
        # top-level keys, so there is nothing to clean up.
        if not isinstance(data, dict):
            return stats

        has_web_claims = 'web_claims' in data
        has_validated_claims = 'validated_entity_claims' in data

        # Renaming onto an existing key would silently discard its content.
        # Checked before any mutation so the file is left completely untouched.
        if has_validated_claims and 'web-enrichments' in data:
            raise ValueError(
                "both 'validated_entity_claims' and 'web-enrichments' are present; "
                "refusing to overwrite existing 'web-enrichments'"
            )

        # 1. Remove web_claims section (top-level only)
        if has_web_claims:
            del data['web_claims']

        # 2. Rename validated_entity_claims to web-enrichments
        if has_validated_claims:
            data['web-enrichments'] = data.pop('validated_entity_claims')

        # Write back if modified
        if (has_web_claims or has_validated_claims) and not dry_run:
            # The '.tmp' suffix keeps the temp file out of '*.yaml' globs.
            tmp_path = filepath.with_name(filepath.name + '.tmp')
            try:
                with open(tmp_path, 'w', encoding='utf-8') as f:
                    yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120)
                os.replace(tmp_path, filepath)
            finally:
                # After a successful replace the temp file no longer exists;
                # after a failure, remove the partial output.
                if tmp_path.exists():
                    tmp_path.unlink()

        # Report changes only once they have been applied (or simulated).
        stats['web_claims_removed'] = has_web_claims
        stats['validated_entity_claims_renamed'] = has_validated_claims
    except Exception as e:
        stats['error'] = str(e)
    return stats
def main(argv=None):
    """Command-line entry point: clean every ``*.yaml`` file in the custodian directory.

    Parses the command line, runs ``cleanup_custodian_file`` on each YAML file
    directly inside the custodian directory (no recursion), and prints
    per-file messages, progress lines, and a final summary.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Exits with status 1 when the custodian directory does not exist, and
    with argparse's usage error when ``--limit`` is negative.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Clean up web_claims from custodian YAML files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Print each file processed')
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing custodian YAML files (default: %(default)s)',
    )
    args = parser.parse_args(argv)

    # A negative slice bound would silently drop files from the end instead.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    # Sort so that --limit selects the same files on every run; glob order
    # is otherwise filesystem-dependent.
    yaml_files = sorted(custodian_dir.glob('*.yaml'))
    # Compare against None so that an explicit `--limit 0` means "no files".
    if args.limit is not None:
        yaml_files = yaml_files[:args.limit]

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing {len(yaml_files)} YAML files...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    total_stats = {
        'files_processed': 0,
        'web_claims_removed': 0,
        'validated_entity_claims_renamed': 0,
        'errors': 0
    }

    for filepath in yaml_files:
        stats = cleanup_custodian_file(filepath, dry_run=args.dry_run)
        total_stats['files_processed'] += 1

        if stats['web_claims_removed']:
            total_stats['web_claims_removed'] += 1
            if args.verbose:
                print(f" Removed web_claims: {filepath.name}")

        if stats['validated_entity_claims_renamed']:
            total_stats['validated_entity_claims_renamed'] += 1
            if args.verbose:
                print(f" Renamed validated_entity_claims: {filepath.name}")

        # Errors are always shown, regardless of --verbose.
        if stats['error']:
            total_stats['errors'] += 1
            print(f" ERROR in {filepath.name}: {stats['error']}")

        # Progress indicator
        if total_stats['files_processed'] % 500 == 0:
            print(f" ... processed {total_stats['files_processed']} files")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"web_claims sections removed: {total_stats['web_claims_removed']}")
    print(f"validated_entity_claims renamed: {total_stats['validated_entity_claims_renamed']}")
    print(f"Errors: {total_stats['errors']}")
    print()
    if args.dry_run:
        print("[DRY RUN] No changes were made. Run without --dry-run to apply changes.")
# Run the cleanup only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()