#!/usr/bin/env python3
"""
Clean up low-quality entities from custodian validated_entity_claims sections.

This script removes:
1. Language codes (nl-NL, en-US, etc.) - HTML lang attributes, not real entities
2. Generic navigation labels (Home, Menu, Contact, etc.)
3. Numeric-only entities (image dimensions, years without context)
4. Single/double character entities
5. Common stopwords extracted as entities

Removed entities are archived for audit purposes.

Usage:
    python scripts/cleanup_entities.py --dry-run   # Preview changes
    python scripts/cleanup_entities.py             # Apply changes
    python scripts/cleanup_entities.py --verbose   # Show details
"""

import os
import re
import glob
import json
import argparse
from datetime import datetime, timezone
from collections import Counter, defaultdict
from typing import Optional

# ============================================================================
# CLEANUP RULES - Add new patterns here
# ============================================================================

# Language code patterns (HTML lang attributes).
# They are matched case-insensitively by is_language_code().
LANGUAGE_CODE_PATTERNS = [
    r'^[a-z]{2}[-_][A-Z]{2}$',  # nl-NL, en-US, de-DE
    r'^[a-z]{2}_[a-z]{2}$',     # nl_nl, en_us
    r'^[a-z]{2}$',              # nl, en, de, fr (when alone)
]

# Two-letter codes that are NOT language codes (keep these)
KEEP_TWO_LETTER = {
    'eu',  # European Union
}

# Generic navigation/UI labels (case-insensitive)
GENERIC_LABELS = {
    # Navigation
    'home', 'menu', 'contact', 'over', 'about', 'search', 'zoeken',
    'terug', 'back', 'next', 'vorige', 'volgende', 'more', 'meer',
    # Common sections
    'nieuws', 'news', 'agenda', 'events', 'evenementen', 'blog',
    'login', 'logout', 'inloggen', 'uitloggen', 'registreren',
    'cookie', 'cookies', 'privacy', 'disclaimer', 'terms',
    # Generic content labels
    'lees meer', 'read more', 'bekijk', 'view', 'download',
    'share', 'delen', 'print', 'email', 'e-mail',
    # Social media generic
    'twitter', 'facebook', 'instagram', 'linkedin', 'youtube',
    'social media', 'sociale media', 'volg ons', 'follow us',
    # Site elements
    'header', 'footer', 'sidebar', 'main', 'content',
    'skip to content', 'ga naar inhoud',
}

# Numeric-only patterns (image dimensions, isolated numbers)
NUMERIC_PATTERNS = [
    r"^'?\d+'?$",  # '2025', '1200', 800
    r'^\d+x\d+$',  # 1920x1080
    r'^\d+px$',    # 100px
    r'^\d+%$',     # 50%
]

# Entity types that should be filtered more aggressively
LOW_VALUE_TYPES = {
    'QTY.MSR',  # Measurements (often image dimensions)
    'QTY.CNT',  # Counts without context
}

# Minimum entity length (after normalization)
MIN_ENTITY_LENGTH = 3

# Maximum occurrences to consider "too generic" (appears in >X files)
# Entities like "nl-NL" appear 500+ times - clearly metadata, not content
# NOTE: not referenced by any rule in this file at the moment.
MAX_GENERIC_OCCURRENCES = 100

# ============================================================================
# YAML SECTION HANDLING
# ============================================================================

# One shared definition of "the validated_entity_claims section": from the key
# up to (not including) the newline that precedes the next column-0
# lower-case key, or to the end of the text.
_CLAIMS_SECTION_RE = re.compile(
    r'(validated_entity_claims:.*?)(?=\n[a-z_]+:|\Z)',
    re.DOTALL,
)

# The nested "claims:" key on a line of its own. Anchoring to the start of a
# line keeps it from matching the tail of "validated_entity_claims:".
_CLAIMS_LIST_RE = re.compile(
    r'^[ \t]*claims:[ \t]*\n(.*)',
    re.DOTALL | re.MULTILINE,
)

# Start of each list item; indentation-tolerant so differently indented files
# are still split into one block per claim.
_CLAIM_SPLIT_RE = re.compile(r'\n[ \t]*- entity:')

# Numeric score fields parsed as floats.
_SCORE_FIELDS = (
    'base_confidence', 'layout_score', 'pattern_score', 'final_confidence',
)

# Per-claim fields written after "entity", in output order, when present.
_OPTIONAL_CLAIM_FIELDS = (
    'entity_type', 'xpath', 'base_confidence', 'layout_score',
    'pattern_score', 'final_confidence', 'layout_match',
)


def normalize_entity(name: str) -> str:
    """Normalize entity name for comparison.

    Lower-cases, trims surrounding whitespace and quote characters, and
    collapses internal runs of whitespace to single spaces.
    """
    norm = name.lower().strip()
    norm = norm.strip("'\"")
    return ' '.join(norm.split())


def is_language_code(entity: str) -> bool:
    """Check if entity is a language code.

    Entries of KEEP_TWO_LETTER are never treated as language codes.
    """
    if normalize_entity(entity) in KEEP_TWO_LETTER:
        return False
    return any(
        re.match(pattern, entity, re.IGNORECASE)
        for pattern in LANGUAGE_CODE_PATTERNS
    )


def is_generic_label(entity: str) -> bool:
    """Check if entity is a generic navigation/UI label."""
    return normalize_entity(entity) in GENERIC_LABELS


def is_numeric_only(entity: str) -> bool:
    """Check if entity is numeric-only (surrounding quotes are ignored)."""
    bare = entity.strip("'\"")
    return any(re.match(pattern, bare) for pattern in NUMERIC_PATTERNS)


def is_too_short(entity: str) -> bool:
    """Check if entity is too short to be meaningful.

    Whitelisted two-letter codes (KEEP_TWO_LETTER) are exempt; without this
    exemption they would pass the language-code check only to be dropped
    here, because their length is below MIN_ENTITY_LENGTH.
    """
    norm = normalize_entity(entity)
    if norm in KEEP_TWO_LETTER:
        return False
    return len(norm) < MIN_ENTITY_LENGTH


def should_filter_entity(entity_name: str, entity_type: str) -> tuple[bool, str]:
    """
    Determine if an entity should be filtered out.

    Rules are checked in order: language code, generic label, numeric only,
    too short, then low-value entity types.

    Returns:
        (should_filter, reason) - reason is "" when the entity is kept.
    """
    if is_language_code(entity_name):
        return True, "language_code"

    if is_generic_label(entity_name):
        return True, "generic_label"

    if is_numeric_only(entity_name):
        return True, "numeric_only"

    if is_too_short(entity_name):
        return True, "too_short"

    # Low-value types with generic content.
    # Keep measurements if they have context (e.g., "150 cm").
    # The bare-number pattern below is also the first NUMERIC_PATTERNS entry,
    # so this branch only fires if that list is narrowed.
    if entity_type in LOW_VALUE_TYPES:
        if re.match(r"^'?\d+'?$", entity_name.strip("'\"")):
            return True, f"low_value_type_{entity_type}"

    return False, ""


def extract_validated_claims_section(content: str) -> Optional[str]:
    """Extract the validated_entity_claims section from YAML content.

    Returns:
        The section text (from the key up to the next column-0 key or the
        end of the content), or None if the key does not occur.
    """
    match = _CLAIMS_SECTION_RE.search(content)
    return match.group(1) if match else None


def parse_claims_from_section(section: str) -> list[dict]:
    """Parse individual claims from the claims section.

    Parsing is regex based. String values are kept as raw text exactly as
    they appear after "key: ", so they can be written back unchanged; score
    fields are converted to float.

    Returns:
        One dict per claim that has an "entity" field; an empty list when
        the section has no "claims:" line.
    """
    claims: list[dict] = []

    claims_match = _CLAIMS_LIST_RE.search(section)
    if not claims_match:
        return claims

    claim_blocks = _CLAIM_SPLIT_RE.split(claims_match.group(1))

    for i, block in enumerate(claim_blocks):
        if not block.strip():
            continue

        # The split consumed the "- entity:" prefix of every block but the
        # first; put it back so the field regexes below apply uniformly.
        if i > 0:
            block = "- entity:" + block

        claim: dict = {}

        entity_match = re.search(r'entity: (.+)', block)
        if entity_match:
            claim['entity'] = entity_match.group(1).strip()

        type_match = re.search(r'entity_type: (\S+)', block)
        if type_match:
            claim['entity_type'] = type_match.group(1).strip()

        xpath_match = re.search(r'xpath: (.+)', block)
        if xpath_match:
            claim['xpath'] = xpath_match.group(1).strip()

        for field in _SCORE_FIELDS:
            score_match = re.search(rf'{field}: ([\d.]+)', block)
            if score_match:
                claim[field] = float(score_match.group(1))

        layout_match = re.search(r'layout_match: (.+)', block)
        if layout_match:
            claim['layout_match'] = layout_match.group(1).strip()

        if 'entity' in claim:
            claims.append(claim)

    return claims


def rebuild_claims_yaml(claims: list[dict], metadata: dict) -> str:
    """Rebuild the validated_entity_claims YAML section.

    Metadata keys are indented two spaces; each claim is a "  - entity:"
    list item with its remaining fields indented four spaces so they belong
    to that item. Only the fields known to the parser are written, and
    optional fields are emitted only when present on the claim.

    Returns:
        The section text without a trailing newline.
    """
    lines = [
        'validated_entity_claims:',
        f"  extraction_timestamp: '{metadata.get('extraction_timestamp', '')}'",
        f"  extraction_method: {metadata.get('extraction_method', 'hybrid_llm_pattern_layout_v1')}",
        f"  confidence_threshold: {metadata.get('confidence_threshold', 0.6)}",
        f"  entities_count: {len(claims)}",
        f"  cleanup_applied: '{datetime.now(timezone.utc).isoformat()}'",
        '  claims:',
    ]

    for claim in claims:
        lines.append(f"  - entity: {claim['entity']}")
        for field in _OPTIONAL_CLAIM_FIELDS:
            if field in claim:
                lines.append(f"    {field}: {claim[field]}")

    return '\n'.join(lines)


def extract_metadata_from_section(section: str) -> dict:
    """Extract metadata fields from the validated_entity_claims section.

    Returns:
        A dict with any of extraction_timestamp, extraction_method and
        confidence_threshold (float) that were found.
    """
    metadata: dict = {}

    ts_match = re.search(r"extraction_timestamp: '([^']+)'", section)
    if ts_match:
        metadata['extraction_timestamp'] = ts_match.group(1)

    method_match = re.search(r'extraction_method: (\S+)', section)
    if method_match:
        metadata['extraction_method'] = method_match.group(1)

    thresh_match = re.search(r'confidence_threshold: ([\d.]+)', section)
    if thresh_match:
        metadata['confidence_threshold'] = float(thresh_match.group(1))

    return metadata


def process_file(filepath: str, dry_run: bool = True, verbose: bool = False) -> dict:
    """
    Process a single custodian file and clean up entities.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When True the file is analysed but never written.
        verbose: When True a per-file summary is printed for files that
            have removals.

    Returns:
        dict with processing results: file, original_count, filtered_count,
        removed (list of {entity, type, reason}), kept, modified, error.
        "modified" is True whenever something would be removed, also in a
        dry run. Any exception is caught and reported in "error".
    """
    result = {
        'file': os.path.basename(filepath),
        'original_count': 0,
        'filtered_count': 0,
        'removed': [],
        'kept': 0,
        'modified': False,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Cheap pre-check before running the section regex.
        if 'validated_entity_claims:' not in content:
            return result

        section = extract_validated_claims_section(content)
        if not section:
            return result

        claims = parse_claims_from_section(section)
        result['original_count'] = len(claims)
        if not claims:
            return result

        metadata = extract_metadata_from_section(section)

        kept_claims = []
        removed_claims = []
        for claim in claims:
            entity_name = claim.get('entity', '')
            entity_type = claim.get('entity_type', '')
            should_filter, reason = should_filter_entity(entity_name, entity_type)
            if should_filter:
                removed_claims.append({
                    'entity': entity_name,
                    'type': entity_type,
                    'reason': reason,
                })
            else:
                kept_claims.append(claim)

        result['filtered_count'] = len(removed_claims)
        result['removed'] = removed_claims
        result['kept'] = len(kept_claims)

        # If nothing to remove, leave the file untouched.
        if not removed_claims:
            return result

        result['modified'] = True

        if verbose:
            print(f"\n {result['file']}:")
            print(f" Original: {result['original_count']}, Kept: {result['kept']}, Removed: {result['filtered_count']}")
            for r in removed_claims[:5]:
                print(f" - {r['entity'][:40]} ({r['type']}) -> {r['reason']}")
            if len(removed_claims) > 5:
                print(f" ... and {len(removed_claims) - 5} more")

        if not dry_run:
            new_section = rebuild_claims_yaml(kept_claims, metadata)

            # A function replacement inserts the text literally; a string
            # replacement would interpret backslashes in entity/xpath values
            # as regex escapes or group references. count=1 limits the
            # change to the first section, the one that was parsed above.
            new_content = _CLAIMS_SECTION_RE.sub(
                lambda _m: new_section + '\n',
                content,
                count=1,
            )

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(new_content)

    except Exception as e:
        # Per-file boundary: report the failure and let the run continue.
        result['error'] = str(e)

    return result


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the cleanup script."""
    parser = argparse.ArgumentParser(
        description='Clean up low-quality entities from custodian files'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Show detailed output for each file'
    )
    parser.add_argument(
        '--limit', type=int, default=None,
        help='Process only N files (for testing)'
    )
    parser.add_argument(
        '--pattern', type=str, default='data/custodian/NL-*.yaml',
        help='Glob pattern for files to process'
    )
    return parser


def _print_summary(stats: dict) -> None:
    """Print the aggregated run statistics."""
    print("\n" + "=" * 70)
    print("CLEANUP SUMMARY")
    print("=" * 70)

    print("\nFiles:")
    print(f" Total processed: {stats['total_files']:,}")
    print(f" With claims: {stats['files_with_claims']:,}")
    print(f" Modified: {stats['files_modified']:,}")

    # max(1, ...) avoids division by zero when no claims were found.
    denominator = max(1, stats['total_original'])
    print("\nEntities:")
    print(f" Original total: {stats['total_original']:,}")
    print(f" Removed: {stats['total_removed']:,} ({100*stats['total_removed']/denominator:.1f}%)")
    print(f" Kept: {stats['total_kept']:,} ({100*stats['total_kept']/denominator:.1f}%)")

    print("\nRemoval reasons:")
    for reason, count in stats['removal_reasons'].most_common():
        print(f" {reason:20s}: {count:,}")

    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for fname, error in stats['errors'][:5]:
            print(f" {fname}: {error}")


def main():
    """Run the cleanup over all files matching --pattern.

    Prints progress and a summary, and writes a JSON archive of every
    removed entity to reports/ (also in dry-run mode).
    """
    args = _build_arg_parser().parse_args()

    print("=" * 70)
    print("Entity Cleanup Script")
    print("=" * 70)

    if args.dry_run:
        print("\n[DRY RUN] - No files will be modified\n")

    files = sorted(glob.glob(args.pattern))
    # "is not None" so that an explicit --limit 0 is honoured.
    if args.limit is not None:
        files = files[:args.limit]

    print(f"Found {len(files)} files to process\n")

    stats = {
        'total_files': len(files),
        'files_with_claims': 0,
        'files_modified': 0,
        'total_original': 0,
        'total_removed': 0,
        'total_kept': 0,
        'removal_reasons': Counter(),
        'errors': [],
    }

    # Archive for removed entities
    archive = {
        'cleanup_date': datetime.now(timezone.utc).isoformat(),
        'dry_run': args.dry_run,
        'files_processed': len(files),
        'removed_entities': [],
    }

    for i, filepath in enumerate(files):
        if i % 200 == 0 and not args.verbose:
            print(f" Processing {i}/{len(files)}...")

        result = process_file(filepath, dry_run=args.dry_run, verbose=args.verbose)

        if result['error']:
            stats['errors'].append((result['file'], result['error']))
            continue

        if result['original_count'] > 0:
            stats['files_with_claims'] += 1
            stats['total_original'] += result['original_count']
            stats['total_kept'] += result['kept']
            stats['total_removed'] += result['filtered_count']

        if result['modified']:
            stats['files_modified'] += 1

        for removed in result['removed']:
            stats['removal_reasons'][removed['reason']] += 1
            archive['removed_entities'].append({
                'file': result['file'],
                'entity': removed['entity'],
                'type': removed['type'],
                'reason': removed['reason'],
            })

    _print_summary(stats)

    # Save archive
    archive_path = f"reports/entity_cleanup_archive_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    os.makedirs('reports', exist_ok=True)
    with open(archive_path, 'w', encoding='utf-8') as f:
        json.dump(archive, f, indent=2, ensure_ascii=False)
    print(f"\nArchive saved: {archive_path}")

    if args.dry_run:
        print("\n[DRY RUN] No files were modified. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()