#!/usr/bin/env python3
"""
Re-enrich files with fabricated web_enrichment sections.

Per Rule 6 in AGENTS.md: "Every claim extracted from a webpage MUST have an
XPath pointer to the exact location in archived HTML where that value appears."

This script:
1. Identifies files with fabricated web_enrichment (no xpath)
2. Extracts the source_url from each
3. Creates a list of URLs to re-fetch and re-enrich with proper xpath provenance
"""

import argparse
import json
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

# Default locations; the custodian directory can be overridden on the command
# line, and a relative --output is resolved against PROJECT_ROOT.
PROJECT_ROOT = Path('/Users/kempersc/apps/glam')
CUSTODIAN_DIR = PROJECT_ROOT / 'data' / 'custodian'

# Compiled once because they are applied to every analysed file.
_CONFIDENCE_RE = re.compile(r'confidence:\s*0\.\d+')
_SOURCE_URL_RE = re.compile(r'source_url:\s*([^\n]+)')


def has_xpath_in_block(block_content: str) -> bool:
    """Check if a block contains xpath provenance."""
    return 'xpath:' in block_content


def has_vague_confidence(block_content: str) -> bool:
    """Check if block has vague confidence scores (no xpath but has confidence).

    Returns True only when the text contains a ``confidence: 0.<digits>``
    entry and no ``xpath:`` entry at all.
    """
    has_confidence = _CONFIDENCE_RE.search(block_content) is not None
    return has_confidence and not has_xpath_in_block(block_content)


def _starts_new_top_level_key(line: str) -> bool:
    """Return True if *line* looks like the next top-level ``key: value`` entry.

    Blank lines, indented lines and comments never end a block.  Lines that
    start with ``-`` are sequence items; a YAML block sequence may sit at the
    same indentation as its parent key, so at column 0 such a line still
    belongs to the block being extracted rather than starting a new key.
    """
    return bool(line) and not line.startswith((' ', '#', '-')) and ':' in line


def extract_block(content: str, key: str) -> tuple[int, int, str]:
    """
    Extract a top-level YAML block by key.

    The block runs from the line starting with ``<key>:`` at column 0 up to
    (but not including) the next top-level key, or to the end of the text.
    This is a line-based scan, not a YAML parse.

    Returns (start_line, end_line, block_content) or (-1, -1, '') if not found.
    Line numbers are 0-based and end_line is exclusive.
    """
    lines = content.split('\n')
    header = f'{key}:'
    start_idx = -1

    for i, line in enumerate(lines):
        if start_idx < 0:
            if line.startswith(header):
                start_idx = i
            continue
        if _starts_new_top_level_key(line):
            return start_idx, i, '\n'.join(lines[start_idx:i])

    if start_idx >= 0:
        # The block is the last one in the file.
        return start_idx, len(lines), '\n'.join(lines[start_idx:])
    return -1, -1, ''


def _lookup_website(content: str) -> Optional[str]:
    """Return a website recorded elsewhere in the document, if any.

    ``google_maps_enrichment.website`` is preferred over
    ``original_entry.website``.  Documents whose root (or whose section) is
    not a mapping simply yield None.

    Raises whatever ``yaml.safe_load`` raises for malformed YAML.
    """
    # Imported here because this fallback is the only place PyYAML is needed;
    # the text-based helpers above stay usable without it.
    import yaml

    parsed = yaml.safe_load(content)
    if not isinstance(parsed, dict):
        return None
    for section in ('google_maps_enrichment', 'original_entry'):
        entry = parsed.get(section)
        if isinstance(entry, dict) and entry.get('website'):
            return entry['website']
    return None


def analyze_file(filepath: Path) -> dict:
    """Analyze a single file for problematic web_enrichment.

    Returns a dict describing the file.  ``is_fabricated`` is True when the
    ``web_enrichment`` block carries confidence scores but no xpath.  If
    anything goes wrong (unreadable file, malformed YAML, ...) the message is
    stored under ``'error'`` and the fields computed so far are left in place.
    """
    result = {
        'filepath': str(filepath),
        'filename': filepath.name,
        'has_web_enrichment': False,
        'has_vague_confidence': False,
        'has_xpath': False,
        'is_fabricated': False,
        'claims_count': 0,
        'source_url': None,
        'website': None,
        'ghcid': filepath.stem,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Extract web_enrichment block
        start, _end, block = extract_block(content, 'web_enrichment')
        if start >= 0:
            result['has_web_enrichment'] = True
            result['has_xpath'] = has_xpath_in_block(block)
            result['has_vague_confidence'] = has_vague_confidence(block)
            result['claims_count'] = block.count('claim_type:')

            # Extract source_url from the block
            source_url_match = _SOURCE_URL_RE.search(block)
            if source_url_match:
                result['source_url'] = source_url_match.group(1).strip().strip("'\"")

            # has_vague_confidence already implies that no xpath is present.
            result['is_fabricated'] = result['has_vague_confidence']

        # Also try to get website from google_maps_enrichment or other sources
        result['website'] = _lookup_website(content)
    except Exception as e:
        # Per-file boundary: one bad file must not abort the whole batch.
        # The caller reports files that carry an 'error' entry.
        result['error'] = str(e)

    return result


def find_candidate_files(custodian_dir: Path) -> list[Path]:
    """Return the ``*.yaml`` files in *custodian_dir* with a web_enrichment key.

    A file qualifies when any of its lines starts with ``web_enrichment:``.
    The scan is done in-process, so it does not depend on a shell or on the
    operating system's command-line length limit.  Results are sorted by path.
    """
    marker = b'web_enrichment:'
    candidates = []
    for path in sorted(custodian_dir.glob('*.yaml')):
        # Skip dotfiles and anything that is not a regular file.
        if path.name.startswith('.') or not path.is_file():
            continue
        try:
            # Bytes mode: a file with invalid text encoding is still scanned.
            with open(path, 'rb') as f:
                if any(line.startswith(marker) for line in f):
                    candidates.append(path)
        except OSError:
            # Deliberately best-effort: an unreadable file is not a candidate.
            continue
    return candidates


def main(argv=None):
    """Build the re-enrichment queue and write it as JSON.

    *argv* is the argument list to parse; None means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Identify files needing web enrichment')
    parser.add_argument('--output', '-o', default='data/reenrich_queue.json',
                        help='Output file for re-enrichment queue')
    parser.add_argument('--custodian-dir', type=Path, default=CUSTODIAN_DIR,
                        help='Directory containing the custodian YAML files')
    parser.add_argument('--verbose', '-v', action='store_true', help='Print details')
    args = parser.parse_args(argv)

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        parser.error(f"custodian directory not found: {custodian_dir}")

    print("Analyzing web_enrichment sections...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # Find files with web_enrichment
    files_to_check = find_candidate_files(custodian_dir)

    print(f"Found {len(files_to_check)} files with web_enrichment sections")
    print()

    # Analyze all files
    fabricated_files = []
    valid_files = []
    error_files = []

    for filepath in files_to_check:
        analysis = analyze_file(filepath)

        if 'error' in analysis:
            error_files.append(analysis['filename'])
            if args.verbose:
                print(f"  ERROR in {filepath.name}: {analysis['error']}")
            continue

        if analysis['is_fabricated']:
            fabricated_files.append(analysis)
            if args.verbose:
                url = analysis['source_url'] or analysis['website'] or 'NO URL'
                print(f"  FABRICATED: {analysis['filename']} ({analysis['claims_count']} claims) -> {url}")
        else:
            valid_files.append(analysis)

    print("=" * 60)
    print("ANALYSIS RESULTS")
    print("=" * 60)
    print(f"Total files with web_enrichment: {len(files_to_check)}")
    print(f"  With proper xpath provenance: {len(valid_files)}")
    print(f"  FABRICATED (need re-enrichment): {len(fabricated_files)}")
    if error_files:
        # Without this line the three counts above would not add up.
        print(f"  Skipped due to errors (use --verbose for details): {len(error_files)}")
    print(f"Total fabricated claims: {sum(f['claims_count'] for f in fabricated_files)}")
    print()

    # Extract URLs for re-enrichment
    reenrich_queue = []
    no_url_files = []

    for f in fabricated_files:
        url = f['source_url'] or f['website']
        if url:
            reenrich_queue.append({
                'ghcid': f['ghcid'],
                'filepath': f['filepath'],
                'url': url,
                'claims_count': f['claims_count'],
            })
        else:
            no_url_files.append(f['filename'])

    print(f"Files with URLs to re-enrich: {len(reenrich_queue)}")
    print(f"Files without URLs (manual review): {len(no_url_files)}")

    if no_url_files and args.verbose:
        print("\nFiles without URLs:")
        for f in no_url_files[:10]:
            print(f"  - {f}")
        if len(no_url_files) > 10:
            print(f"  ... and {len(no_url_files) - 10} more")

    # Save queue
    output_path = PROJECT_ROOT / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)

    output_data = {
        'generated_at': datetime.now().isoformat(),
        'total_files': len(fabricated_files),
        'files_with_urls': len(reenrich_queue),
        'files_without_urls': no_url_files,
        'files_with_errors': error_files,
        'queue': reenrich_queue,
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nRe-enrichment queue saved to: {output_path}")
    print(f"Completed at: {datetime.now().isoformat()}")


if __name__ == '__main__':
    main()