#!/usr/bin/env python3
"""
Fast extraction of URLs from fabricated web_enrichment files.

Scans custodian YAML files with regex only - no YAML parsing. A
web_enrichment block is considered "fabricated" when it carries
confidence scores but no xpath selectors; such files are collected
into a JSON re-enrichment queue.
"""
import os
import re
import json
import subprocess
from pathlib import Path
from datetime import datetime

# Patterns compiled once at module level instead of per file.
# A web_enrichment block runs from its top-level key to the next
# top-level (lowercase snake_case) key, or to end of file.
_BLOCK_RE = re.compile(r'^web_enrichment:.*?(?=^[a-z_]+:|\Z)',
                       re.MULTILINE | re.DOTALL)
_HAS_KEY_RE = re.compile(r'^web_enrichment:', re.MULTILINE)
_CONFIDENCE_RE = re.compile(r'confidence:\s*0\.\d+')
_SOURCE_URL_RE = re.compile(r'source_url:\s*[\'"]?([^\s\'"]+)')


def main(custodian_dir='/Users/kempersc/apps/glam/data/custodian',
         output_path='/Users/kempersc/apps/glam/data/reenrich_queue.json'):
    """Find fabricated web_enrichment blocks and write a re-enrichment queue.

    Args:
        custodian_dir: Directory containing the ``*.yaml`` custodian files.
        output_path: Path of the JSON queue file to write.

    Side effects: prints a summary report to stdout and writes
    ``output_path``. Defaults preserve the original hard-coded paths.
    """
    custodian_dir = Path(custodian_dir)
    output_path = Path(output_path)

    print("Finding files with fabricated web_enrichment...")

    # Scan *.yaml files directly rather than shelling out to
    # `grep -l "^web_enrichment:" *.yaml` with shell=True: this is
    # portable, avoids reading each matching file twice (grep + open),
    # and never hits the shell's argument-list limit. The sorted glob
    # keeps the processing order deterministic, like the shell glob.
    fabricated = []
    valid = []
    found = 0  # files containing a top-level web_enrichment key
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            # Name the failing file (the original message dropped it).
            print(f"Error reading {filepath}: {e}")
            continue

        if not _HAS_KEY_RE.search(content):
            continue
        found += 1

        match = _BLOCK_RE.search(content)
        if not match:
            continue
        block = match.group(0)

        has_xpath = 'xpath:' in block
        has_confidence = bool(_CONFIDENCE_RE.search(block))

        if has_confidence and not has_xpath:
            # Fabricated: confidence scores with no xpath provenance.
            url_match = _SOURCE_URL_RE.search(block)
            fabricated.append({
                'ghcid': filepath.stem,
                'filepath': str(filepath),
                'url': url_match.group(1) if url_match else None,
                'claims_count': block.count('claim_type:'),
            })
        else:
            valid.append(filepath.name)

    print(f"Found {found} files with web_enrichment")

    print(f"\n{'=' * 60}")
    print("RESULTS")
    print('=' * 60)
    print(f"Files with proper xpath: {len(valid)}")
    print(f"FABRICATED (no xpath): {len(fabricated)}")
    print(f"Total fabricated claims: "
          f"{sum(f['claims_count'] for f in fabricated)}")

    # Split by whether a source_url was recoverable from the block.
    with_url = [f for f in fabricated if f['url']]
    without_url = [f for f in fabricated if not f['url']]
    print(f"\nFiles with URL for re-enrichment: {len(with_url)}")
    print(f"Files without URL (need lookup): {len(without_url)}")

    output = {
        'generated_at': datetime.now().isoformat(),
        'total_fabricated': len(fabricated),
        'with_url': len(with_url),
        'without_url': len(without_url),
        'queue': with_url,
        'missing_url': [f['ghcid'] for f in without_url],
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved to: {output_path}")

    print("\nSample URLs to re-enrich:")
    for item in with_url[:10]:
        print(f"  {item['ghcid']}: {item['url']}")


if __name__ == '__main__':
    main()