glam/scripts/extract_reenrich_urls_fast.py
2025-12-21 22:12:34 +01:00

104 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""
Fast extraction of URLs from fabricated web_enrichment files.
Uses regex only - no YAML parsing.
"""
import os
import re
import json
import subprocess
from pathlib import Path
from datetime import datetime
def main(
    custodian_dir: Path = Path('/Users/kempersc/apps/glam/data/custodian'),
    output_path: Path = Path('/Users/kempersc/apps/glam/data/reenrich_queue.json'),
) -> None:
    """Scan custodian YAML files for fabricated ``web_enrichment`` blocks.

    A block counts as *fabricated* when it carries confidence scores
    (``confidence: 0.x``) but no ``xpath:`` evidence anchors. Files with a
    recoverable ``source_url`` go into a re-enrichment queue JSON written to
    *output_path*; files lacking a URL are listed for manual lookup.

    Args:
        custodian_dir: Directory of ``*.yaml`` files to scan (non-recursive).
        output_path: Destination path for the JSON queue file.
    """
    print("Finding files with fabricated web_enrichment...")

    # Pure-Python equivalent of `grep -l "^web_enrichment:" *.yaml`: avoids a
    # shell=True subprocess and reads each file exactly once (the original
    # grep-then-reopen did two passes). Sorted to mirror the shell glob order.
    top_key_re = re.compile(r'^web_enrichment:', re.MULTILINE)
    # The web_enrichment block runs from its key to the next top-level key or EOF.
    block_re = re.compile(r'^web_enrichment:.*?(?=^[a-z_]+:|\Z)',
                          re.MULTILINE | re.DOTALL)
    # NOTE: only fractional scores (0.x) match; a literal `confidence: 1.0`
    # would not count — heuristic preserved from the original.
    conf_re = re.compile(r'confidence:\s*0\.\d+')
    url_re = re.compile(r'source_url:\s*[\'"]?([^\s\'"]+)')

    fabricated = []   # dicts describing files with scores but no xpath evidence
    valid = []        # filenames whose enrichment carries xpath anchors
    enriched_count = 0
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        try:
            content = filepath.read_text(encoding='utf-8')
        except OSError as e:
            # Was: "Error reading (unknown)" — now names the failing file.
            print(f"Error reading {filepath.name}: {e}")
            continue
        if not top_key_re.search(content):
            continue
        enriched_count += 1
        match = block_re.search(content)
        if not match:
            continue
        block = match.group(0)
        if conf_re.search(block) and 'xpath:' not in block:
            url_match = url_re.search(block)
            fabricated.append({
                'ghcid': filepath.stem,
                'filepath': str(filepath),
                'url': url_match.group(1) if url_match else None,
                'claims_count': block.count('claim_type:'),
            })
        else:
            valid.append(filepath.name)
    print(f"Found {enriched_count} files with web_enrichment")

    print(f"\n{'='*60}")
    print("RESULTS")
    print('=' * 60)
    print(f"Files with proper xpath: {len(valid)}")
    print(f"FABRICATED (no xpath): {len(fabricated)}")
    print(f"Total fabricated claims: {sum(f['claims_count'] for f in fabricated)}")

    # Split the fabricated set by whether a source URL was recovered.
    with_url = [f for f in fabricated if f['url']]
    without_url = [f for f in fabricated if not f['url']]
    print(f"\nFiles with URL for re-enrichment: {len(with_url)}")
    print(f"Files without URL (need lookup): {len(without_url)}")

    # Persist the queue for the downstream re-enrichment job.
    output = {
        'generated_at': datetime.now().isoformat(),
        'total_fabricated': len(fabricated),
        'with_url': len(with_url),
        'without_url': len(without_url),
        'queue': with_url,
        'missing_url': [f['ghcid'] for f in without_url],
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved to: {output_path}")

    # Quick visual sanity check of what will be re-fetched.
    print("\nSample URLs to re-enrich:")
    for item in with_url[:10]:
        print(f"  {item['ghcid']}: {item['url']}")


if __name__ == '__main__':
    main()