#!/usr/bin/env python3
"""
Re-enrich files with fabricated web_enrichment sections.

Per Rule 6 in AGENTS.md: "Every claim extracted from a webpage MUST have an XPath
pointer to the exact location in archived HTML where that value appears."

This script:
1. Identifies files with fabricated web_enrichment (no xpath)
2. Extracts the source_url from each
3. Creates a list of URLs to re-fetch and re-enrich with proper xpath provenance
"""

import json
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path

import yaml


def has_xpath_in_block(block_content: str) -> bool:
    """Return True when the block text carries at least one xpath pointer."""
    return block_content.find('xpath:') != -1


|
def has_vague_confidence(block_content: str) -> bool:
    """Return True when the block quotes a fractional confidence score but
    provides no xpath provenance to back it up."""
    if 'xpath:' in block_content:
        # Provenance is present, so the confidence is not "vague".
        return False
    return re.search(r'confidence:\s*0\.\d+', block_content) is not None


|
def extract_block(content: str, key: str) -> tuple[int, int, str]:
    """
    Locate a top-level YAML block introduced by ``key:``.

    Returns (start_line, end_line, block_content), where end_line is the
    exclusive index of the first top-level ``key: value`` line after the
    block (or the line count if the block runs to EOF). Returns
    (-1, -1, '') when the key never appears.
    """
    lines = content.split('\n')
    marker = f'{key}:'
    start = -1

    for idx, text in enumerate(lines):
        # A (re)occurrence of the key at top level (re)opens the block.
        if text.startswith(marker) and not text.startswith(' '):
            start = idx
            continue
        if start < 0:
            continue
        # Any other unindented, non-comment "key: value" line ends the block.
        if text and not text.startswith(' ') and not text.startswith('#') and ':' in text:
            return start, idx, '\n'.join(lines[start:idx])

    if start >= 0:
        # Block ran to the end of the document.
        return start, len(lines), '\n'.join(lines[start:])

    return -1, -1, ''


|
def analyze_file(filepath: Path) -> dict:
    """Inspect one custodian YAML file's web_enrichment block.

    Returns a dict recording whether the block exists, whether it carries
    xpath provenance, whether it looks fabricated (confidence scores with
    no xpath), its claim count, and any source_url / website found. Any
    read or parse failure is reported under an 'error' key instead of
    being raised, so callers can skip the file.
    """
    report = {
        'filepath': str(filepath),
        'filename': filepath.name,
        'has_web_enrichment': False,
        'has_vague_confidence': False,
        'has_xpath': False,
        'is_fabricated': False,
        'claims_count': 0,
        'source_url': None,
        'website': None,
        'ghcid': filepath.stem,
    }

    try:
        content = filepath.read_text(encoding='utf-8')

        start, _end, block = extract_block(content, 'web_enrichment')
        if start >= 0:
            report['has_web_enrichment'] = True
            report['has_xpath'] = 'xpath:' in block
            report['has_vague_confidence'] = has_vague_confidence(block)
            report['claims_count'] = block.count('claim_type:')

            # Pull source_url straight out of the block text.
            url_match = re.search(r'source_url:\s*([^\n]+)', block)
            if url_match:
                report['source_url'] = url_match.group(1).strip().strip("'\"")

            # Fabricated = claims scored with confidence but no provenance.
            report['is_fabricated'] = (
                report['has_vague_confidence'] and not report['has_xpath']
            )

            # Fall back to other sections for a usable website URL.
            parsed = yaml.safe_load(content)
            if parsed:
                gm = parsed.get('google_maps_enrichment', {})
                if gm and gm.get('website'):
                    report['website'] = gm['website']

                oe = parsed.get('original_entry', {})
                if oe and oe.get('website'):
                    report['website'] = report['website'] or oe['website']

    except Exception as e:
        # Best-effort: record the failure so the caller can skip this file.
        report['error'] = str(e)

    return report


|
def main():
    """Scan custodian YAML files, report fabricated web_enrichment blocks,
    and write a JSON re-enrichment queue of URLs to re-fetch."""
    import argparse

    parser = argparse.ArgumentParser(description='Identify files needing web enrichment')
    parser.add_argument('--output', '-o', default='data/reenrich_queue.json',
                        help='Output file for re-enrichment queue')
    parser.add_argument('--verbose', '-v', action='store_true', help='Print details')
    args = parser.parse_args()

    # NOTE(review): hard-coded absolute path — assumes this machine's layout.
    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    print("Analyzing web_enrichment sections...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # Find files with a top-level web_enrichment key. A pure-Python scan
    # replaces the previous shell=True grep pipeline: no shell quoting
    # surface, no ARG_MAX limit on the *.yaml glob expansion, portable
    # across platforms, and the same result set (sorted filenames).
    files_to_check = []
    if custodian_dir.is_dir():
        for path in sorted(custodian_dir.glob('*.yaml')):
            try:
                with open(path, 'r', encoding='utf-8') as fh:
                    if any(line.startswith('web_enrichment:') for line in fh):
                        files_to_check.append(path)
            except OSError:
                continue  # unreadable file: skip silently, like grep did

    print(f"Found {len(files_to_check)} files with web_enrichment sections")
    print()

    # Analyze all files, splitting them into fabricated vs. properly sourced.
    fabricated_files = []
    valid_files = []

    for filepath in files_to_check:
        analysis = analyze_file(filepath)

        if 'error' in analysis:
            if args.verbose:
                print(f" ERROR in {filepath.name}: {analysis['error']}")
            continue

        if analysis['is_fabricated']:
            fabricated_files.append(analysis)
            if args.verbose:
                url = analysis['source_url'] or analysis['website'] or 'NO URL'
                print(f" FABRICATED: {analysis['filename']} ({analysis['claims_count']} claims) -> {url}")
        else:
            valid_files.append(analysis)

    print("=" * 60)
    print("ANALYSIS RESULTS")
    print("=" * 60)
    print(f"Total files with web_enrichment: {len(files_to_check)}")
    print(f" With proper xpath provenance: {len(valid_files)}")
    print(f" FABRICATED (need re-enrichment): {len(fabricated_files)}")
    print(f"Total fabricated claims: {sum(f['claims_count'] for f in fabricated_files)}")
    print()

    # Build the re-enrichment queue; files with no URL need manual review.
    reenrich_queue = []
    no_url_files = []

    for f in fabricated_files:
        url = f['source_url'] or f['website']
        if url:
            reenrich_queue.append({
                'ghcid': f['ghcid'],
                'filepath': f['filepath'],
                'url': url,
                'claims_count': f['claims_count'],
            })
        else:
            no_url_files.append(f['filename'])

    print(f"Files with URLs to re-enrich: {len(reenrich_queue)}")
    print(f"Files without URLs (manual review): {len(no_url_files)}")

    if no_url_files and args.verbose:
        print("\nFiles without URLs:")
        for f in no_url_files[:10]:
            print(f" - {f}")
        if len(no_url_files) > 10:
            print(f" ... and {len(no_url_files) - 10} more")

    # Persist the queue as JSON for the downstream re-enrichment step.
    output_path = Path('/Users/kempersc/apps/glam') / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)

    output_data = {
        'generated_at': datetime.now().isoformat(),
        'total_files': len(fabricated_files),
        'files_with_urls': len(reenrich_queue),
        'files_without_urls': no_url_files,
        'queue': reenrich_queue,
    }

    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nRe-enrichment queue saved to: {output_path}")
    print(f"Completed at: {datetime.now().isoformat()}")


|
if __name__ == '__main__':
    # Run the scan only when executed as a script, not on import.
    main()