# glam/scripts/reenrich_fabricated_web_claims.py
# 2025-12-21 22:12:34 +01:00
# 223 lines, 7.3 KiB, Python

#!/usr/bin/env python3
"""
Re-enrich files with fabricated web_enrichment sections.
Per Rule 6 in AGENTS.md: "Every claim extracted from a webpage MUST have an XPath
pointer to the exact location in archived HTML where that value appears."
This script:
1. Identifies files with fabricated web_enrichment (no xpath)
2. Extracts the source_url from each
3. Creates a list of URLs to re-fetch and re-enrich with proper xpath provenance
"""
import os
import sys
import re
import yaml
from pathlib import Path
from datetime import datetime
import subprocess
import json
def has_xpath_in_block(block_content: str) -> bool:
    """Return True when the block text carries an xpath provenance marker."""
    return block_content.find('xpath:') != -1
def has_vague_confidence(block_content: str) -> bool:
    """Report whether a block states confidence scores (0.x) yet lacks any
    xpath provenance — the signature of a fabricated enrichment."""
    if 'xpath:' in block_content:
        return False
    return re.search(r'confidence:\s*0\.\d+', block_content) is not None
def extract_block(content: str, key: str) -> tuple[int, int, str]:
    """
    Find the top-level YAML block introduced by ``key:``.

    Returns (start_line, end_line, block_text) using 0-based line indices
    with an exclusive end, or (-1, -1, '') when the key is absent.
    """
    lines = content.split('\n')
    start_idx = -1
    for idx, text in enumerate(lines):
        if start_idx < 0:
            # Still searching for the opening "key:" line at column zero.
            if text.startswith(f'{key}:') and not text.startswith(' '):
                start_idx = idx
            continue
        # Inside the block: the next un-indented, non-comment "name: ..." line
        # marks where this block ends.
        if text and ':' in text and not text.startswith((' ', '#')):
            return start_idx, idx, '\n'.join(lines[start_idx:idx])
    if start_idx >= 0:
        # Block runs to end-of-file.
        return start_idx, len(lines), '\n'.join(lines[start_idx:])
    return -1, -1, ''
def analyze_file(filepath: Path) -> dict:
    """
    Inspect one custodian YAML file for a problematic web_enrichment block.

    Returns a dict of flags (a block is "fabricated" when it has confidence
    scores but no xpath provenance), a claim count, and the best-known
    source_url / website for re-enrichment. Any failure adds an 'error' key
    alongside whatever was gathered before the failure.
    """
    info = {
        'filepath': str(filepath),
        'filename': filepath.name,
        'has_web_enrichment': False,
        'has_vague_confidence': False,
        'has_xpath': False,
        'is_fabricated': False,
        'claims_count': 0,
        'source_url': None,
        'website': None,
        'ghcid': filepath.stem,
    }
    try:
        text = filepath.read_text(encoding='utf-8')
        start, _end, block = extract_block(text, 'web_enrichment')
        if start != -1:
            info['has_web_enrichment'] = True
            info['has_xpath'] = 'xpath:' in block
            info['has_vague_confidence'] = has_vague_confidence(block)
            info['claims_count'] = block.count('claim_type:')
            # Pull the declared source URL (if any) out of the block text.
            url_match = re.search(r'source_url:\s*([^\n]+)', block)
            if url_match:
                info['source_url'] = url_match.group(1).strip().strip("'\"")
            info['is_fabricated'] = info['has_vague_confidence'] and not info['has_xpath']
        # Fall back to other sections for a website, preferring Google Maps data.
        doc = yaml.safe_load(text)
        if doc:
            gmaps = doc.get('google_maps_enrichment') or {}
            if gmaps.get('website'):
                info['website'] = gmaps['website']
            original = doc.get('original_entry') or {}
            if original.get('website') and not info['website']:
                info['website'] = original['website']
    except Exception as exc:  # record the failure so a batch run continues
        info['error'] = str(exc)
    return info
def main():
    """
    Scan the custodian directory for YAML files whose web_enrichment block is
    fabricated (confidence scores without xpath provenance, per Rule 6 in
    AGENTS.md), then write a JSON queue of URLs to re-fetch and re-enrich.

    Side effects: prints a report to stdout and writes the queue file given
    by --output (relative to the glam project root).
    """
    import argparse
    parser = argparse.ArgumentParser(description='Identify files needing web enrichment')
    parser.add_argument('--output', '-o', default='data/reenrich_queue.json',
                        help='Output file for re-enrichment queue')
    parser.add_argument('--verbose', '-v', action='store_true', help='Print details')
    args = parser.parse_args()

    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    # BUG FIX: was an f-string with no placeholders.
    print("Analyzing web_enrichment sections...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # grep is much faster than parsing every YAML file in Python. shell=True is
    # required for the *.yaml glob; the command string is fixed (no user input),
    # so there is no injection risk. "|| true" keeps a no-match from failing.
    result = subprocess.run(
        'grep -l "^web_enrichment:" *.yaml 2>/dev/null || true',
        shell=True,
        capture_output=True,
        text=True,
        cwd=custodian_dir,
    )
    files_to_check = []
    if result.stdout.strip():
        files_to_check = [custodian_dir / f for f in result.stdout.strip().split('\n') if f]
    print(f"Found {len(files_to_check)} files with web_enrichment sections")
    print()

    # Partition into fabricated (confidence but no xpath) vs. properly sourced.
    fabricated_files = []
    valid_files = []
    for filepath in files_to_check:
        analysis = analyze_file(filepath)
        if 'error' in analysis:
            if args.verbose:
                print(f" ERROR in {filepath.name}: {analysis['error']}")
            continue
        if analysis['is_fabricated']:
            fabricated_files.append(analysis)
            if args.verbose:
                url = analysis['source_url'] or analysis['website'] or 'NO URL'
                print(f" FABRICATED: {analysis['filename']} ({analysis['claims_count']} claims) -> {url}")
        else:
            valid_files.append(analysis)

    print("=" * 60)
    print("ANALYSIS RESULTS")
    print("=" * 60)
    print(f"Total files with web_enrichment: {len(files_to_check)}")
    print(f" With proper xpath provenance: {len(valid_files)}")
    print(f" FABRICATED (need re-enrichment): {len(fabricated_files)}")
    print(f"Total fabricated claims: {sum(entry['claims_count'] for entry in fabricated_files)}")
    print()

    # Build the re-enrichment queue; files lacking any URL need manual review.
    # (Renamed the loop variable from "f" to avoid shadowing the file handle
    # used below when the queue is written.)
    reenrich_queue = []
    no_url_files = []
    for entry in fabricated_files:
        url = entry['source_url'] or entry['website']
        if url:
            reenrich_queue.append({
                'ghcid': entry['ghcid'],
                'filepath': entry['filepath'],
                'url': url,
                'claims_count': entry['claims_count'],
            })
        else:
            no_url_files.append(entry['filename'])

    print(f"Files with URLs to re-enrich: {len(reenrich_queue)}")
    print(f"Files without URLs (manual review): {len(no_url_files)}")
    if no_url_files and args.verbose:
        print("\nFiles without URLs:")
        for name in no_url_files[:10]:
            print(f" - {name}")
        if len(no_url_files) > 10:
            print(f" ... and {len(no_url_files) - 10} more")

    # Persist the queue as JSON for the downstream re-enrichment step.
    output_path = Path('/Users/kempersc/apps/glam') / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_data = {
        'generated_at': datetime.now().isoformat(),
        'total_files': len(fabricated_files),
        'files_with_urls': len(reenrich_queue),
        'files_without_urls': no_url_files,
        'queue': reenrich_queue,
    }
    # BUG FIX: write with an explicit encoding so output is UTF-8 everywhere.
    with open(output_path, 'w', encoding='utf-8') as fh:
        json.dump(output_data, fh, indent=2)
    print(f"\nRe-enrichment queue saved to: {output_path}")
    print(f"Completed at: {datetime.now().isoformat()}")
# Entry-point guard: run the scan only when executed as a script, not on import.
if __name__ == '__main__':
    main()