#!/usr/bin/env python3
"""
Re-enrich files with fabricated web_enrichment sections.

Per Rule 6 in AGENTS.md: "Every claim extracted from a webpage MUST have an XPath
pointer to the exact location in archived HTML where that value appears."

This script:
1. Identifies files with fabricated web_enrichment (no xpath)
2. Extracts the source_url from each
3. Creates a list of URLs to re-fetch and re-enrich with proper xpath provenance
"""

import json
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path

import yaml


def has_xpath_in_block(block_content: str) -> bool:
    """Return True when the block text carries at least one xpath pointer."""
    return block_content.find('xpath:') != -1


|
def has_vague_confidence(block_content: str) -> bool:
    """Return True when the block quotes a fractional confidence score but
    provides no xpath provenance to back it up."""
    if 'xpath:' in block_content:
        # Provenance is present, so the confidence is not "vague".
        return False
    return re.search(r'confidence:\s*0\.\d+', block_content) is not None


|
def extract_block(content: str, key: str) -> tuple[int, int, str]:
    """
    Locate a top-level YAML block introduced by ``key:``.

    Returns (start_line, end_line, block_content), where end_line is the
    exclusive index of the first top-level ``key: value`` line after the
    block (or the line count if the block runs to EOF). Returns
    (-1, -1, '') when the key never appears.
    """
    lines = content.split('\n')
    marker = f'{key}:'
    start = -1

    for idx, text in enumerate(lines):
        # A (re)occurrence of the key at top level (re)opens the block.
        if text.startswith(marker) and not text.startswith(' '):
            start = idx
            continue
        if start < 0:
            continue
        # Any other unindented, non-comment "key: value" line ends the block.
        if text and not text.startswith(' ') and not text.startswith('#') and ':' in text:
            return start, idx, '\n'.join(lines[start:idx])

    if start >= 0:
        # Block ran to the end of the document.
        return start, len(lines), '\n'.join(lines[start:])

    return -1, -1, ''


|
def analyze_file(filepath: Path) -> dict:
    """Inspect one custodian YAML file's web_enrichment block.

    Returns a dict recording whether the block exists, whether it carries
    xpath provenance, whether it looks fabricated (confidence scores with
    no xpath), its claim count, and any source_url / website found. Any
    read or parse failure is reported under an 'error' key instead of
    being raised, so callers can skip the file.
    """
    report = {
        'filepath': str(filepath),
        'filename': filepath.name,
        'has_web_enrichment': False,
        'has_vague_confidence': False,
        'has_xpath': False,
        'is_fabricated': False,
        'claims_count': 0,
        'source_url': None,
        'website': None,
        'ghcid': filepath.stem,
    }

    try:
        content = filepath.read_text(encoding='utf-8')

        start, _end, block = extract_block(content, 'web_enrichment')
        if start >= 0:
            report['has_web_enrichment'] = True
            report['has_xpath'] = 'xpath:' in block
            report['has_vague_confidence'] = has_vague_confidence(block)
            report['claims_count'] = block.count('claim_type:')

            # Pull source_url straight out of the block text.
            url_match = re.search(r'source_url:\s*([^\n]+)', block)
            if url_match:
                report['source_url'] = url_match.group(1).strip().strip("'\"")

            # Fabricated = claims scored with confidence but no provenance.
            report['is_fabricated'] = (
                report['has_vague_confidence'] and not report['has_xpath']
            )

            # Fall back to other sections for a usable website URL.
            parsed = yaml.safe_load(content)
            if parsed:
                gm = parsed.get('google_maps_enrichment', {})
                if gm and gm.get('website'):
                    report['website'] = gm['website']

                oe = parsed.get('original_entry', {})
                if oe and oe.get('website'):
                    report['website'] = report['website'] or oe['website']

    except Exception as e:
        # Best-effort: record the failure so the caller can skip this file.
        report['error'] = str(e)

    return report


|
def main():
    """Scan custodian YAML files, report fabricated web_enrichment blocks,
    and write a JSON re-enrichment queue of URLs to re-fetch."""
    import argparse

    parser = argparse.ArgumentParser(description='Identify files needing web enrichment')
    parser.add_argument('--output', '-o', default='data/reenrich_queue.json',
                        help='Output file for re-enrichment queue')
    parser.add_argument('--verbose', '-v', action='store_true', help='Print details')
    args = parser.parse_args()

    # NOTE(review): hard-coded absolute path — assumes this machine's layout.
    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    print("Analyzing web_enrichment sections...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # Find files with a top-level web_enrichment key. A pure-Python scan
    # replaces the previous shell=True grep pipeline: no shell quoting
    # surface, no ARG_MAX limit on the *.yaml glob expansion, portable
    # across platforms, and the same result set (sorted filenames).
    files_to_check = []
    if custodian_dir.is_dir():
        for path in sorted(custodian_dir.glob('*.yaml')):
            try:
                with open(path, 'r', encoding='utf-8') as fh:
                    if any(line.startswith('web_enrichment:') for line in fh):
                        files_to_check.append(path)
            except OSError:
                continue  # unreadable file: skip silently, like grep did

    print(f"Found {len(files_to_check)} files with web_enrichment sections")
    print()

    # Analyze all files, splitting them into fabricated vs. properly sourced.
    fabricated_files = []
    valid_files = []

    for filepath in files_to_check:
        analysis = analyze_file(filepath)

        if 'error' in analysis:
            if args.verbose:
                print(f" ERROR in {filepath.name}: {analysis['error']}")
            continue

        if analysis['is_fabricated']:
            fabricated_files.append(analysis)
            if args.verbose:
                url = analysis['source_url'] or analysis['website'] or 'NO URL'
                print(f" FABRICATED: {analysis['filename']} ({analysis['claims_count']} claims) -> {url}")
        else:
            valid_files.append(analysis)

    print("=" * 60)
    print("ANALYSIS RESULTS")
    print("=" * 60)
    print(f"Total files with web_enrichment: {len(files_to_check)}")
    print(f" With proper xpath provenance: {len(valid_files)}")
    print(f" FABRICATED (need re-enrichment): {len(fabricated_files)}")
    print(f"Total fabricated claims: {sum(f['claims_count'] for f in fabricated_files)}")
    print()

    # Build the re-enrichment queue; files with no URL need manual review.
    reenrich_queue = []
    no_url_files = []

    for f in fabricated_files:
        url = f['source_url'] or f['website']
        if url:
            reenrich_queue.append({
                'ghcid': f['ghcid'],
                'filepath': f['filepath'],
                'url': url,
                'claims_count': f['claims_count'],
            })
        else:
            no_url_files.append(f['filename'])

    print(f"Files with URLs to re-enrich: {len(reenrich_queue)}")
    print(f"Files without URLs (manual review): {len(no_url_files)}")

    if no_url_files and args.verbose:
        print("\nFiles without URLs:")
        for f in no_url_files[:10]:
            print(f" - {f}")
        if len(no_url_files) > 10:
            print(f" ... and {len(no_url_files) - 10} more")

    # Persist the queue as JSON for the downstream re-enrichment step.
    output_path = Path('/Users/kempersc/apps/glam') / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)

    output_data = {
        'generated_at': datetime.now().isoformat(),
        'total_files': len(fabricated_files),
        'files_with_urls': len(reenrich_queue),
        'files_without_urls': no_url_files,
        'queue': reenrich_queue,
    }

    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nRe-enrichment queue saved to: {output_path}")
    print(f"Completed at: {datetime.now().isoformat()}")


|
if __name__ == '__main__':
    # Run the scan only when executed as a script, not on import.
    main()