glam/scripts/extract_reenrich_urls_fast.py
2025-12-21 22:12:34 +01:00

104 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""
Fast extraction of URLs from fabricated web_enrichment files.
Uses regex only - no YAML parsing.
"""
import os
import re
import json
import subprocess
from pathlib import Path
from datetime import datetime
def main(
    custodian_dir: Path = Path('/Users/kempersc/apps/glam/data/custodian'),
    output_path: Path = Path('/Users/kempersc/apps/glam/data/reenrich_queue.json'),
) -> None:
    """Scan custodian YAML files for fabricated ``web_enrichment`` blocks.

    A block counts as *fabricated* when it carries confidence scores
    (``confidence: 0.x``) but no ``xpath:`` evidence anchors. Files with a
    recoverable ``source_url`` go into a re-enrichment queue JSON written to
    *output_path*; files lacking a URL are listed for manual lookup.

    Args:
        custodian_dir: Directory of ``*.yaml`` files to scan (non-recursive).
        output_path: Destination path for the JSON queue file.
    """
    print("Finding files with fabricated web_enrichment...")

    # Pure-Python equivalent of `grep -l "^web_enrichment:" *.yaml`: avoids a
    # shell=True subprocess and reads each file exactly once (the original
    # grep-then-reopen did two passes). Sorted to mirror the shell glob order.
    top_key_re = re.compile(r'^web_enrichment:', re.MULTILINE)
    # The web_enrichment block runs from its key to the next top-level key or EOF.
    block_re = re.compile(r'^web_enrichment:.*?(?=^[a-z_]+:|\Z)',
                          re.MULTILINE | re.DOTALL)
    # NOTE: only fractional scores (0.x) match; a literal `confidence: 1.0`
    # would not count — heuristic preserved from the original.
    conf_re = re.compile(r'confidence:\s*0\.\d+')
    url_re = re.compile(r'source_url:\s*[\'"]?([^\s\'"]+)')

    fabricated = []   # dicts describing files with scores but no xpath evidence
    valid = []        # filenames whose enrichment carries xpath anchors
    enriched_count = 0
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        try:
            content = filepath.read_text(encoding='utf-8')
        except OSError as e:
            # Was: "Error reading (unknown)" — now names the failing file.
            print(f"Error reading {filepath.name}: {e}")
            continue
        if not top_key_re.search(content):
            continue
        enriched_count += 1
        match = block_re.search(content)
        if not match:
            continue
        block = match.group(0)
        if conf_re.search(block) and 'xpath:' not in block:
            url_match = url_re.search(block)
            fabricated.append({
                'ghcid': filepath.stem,
                'filepath': str(filepath),
                'url': url_match.group(1) if url_match else None,
                'claims_count': block.count('claim_type:'),
            })
        else:
            valid.append(filepath.name)
    print(f"Found {enriched_count} files with web_enrichment")

    print(f"\n{'='*60}")
    print("RESULTS")
    print('=' * 60)
    print(f"Files with proper xpath: {len(valid)}")
    print(f"FABRICATED (no xpath): {len(fabricated)}")
    print(f"Total fabricated claims: {sum(f['claims_count'] for f in fabricated)}")

    # Split the fabricated set by whether a source URL was recovered.
    with_url = [f for f in fabricated if f['url']]
    without_url = [f for f in fabricated if not f['url']]
    print(f"\nFiles with URL for re-enrichment: {len(with_url)}")
    print(f"Files without URL (need lookup): {len(without_url)}")

    # Persist the queue for the downstream re-enrichment job.
    output = {
        'generated_at': datetime.now().isoformat(),
        'total_fabricated': len(fabricated),
        'with_url': len(with_url),
        'without_url': len(without_url),
        'queue': with_url,
        'missing_url': [f['ghcid'] for f in without_url],
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved to: {output_path}")

    # Quick visual sanity check of what will be re-fetched.
    print("\nSample URLs to re-enrich:")
    for item in with_url[:10]:
        print(f"  {item['ghcid']}: {item['url']}")


if __name__ == '__main__':
    main()