104 lines
3.3 KiB
Python
104 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fast extraction of URLs from fabricated web_enrichment files.
|
|
Uses regex only - no YAML parsing.
|
|
"""
|
|
import os
|
|
import re
|
|
import json
|
|
import subprocess
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Default locations used when main() is called without arguments.
_DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')
_DEFAULT_OUTPUT_PATH = Path('/Users/kempersc/apps/glam/data/reenrich_queue.json')

# The web_enrichment block runs from its top-level key up to the next
# unindented lowercase key, or to the end of the file.
_WEB_ENRICHMENT_BLOCK_RE = re.compile(
    r'^web_enrichment:.*?(?=^[a-z_]+:|\Z)', re.MULTILINE | re.DOTALL
)
_CONFIDENCE_RE = re.compile(r'confidence:\s*0\.\d+')
# [ \t]* (rather than \s*) keeps the match on one line, so an empty
# ``source_url:`` cannot capture the key on the following line as its value.
_SOURCE_URL_RE = re.compile(r'source_url:[ \t]*[\'"]?([^\s\'"]+)')


def _collect_enrichment_blocks(custodian_dir):
    """Return ``(path, block_text)`` for every ``*.yaml`` file with a web_enrichment block."""
    found = []
    # Sorted for deterministic output. Scanning in Python avoids passing
    # every filename through a shell command line.
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        # Skip dotfiles, which a shell ``*.yaml`` glob would not expand to.
        if filepath.name.startswith('.'):
            continue
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {filepath.name}: {e}")
            continue
        match = _WEB_ENRICHMENT_BLOCK_RE.search(content)
        if match:
            found.append((filepath, match.group(0)))
    return found


def main(custodian_dir=_DEFAULT_CUSTODIAN_DIR, output_path=_DEFAULT_OUTPUT_PATH):
    """Build the re-enrichment queue from custodian YAML files.

    A file is treated as fabricated when its web_enrichment block contains
    a ``confidence: 0.x`` score but no ``xpath:`` entry. For each such file
    the first ``source_url`` and the number of ``claim_type:`` entries are
    recorded. A summary is printed and the queue is written as JSON.

    Args:
        custodian_dir: Directory holding the ``*.yaml`` custodian files.
        output_path: Destination of the JSON queue file.

    Raises:
        FileNotFoundError: If ``custodian_dir`` is not an existing directory.
    """
    custodian_dir = Path(custodian_dir)
    output_path = Path(output_path)

    # Fail loudly instead of silently writing an empty queue.
    if not custodian_dir.is_dir():
        raise FileNotFoundError(f"Custodian directory not found: {custodian_dir}")

    print("Finding files with fabricated web_enrichment...")

    blocks = _collect_enrichment_blocks(custodian_dir)
    print(f"Found {len(blocks)} files with web_enrichment")

    # Split into blocks with a confidence score but no xpath, and the rest.
    fabricated = []
    valid = []

    for filepath, block in blocks:
        has_xpath = 'xpath:' in block
        has_confidence = _CONFIDENCE_RE.search(block) is not None

        if not has_confidence or has_xpath:
            # NOTE(review): this bucket also receives blocks that have
            # neither an xpath nor a matching confidence score.
            valid.append(filepath.name)
            continue

        url_match = _SOURCE_URL_RE.search(block)
        fabricated.append({
            'ghcid': filepath.stem,
            'filepath': str(filepath),
            'url': url_match.group(1) if url_match else None,
            'claims_count': block.count('claim_type:'),
        })

    print(f"\n{'='*60}")
    print("RESULTS")
    print('='*60)
    print(f"Files with proper xpath: {len(valid)}")
    print(f"FABRICATED (no xpath): {len(fabricated)}")
    print(f"Total fabricated claims: {sum(f['claims_count'] for f in fabricated)}")

    # Only entries with a URL can be queued directly for re-enrichment.
    with_url = [f for f in fabricated if f['url']]
    without_url = [f for f in fabricated if not f['url']]

    print(f"\nFiles with URL for re-enrichment: {len(with_url)}")
    print(f"Files without URL (need lookup): {len(without_url)}")

    output = {
        'generated_at': datetime.now().isoformat(),
        'total_fabricated': len(fabricated),
        'with_url': len(with_url),
        'without_url': len(without_url),
        'queue': with_url,
        'missing_url': [f['ghcid'] for f in without_url],
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)

    print(f"\nSaved to: {output_path}")

    print("\nSample URLs to re-enrich:")
    for item in with_url[:10]:
        print(f"  {item['ghcid']}: {item['url']}")
|
|
|
|
# Script entry point: run the scan only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|