# glam/scripts/discover_websites_crawl4ai.py
# 2025-12-26 14:30:31 +01:00
# 150 lines, 5.4 KiB, Python
#!/usr/bin/env python3
"""
Simplified Website Discovery for Custodians using crawl4ai.
Discovers websites by:
1. Searching DuckDuckGo
2. Verifying with crawl4ai
3. Updating YAML files with discovered URLs
"""
import asyncio
import httpx
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin, urlparse
import yaml
# Logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
REQUEST_DELAY = 3.0 # seconds between requests
DUCKDUCKGO_SEARCH = "https://duckduckgo.com/html/?q="
async def discover_websites(name, city, country):
"""Search DuckDuckGo and verify websites."""
logger.info(f"Searching for: {name}")
# Simple search - use .format() to avoid f-string issues
city_part = f" {city}" if city else ""
query = f"{name}{city_part}" if city_part else f"{name}"
# Search DuckDuckGo
search_url = f"{DUCKDUCKGO_SEARCH}{query.replace(' ', '+')}"
try:
async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
response = await client.get(search_url)
if response.status_code not in [200, 202]:
logger.warning(f"Search failed: {response.status_code}")
return None
html = response.text
links = []
for match in re.finditer(r'<a[^>]+href="([^"]+)"[^"]*"([^"]*")\s*>([^<]+)</a>', html, re.I):
href = match.group(1).replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
if href:
links.append({'url': href, 'title': match.group(3)})
if not links:
logger.info(f"No results found")
return None
logger.info(f"Found {len(links)} candidates, verifying...")
verified = []
for link in sorted(links, key=lambda x: len(x['title'])):
try:
async with httpx.AsyncClient(timeout=15.0) as client:
verify_response = await client.get(link['url'])
if verify_response.status_code == 200:
logger.info(f"Verified: {link['url']}")
verified.append({
'url': link['url'],
'title': link['title'],
'status': 'found'
})
else:
logger.debug(f"Verification failed for {link['url']}")
except Exception:
logger.debug(f"Verification error for {link['url']}")
if verified:
best = verified[0]
logger.info(f"Best candidate: {best['url']}")
return {
'status': 'found',
'message': f"Discovered and verified: {best['url']}",
'website_url': best['url'],
'title': best.get('title'),
}
else:
logger.info(f"No valid websites found")
return {
'status': 'not_found',
'message': 'No valid results found'
}
except Exception as e:
logger.error(f"Search error: {e}")
return None
def update_custodian_file(filepath, website_url, title):
"""Update custodian YAML file with discovered website."""
try:
with open(filepath, 'r', encoding='utf-8') as f:
entry = yaml.safe_load(f)
if not entry:
logger.error(f"Invalid file: {filepath}")
return False
# Add website discovery section
entry['website_discovery'] = {
'website_url': website_url,
'discovery_date': datetime.now(timezone.utc).isoformat(),
'discovery_method': 'crawl4ai_search_and_verify',
'title': title,
'confidence_score': 0.0, # Will be updated if verification succeeds
}
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
logger.info(f"Updated: {filepath}")
return True
except Exception as e:
logger.error(f"Failed to update {filepath}: {e}")
return False
async def main():
files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:1] # Test with 1 file
logger.info(f"Processing {len(files)} custodian files...")
for filepath in files:
name = Path(filepath).stem.replace('_', ' ')
logger.info(f"Processing: {name}")
url = await discover_websites(name, None, 'JP')
if url:
website_url = url.get('website_url') or url.get('url')
title = url.get('title')
if update_custodian_file(filepath, website_url, title):
logger.info(f" → Discovered: {website_url}")
else:
logger.info(f"No website found")
logger.info("Done!")
if __name__ == '__main__':
asyncio.run(main())