150 lines
5.4 KiB
Python
150 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
Simplified website discovery for custodians.

Discovers websites by:

1. Searching DuckDuckGo (via httpx)
2. Verifying that each candidate URL responds successfully
3. Updating custodian YAML files with the discovered URLs
"""
|
|
import asyncio
import html
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin, urlparse

import httpx
import yaml
|
|
|
|
# --- Logging setup ---
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

# --- Paths and tunables ---
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
REQUEST_DELAY = 3.0  # seconds between outgoing HTTP requests
DUCKDUCKGO_SEARCH = "https://duckduckgo.com/html/?q="
|
|
async def discover_websites(name, city, country):
    """Search DuckDuckGo for *name* and return the first verified website.

    Args:
        name: Custodian name to search for.
        city: Optional city appended to the query to narrow results.
        country: Accepted for API compatibility; currently unused.

    Returns:
        A dict with ``status == 'found'`` (plus ``website_url`` / ``title``),
        a dict with ``status == 'not_found'`` when no candidate verified,
        or ``None`` when the search itself failed.
    """
    logger.info(f"Searching for: {name}")

    # Build the search query: "<name> <city>" when a city is known.
    query = f"{name} {city}" if city else name
    search_url = f"{DUCKDUCKGO_SEARCH}{query.replace(' ', '+')}"

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
            response = await client.get(search_url)
            if response.status_code not in (200, 202):
                logger.warning(f"Search failed: {response.status_code}")
                return None
            page = response.text

        # Extract candidate links from the result HTML.
        # NOTE(review): the previous regex had unbalanced quote groups and
        # could not match ordinary anchors; this is the corrected form.
        links = []
        for match in re.finditer(r'<a[^>]+href="([^"]+)"[^>]*>([^<]+)</a>', page, re.I):
            # Undo HTML entity escaping (&amp; etc.) inside the URL.
            href = html.unescape(match.group(1))
            # Keep only absolute http(s) URLs, and skip DuckDuckGo's own
            # navigation/redirect links.
            if not href.startswith(('http://', 'https://')):
                continue
            if 'duckduckgo.com' in urlparse(href).netloc:
                continue
            links.append({'url': href, 'title': match.group(2).strip()})

        if not links:
            logger.info("No results found")
            return None

        logger.info(f"Found {len(links)} candidates, verifying...")

        # Try candidates shortest-title-first (heuristic: homepage links tend
        # to carry short anchor text) and accept the first one answering 200.
        # One shared client instead of one per candidate.
        async with httpx.AsyncClient(timeout=15.0) as verify_client:
            for i, link in enumerate(sorted(links, key=lambda x: len(x['title']))):
                if i:
                    # Be polite: pause between verification requests.
                    await asyncio.sleep(REQUEST_DELAY)
                try:
                    verify_response = await verify_client.get(link['url'])
                except Exception:
                    logger.debug(f"Verification error for {link['url']}")
                    continue
                if verify_response.status_code != 200:
                    logger.debug(f"Verification failed for {link['url']}")
                    continue

                logger.info(f"Verified: {link['url']}")
                logger.info(f"Best candidate: {link['url']}")
                return {
                    'status': 'found',
                    'message': f"Discovered and verified: {link['url']}",
                    'website_url': link['url'],
                    'title': link.get('title'),
                }

        logger.info("No valid websites found")
        return {
            'status': 'not_found',
            'message': 'No valid results found',
        }

    except Exception as e:
        logger.error(f"Search error: {e}")
        return None
|
|
|
|
def update_custodian_file(filepath, website_url, title):
    """Record a discovered website in a custodian YAML file.

    Adds (or overwrites) a ``website_discovery`` mapping on the entry and
    rewrites the file in place.

    Args:
        filepath: Path to the custodian YAML file.
        website_url: Verified website URL to record.
        title: Page/link title associated with the URL (may be None).

    Returns:
        True when the file was updated, False on any failure (missing file,
        invalid YAML, non-mapping document, I/O error). Never raises.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        # Reject empty files AND documents that parse to a non-mapping
        # (e.g. a YAML list), which the old truthiness check let through.
        if not isinstance(entry, dict):
            logger.error(f"Invalid file: {filepath}")
            return False

        entry['website_discovery'] = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'crawl4ai_search_and_verify',
            'title': title,
            'confidence_score': 0.0,  # Will be updated if verification succeeds
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        logger.info(f"Updated: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False
|
|
|
|
async def main():
    """Discover and record websites for custodian YAML files."""
    files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:1]  # Test with 1 file

    logger.info(f"Processing {len(files)} custodian files...")

    for i, filepath in enumerate(files):
        # Rate-limit between custodians (no delay before the first one).
        if i:
            await asyncio.sleep(REQUEST_DELAY)

        name = filepath.stem.replace('_', ' ')
        logger.info(f"Processing: {name}")

        result = await discover_websites(name, None, 'JP')

        # BUGFIX: a {'status': 'not_found'} result is truthy; the old code
        # passed it straight to update_custodian_file and wrote
        # website_url: null into the YAML. Only record verified URLs.
        website_url = (result or {}).get('website_url')
        if website_url:
            if update_custodian_file(filepath, website_url, result.get('title')):
                logger.info(f" → Discovered: {website_url}")
        else:
            logger.info("No website found")

    logger.info("Done!")


if __name__ == '__main__':
    asyncio.run(main())
|