#!/usr/bin/env python3
"""
Simplified Website Discovery for Custodians using crawl4ai.

Discovers websites by:
1. Searching DuckDuckGo
2. Verifying with crawl4ai
3. Updating YAML files with discovered URLs
"""
import asyncio
import json
import logging
import re
import sys
from datetime import datetime, timezone
from html import unescape
from pathlib import Path
from urllib.parse import quote_plus, urljoin, urlparse

import httpx
import yaml

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
REQUEST_DELAY = 3.0  # seconds between requests (politeness delay)
DUCKDUCKGO_SEARCH = "https://duckduckgo.com/html/?q="

# Anchor tags in DuckDuckGo's HTML results page:
# group(1) = href attribute value, group(2) = visible link text.
# Compiled once at module level since it runs on every search.
RESULT_LINK_RE = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>([^<]+)', re.I)


async def discover_websites(name, city, country):
    """Search DuckDuckGo for *name* and verify candidate websites.

    Parameters
    ----------
    name : str
        Custodian name to search for.
    city : str or None
        Optional city appended to the search query.
    country : str
        Country code; currently not used in the query — kept for the
        caller's interface.

    Returns
    -------
    dict or None
        ``{'status': 'found', 'website_url': ..., 'title': ..., 'message': ...}``
        when a candidate verifies with HTTP 200,
        ``{'status': 'not_found', 'message': ...}`` when no candidate
        verifies, or ``None`` when the search itself fails.
    """
    logger.info(f"Searching for: {name}")

    # Build the query; append the city only when one was given.
    query = f"{name} {city}" if city else name

    # quote_plus handles spaces AND all other reserved characters,
    # unlike the naive replace(' ', '+') it replaces.
    search_url = f"{DUCKDUCKGO_SEARCH}{quote_plus(query)}"

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
            response = await client.get(search_url)

            if response.status_code not in (200, 202):
                logger.warning(f"Search failed: {response.status_code}")
                return None

            page = response.text
            links = []
            for match in RESULT_LINK_RE.finditer(page):
                # Hrefs are HTML-escaped in the page source (&amp; etc.);
                # unescape restores the real URL.
                href = unescape(match.group(1))
                if href:
                    links.append({'url': href, 'title': match.group(2)})

            if not links:
                logger.info("No results found")
                return None

            logger.info(f"Found {len(links)} candidates, verifying...")

            verified = []
            # Shorter titles first — they tend to be the bare site name
            # rather than a long article headline.
            for link in sorted(links, key=lambda x: len(x['title'])):
                try:
                    async with httpx.AsyncClient(timeout=15.0) as verify_client:
                        verify_response = await verify_client.get(link['url'])
                        if verify_response.status_code == 200:
                            logger.info(f"Verified: {link['url']}")
                            verified.append({
                                'url': link['url'],
                                'title': link['title'],
                                'status': 'found'
                            })
                        else:
                            logger.debug(f"Verification failed for {link['url']}")
                except Exception:
                    logger.debug(f"Verification error for {link['url']}")
                # Honor the configured politeness delay between requests
                # (REQUEST_DELAY was previously declared but never used).
                await asyncio.sleep(REQUEST_DELAY)

            if verified:
                best = verified[0]
                logger.info(f"Best candidate: {best['url']}")
                return {
                    'status': 'found',
                    'message': f"Discovered and verified: {best['url']}",
                    'website_url': best['url'],
                    'title': best.get('title'),
                }

            logger.info("No valid websites found")
            return {
                'status': 'not_found',
                'message': 'No valid results found'
            }

    except Exception as e:
        logger.error(f"Search error: {e}")
        return None


def update_custodian_file(filepath, website_url, title):
    """Update a custodian YAML file with the discovered website.

    Loads the YAML entry, adds/overwrites its ``website_discovery``
    section, and writes the file back in place.

    Parameters
    ----------
    filepath : Path or str
        Path to the custodian YAML file.
    website_url : str
        Verified website URL to record.
    title : str or None
        Title of the discovered page.

    Returns
    -------
    bool
        True on success; False when the file is empty/invalid or any
        read/write error occurs (the error is logged, not raised).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            logger.error(f"Invalid file: {filepath}")
            return False

        # Add website discovery section
        entry['website_discovery'] = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'crawl4ai_search_and_verify',
            'title': title,
            'confidence_score': 0.0,  # Will be updated if verification succeeds
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True,
                      default_flow_style=False, sort_keys=False)

        logger.info(f"Updated: {filepath}")
        return True

    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False


async def main():
    """Discover and record websites for custodian YAML files."""
    files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:1]  # Test with 1 file
    logger.info(f"Processing {len(files)} custodian files...")

    for filepath in files:
        # Derive the custodian name from the file name, e.g.
        # "JP-Some_Bank.yaml" -> "JP-Some Bank".
        name = filepath.stem.replace('_', ' ')
        logger.info(f"Processing: {name}")

        result = await discover_websites(name, None, 'JP')

        # Only persist when the search produced a verified site; a truthy
        # {'status': 'not_found'} result must NOT write a null URL.
        if result and result.get('status') == 'found':
            website_url = result.get('website_url') or result.get('url')
            title = result.get('title')
            if update_custodian_file(filepath, website_url, title):
                logger.info(f" → Discovered: {website_url}")
        else:
            logger.info("No website found")

    logger.info("Done!")


if __name__ == '__main__':
    asyncio.run(main())