# glam/scripts/discover_websites_crawl4ai.py
# 2025-12-26 14:30:31 +01:00
# 150 lines, 5.4 KiB, Python
#!/usr/bin/env python3
"""
Simplified Website Discovery for Custodians using crawl4ai.
Discovers websites by:
1. Searching DuckDuckGo
2. Verifying with crawl4ai
3. Updating YAML files with discovered URLs
"""
import asyncio
import httpx
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin, urlparse
import yaml
# Logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
REQUEST_DELAY = 3.0 # seconds between requests
DUCKDUCKGO_SEARCH = "https://duckduckgo.com/html/?q="
async def discover_websites(name, city, country):
"""Search DuckDuckGo and verify websites."""
logger.info(f"Searching for: {name}")
# Simple search - use .format() to avoid f-string issues
city_part = f" {city}" if city else ""
query = f"{name}{city_part}" if city_part else f"{name}"
# Search DuckDuckGo
search_url = f"{DUCKDUCKGO_SEARCH}{query.replace(' ', '+')}"
try:
async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
response = await client.get(search_url)
if response.status_code not in [200, 202]:
logger.warning(f"Search failed: {response.status_code}")
return None
html = response.text
links = []
for match in re.finditer(r'<a[^>]+href="([^"]+)"[^"]*"([^"]*")\s*>([^<]+)</a>', html, re.I):
href = match.group(1).replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
if href:
links.append({'url': href, 'title': match.group(3)})
if not links:
logger.info(f"No results found")
return None
logger.info(f"Found {len(links)} candidates, verifying...")
verified = []
for link in sorted(links, key=lambda x: len(x['title'])):
try:
async with httpx.AsyncClient(timeout=15.0) as client:
verify_response = await client.get(link['url'])
if verify_response.status_code == 200:
logger.info(f"Verified: {link['url']}")
verified.append({
'url': link['url'],
'title': link['title'],
'status': 'found'
})
else:
logger.debug(f"Verification failed for {link['url']}")
except Exception:
logger.debug(f"Verification error for {link['url']}")
if verified:
best = verified[0]
logger.info(f"Best candidate: {best['url']}")
return {
'status': 'found',
'message': f"Discovered and verified: {best['url']}",
'website_url': best['url'],
'title': best.get('title'),
}
else:
logger.info(f"No valid websites found")
return {
'status': 'not_found',
'message': 'No valid results found'
}
except Exception as e:
logger.error(f"Search error: {e}")
return None
def update_custodian_file(filepath, website_url, title):
"""Update custodian YAML file with discovered website."""
try:
with open(filepath, 'r', encoding='utf-8') as f:
entry = yaml.safe_load(f)
if not entry:
logger.error(f"Invalid file: {filepath}")
return False
# Add website discovery section
entry['website_discovery'] = {
'website_url': website_url,
'discovery_date': datetime.now(timezone.utc).isoformat(),
'discovery_method': 'crawl4ai_search_and_verify',
'title': title,
'confidence_score': 0.0, # Will be updated if verification succeeds
}
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
logger.info(f"Updated: {filepath}")
return True
except Exception as e:
logger.error(f"Failed to update {filepath}: {e}")
return False
async def main():
files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:1] # Test with 1 file
logger.info(f"Processing {len(files)} custodian files...")
for filepath in files:
name = Path(filepath).stem.replace('_', ' ')
logger.info(f"Processing: {name}")
url = await discover_websites(name, None, 'JP')
if url:
website_url = url.get('website_url') or url.get('url')
title = url.get('title')
if update_custodian_file(filepath, website_url, title):
logger.info(f" → Discovered: {website_url}")
else:
logger.info(f"No website found")
logger.info("Done!")
if __name__ == '__main__':
asyncio.run(main())