# glam/scripts/discover_websites_crawl4ai.py (snapshot 2025-12-27, 183 lines)
#!/usr/bin/env python3
"""
Simplified Website Discovery for Custodians using DuckDuckGo Instant Answer API.
Discovers websites by searching and updating YAML files.
"""
import logging
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote

import httpx
import yaml
# Logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
REQUEST_DELAY = 1.0 # seconds between requests
# DuckDuckGo Instant Answer API
DDG_API_URL = "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1"
def discover_websites(name, city, country):
    """Search the DuckDuckGo Instant Answer API for a custodian's website.

    Builds a query from *name* (plus *city* when given), collects candidate
    URLs from the Instant Answer response, then verifies each candidate with
    a GET request and returns the first one that answers 200.

    Args:
        name: Custodian name to search for.
        city: Optional city used to narrow the search (may be None).
        country: Country code; currently unused by the search itself, kept
            for interface compatibility with callers.

    Returns:
        dict with keys 'status', 'website_url', 'title' and 'source' for the
        first verified candidate, or None when nothing could be verified.
    """
    logger.info(f"Searching for: {name}")

    # Simple search query
    query = f"{name} {city}" if city else f"{name}"
    # URL-encode: custodian names routinely contain spaces and non-ASCII
    # characters that would otherwise yield a malformed request URL.
    search_url = DDG_API_URL.format(quote(query))

    try:
        with httpx.Client(follow_redirects=True, timeout=30.0) as client:
            response = client.get(search_url)
            # follow_redirects=True resolves 3xx transparently, so only a
            # final 200 counts as success here.
            if response.status_code != 200:
                logger.warning(f"Search failed: {response.status_code}")
                return None
            try:
                data = response.json()
            except Exception as e:
                logger.error(f"Failed to parse JSON: {e}")
                return None

        # Extract URLs from DuckDuckGo results
        results = []

        # The Instant Answer API exposes 'AbstractURL' / 'AbstractText' as
        # top-level *string* fields (not nested under 'Abstract').
        abstract_url = data.get('AbstractURL')
        abstract_text = data.get('AbstractText') or ''
        if abstract_url:
            results.append({
                'url': abstract_url,
                'title': abstract_text[:50],
                'source': 'instant_answer'
            })
        # Also pick up any URLs embedded in the abstract text itself.
        for url in re.findall(r'https?://[^\s<>"\'()]+', abstract_text)[:5]:
            results.append({
                'url': url,
                'title': abstract_text[:50],
                'source': 'instant_answer'
            })

        # 'RelatedTopics' is a *list*: plain topics carry 'FirstURL'
        # directly, grouped ones nest further topics under a 'Topics' key.
        flattened = []
        for topic in data.get('RelatedTopics') or []:
            if 'Topics' in topic:
                flattened.extend(topic['Topics'])
            else:
                flattened.append(topic)
        for topic in flattened[:3]:
            if 'FirstURL' in topic:
                results.append({
                    'url': topic['FirstURL'],
                    'title': topic.get('Text', '')[:50],
                    'source': 'related_topic'
                })

        # Remove duplicates while preserving order
        seen = set()
        unique_results = []
        for r in results:
            if r['url'] not in seen:
                seen.add(r['url'])
                unique_results.append(r)

        if not unique_results:
            logger.info("No results found")
            return None

        logger.info(f"Found {len(unique_results)} candidates, verifying...")

        # Verify candidates: the first URL answering 200 wins.
        for result in unique_results:
            try:
                with httpx.Client(timeout=10.0, follow_redirects=True) as verify_client:
                    verify_response = verify_client.get(result['url'])
                    if verify_response.status_code == 200:
                        logger.info(f"Verified: {result['url']}")
                        return {
                            'status': 'found',
                            'website_url': result['url'],
                            'title': result.get('title', ''),
                            'source': result.get('source', 'search')
                        }
                    logger.debug(
                        f"Verification failed: {result['url']} - {verify_response.status_code}"
                    )
            except Exception as e:
                logger.debug(f"Verification error for {result['url']}: {e}")

        logger.info("No valid websites found")
        return None
    except Exception as e:
        logger.error(f"Search error: {e}")
        return None
def update_custodian_file(filepath, website_url, title, search_query="unknown"):
    """Record a discovered website in a custodian YAML file.

    Loads the YAML entry at *filepath*, attaches a 'website_discovery'
    mapping with the discovered URL and metadata, and writes the file back.

    Args:
        filepath: Path to the custodian YAML file.
        website_url: Verified website URL to record.
        title: Title/snippet associated with the URL.
        search_query: Query that produced the hit; defaults to "unknown"
            for backward compatibility with existing callers.

    Returns:
        True when the file was updated, False on any read/parse/write error.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)
        if not entry:
            logger.error(f"Invalid file: {filepath}")
            return False
        # Add website discovery
        entry['website_discovery'] = {
            'website_url': website_url,
            # Timezone-aware UTC timestamp so records are comparable.
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'duckduckgo_instant_answer',
            'search_query': search_query,
            'confidence_score': 1.0,
            'title': title
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        logger.info(f"Updated: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False
def main():
    """Process up to --limit JP-* custodian files, discovering websites.

    Parses an optional ``--limit N`` command-line flag (default 10), then
    for each matching YAML file searches for the custodian's website and
    writes any verified hit back into the file.
    """
    import argparse

    # argparse replaces the hand-rolled sys.argv scan, which raised
    # IndexError when '--limit' was the last argument and ValueError on a
    # non-numeric value.
    parser = argparse.ArgumentParser(
        description="Discover custodian websites via DuckDuckGo.")
    parser.add_argument('--limit', type=int, default=10,
                        help="maximum number of custodian files to process")
    args = parser.parse_args()

    files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:args.limit]
    logger.info(f"Processing {len(files)} custodian files...")
    for filepath in files:
        # Read custodian YAML to get actual name
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)
        if not entry:
            logger.warning(f"Invalid file: {filepath}")
            continue
        # Extract custodian name for search (fallback to filename stem);
        # glob() already yields Path objects, so .stem is available directly.
        name = entry.get('custodian_name', {}).get('claim_value') or filepath.stem.replace('_', ' ')
        logger.info(f"Processing: {name}")
        result = discover_websites(name, None, 'JP')
        if result and result.get('website_url'):
            if update_custodian_file(filepath, result['website_url'], result.get('title', '')):
                logger.info(f" → Discovered: {result['website_url']}")
        else:
            logger.info("No website found")
        # Honor the configured politeness delay between requests
        # (REQUEST_DELAY was previously declared but never used).
        time.sleep(REQUEST_DELAY)
    logger.info("Done!")
# Script entry point: run discovery only when executed directly, not on import.
if __name__ == '__main__':
    main()