#!/usr/bin/env python3
"""
Simplified Website Discovery for Custodians using DuckDuckGo Instant Answer API.

Discovers websites by searching and updating YAML files.
"""
import logging
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote

import httpx
import yaml

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
REQUEST_DELAY = 1.0  # seconds between requests

# DuckDuckGo Instant Answer API.
# NOTE: the {} placeholder must receive a percent-encoded query string.
DDG_API_URL = "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1"

# Compiled once at module level instead of per call.
_URL_PATTERN = re.compile(r'https?://[^\s<>"\'()]+')


def _extract_candidates(data):
    """Pull candidate website URLs out of a DuckDuckGo Instant Answer payload.

    Args:
        data: Decoded JSON dict from the Instant Answer API.

    Returns:
        List of dicts with 'url', 'title', 'source' keys (may contain
        duplicates; caller deduplicates).
    """
    results = []

    # BUG FIX: 'AbstractText' and 'AbstractURL' are TOP-LEVEL string fields
    # in the Instant Answer response; 'Abstract' is itself a string, so the
    # old check `'AbstractText' in data['Abstract']` was a substring test
    # that never selected the intended field.
    abstract = data.get('AbstractText') or ''
    if abstract:
        for url in _URL_PATTERN.findall(abstract)[:5]:  # take first 5 URLs
            results.append({
                'url': url,
                'title': abstract[:50],
                'source': 'instant_answer'
            })
    # AbstractURL is the canonical source link for the instant answer.
    if data.get('AbstractURL'):
        results.append({
            'url': data['AbstractURL'],
            'title': abstract[:50],
            'source': 'instant_answer'
        })

    # BUG FIX: 'RelatedTopics' is a LIST of topic dicts, optionally with
    # nested category groups carrying their own 'Topics' list — it is never
    # a dict, so the old `isinstance(..., dict)` branch was dead code.
    related = data.get('RelatedTopics') or []
    flat_topics = []
    for item in related:
        if not isinstance(item, dict):
            continue
        if 'Topics' in item:  # category group — flatten its children
            flat_topics.extend(t for t in item['Topics'] if isinstance(t, dict))
        else:
            flat_topics.append(item)
    for topic in flat_topics[:3]:
        if 'FirstURL' in topic:
            results.append({
                'url': topic['FirstURL'],
                'title': topic.get('Text', '')[:50],
                'source': 'related_topic'
            })

    return results


def discover_websites(name, city, country):
    """Search DuckDuckGo Instant Answer API and verify websites.

    Args:
        name: Custodian name to search for.
        city: Optional city appended to the query for disambiguation.
        country: Country code; currently unused in the query but kept for
            caller compatibility.

    Returns:
        Dict with 'status', 'website_url', 'title', 'source', 'search_query'
        for the first candidate that answers HTTP 200, or None if the search
        fails or no candidate verifies.
    """
    logger.info(f"Searching for: {name}")

    # Simple search query
    query = f"{name} {city}" if city else f"{name}"
    # BUG FIX: percent-encode the query; raw spaces / non-ASCII characters
    # previously produced a malformed request URL.
    search_url = DDG_API_URL.format(quote(query))

    try:
        with httpx.Client(follow_redirects=True, timeout=30.0) as client:
            response = client.get(search_url)

            if response.status_code not in (200, 201, 202, 301, 302):
                logger.warning(f"Search failed: {response.status_code}")
                return None

            try:
                data = response.json()
            except Exception as e:
                logger.error(f"Failed to parse JSON: {e}")
                return None

        # Extract URLs from DuckDuckGo results (was duplicated inline here;
        # now a single helper call).
        results = _extract_candidates(data)

        # Remove duplicates while preserving order
        seen = set()
        unique_results = []
        for r in results:
            if r['url'] not in seen:
                seen.add(r['url'])
                unique_results.append(r)

        if not unique_results:
            logger.info("No results found")
            return None

        logger.info(f"Found {len(unique_results)} candidates, verifying...")

        # Verify candidates: first URL that answers 200 wins.
        for result in unique_results:
            try:
                with httpx.Client(timeout=10.0, follow_redirects=True) as verify_client:
                    verify_response = verify_client.get(result['url'])
                    if verify_response.status_code == 200:
                        logger.info(f"Verified: {result['url']}")
                        return {
                            'status': 'found',
                            'website_url': result['url'],
                            'title': result.get('title', ''),
                            'source': result.get('source', 'search'),
                            # Expose the actual query so callers can persist
                            # it instead of the old hard-coded "unknown".
                            'search_query': query
                        }
                    else:
                        logger.debug(f"Verification failed: {result['url']} - {verify_response.status_code}")
            except Exception as e:
                logger.debug(f"Verification error for {result['url']}: {e}")

        logger.info("No valid websites found")
        return None

    except Exception as e:
        logger.error(f"Search error: {e}")
        return None


def update_custodian_file(filepath, website_url, title, search_query="unknown"):
    """Update custodian YAML file with discovered website.

    Args:
        filepath: Path to the custodian YAML file.
        website_url: Verified website URL to record.
        title: Title/snippet associated with the URL.
        search_query: Query string that produced the hit (new, defaulted
            keyword — backward compatible with existing callers).

    Returns:
        True on success, False on any read/parse/write failure.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            logger.error(f"Invalid file: {filepath}")
            return False

        # Add website discovery
        entry['website_discovery'] = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'duckduckgo_instant_answer',
            'search_query': search_query,
            'confidence_score': 1.0,
            'title': title
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True, default_flow_style=False,
                      sort_keys=False)

        logger.info(f"Updated: {filepath}")
        return True

    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False


def main():
    """Process JP-* custodian files, discovering and recording websites."""
    # Optional --limit N on the command line; defaults to 10 files.
    limit = int(sys.argv[sys.argv.index('--limit') + 1]) if '--limit' in sys.argv else 10

    files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:limit]
    logger.info(f"Processing {len(files)} custodian files...")

    for i, filepath in enumerate(files):
        # BUG FIX: REQUEST_DELAY was defined but never used — rate-limit
        # between searches (skip the delay before the first file).
        if i:
            time.sleep(REQUEST_DELAY)

        # Read custodian YAML to get actual name
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            logger.warning(f"Invalid file: {filepath}")
            continue

        # Extract custodian name for search (fallback to filename stem).
        # ROBUSTNESS: guard against custodian_name being a plain string —
        # the old chained .get() raised AttributeError in that case.
        name_claim = entry.get('custodian_name')
        name = None
        if isinstance(name_claim, dict):
            name = name_claim.get('claim_value')
        if not name:
            name = Path(filepath).stem.replace('_', ' ')

        logger.info(f"Processing: {name}")
        result = discover_websites(name, None, 'JP')

        if result and result.get('website_url'):
            if update_custodian_file(filepath, result['website_url'],
                                     result.get('title', ''),
                                     search_query=result.get('search_query', 'unknown')):
                logger.info(f" → Discovered: {result['website_url']}")
        else:
            logger.info("No website found")

    logger.info("Done!")


if __name__ == '__main__':
    main()