# glam/scripts/discover_websites_crawl4ai.py (snapshot 2025-12-27, 183 lines)
#!/usr/bin/env python3
"""
Simplified Website Discovery for Custodians using DuckDuckGo Instant Answer API.
Discovers websites by searching and updating YAML files.
"""
import logging
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote

import httpx
import yaml
# Logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
REQUEST_DELAY = 1.0 # seconds between requests
# DuckDuckGo Instant Answer API
DDG_API_URL = "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1"
def discover_websites(name, city, country):
    """Search the DuckDuckGo Instant Answer API for a custodian's website.

    Builds a query from *name* (plus *city* when given), collects candidate
    URLs from the Instant Answer response, then verifies each candidate with
    a GET request and returns the first one that answers 200.

    Args:
        name: Custodian name to search for.
        city: Optional city used to narrow the search (may be None).
        country: Country code; currently unused by the search itself, kept
            for interface compatibility with callers.

    Returns:
        dict with keys 'status', 'website_url', 'title' and 'source' for the
        first verified candidate, or None when nothing could be verified.
    """
    logger.info(f"Searching for: {name}")

    # Simple search query
    query = f"{name} {city}" if city else f"{name}"
    # URL-encode: custodian names routinely contain spaces and non-ASCII
    # characters that would otherwise yield a malformed request URL.
    search_url = DDG_API_URL.format(quote(query))

    try:
        with httpx.Client(follow_redirects=True, timeout=30.0) as client:
            response = client.get(search_url)
            # follow_redirects=True resolves 3xx transparently, so only a
            # final 200 counts as success here.
            if response.status_code != 200:
                logger.warning(f"Search failed: {response.status_code}")
                return None
            try:
                data = response.json()
            except Exception as e:
                logger.error(f"Failed to parse JSON: {e}")
                return None

        # Extract URLs from DuckDuckGo results
        results = []

        # The Instant Answer API exposes 'AbstractURL' / 'AbstractText' as
        # top-level *string* fields (not nested under 'Abstract').
        abstract_url = data.get('AbstractURL')
        abstract_text = data.get('AbstractText') or ''
        if abstract_url:
            results.append({
                'url': abstract_url,
                'title': abstract_text[:50],
                'source': 'instant_answer'
            })
        # Also pick up any URLs embedded in the abstract text itself.
        for url in re.findall(r'https?://[^\s<>"\'()]+', abstract_text)[:5]:
            results.append({
                'url': url,
                'title': abstract_text[:50],
                'source': 'instant_answer'
            })

        # 'RelatedTopics' is a *list*: plain topics carry 'FirstURL'
        # directly, grouped ones nest further topics under a 'Topics' key.
        flattened = []
        for topic in data.get('RelatedTopics') or []:
            if 'Topics' in topic:
                flattened.extend(topic['Topics'])
            else:
                flattened.append(topic)
        for topic in flattened[:3]:
            if 'FirstURL' in topic:
                results.append({
                    'url': topic['FirstURL'],
                    'title': topic.get('Text', '')[:50],
                    'source': 'related_topic'
                })

        # Remove duplicates while preserving order
        seen = set()
        unique_results = []
        for r in results:
            if r['url'] not in seen:
                seen.add(r['url'])
                unique_results.append(r)

        if not unique_results:
            logger.info("No results found")
            return None

        logger.info(f"Found {len(unique_results)} candidates, verifying...")

        # Verify candidates: the first URL answering 200 wins.
        for result in unique_results:
            try:
                with httpx.Client(timeout=10.0, follow_redirects=True) as verify_client:
                    verify_response = verify_client.get(result['url'])
                    if verify_response.status_code == 200:
                        logger.info(f"Verified: {result['url']}")
                        return {
                            'status': 'found',
                            'website_url': result['url'],
                            'title': result.get('title', ''),
                            'source': result.get('source', 'search')
                        }
                    logger.debug(
                        f"Verification failed: {result['url']} - {verify_response.status_code}"
                    )
            except Exception as e:
                logger.debug(f"Verification error for {result['url']}: {e}")

        logger.info("No valid websites found")
        return None
    except Exception as e:
        logger.error(f"Search error: {e}")
        return None
def update_custodian_file(filepath, website_url, title, search_query="unknown"):
    """Record a discovered website in a custodian YAML file.

    Loads the YAML entry at *filepath*, attaches a 'website_discovery'
    mapping with the discovered URL and metadata, and writes the file back.

    Args:
        filepath: Path to the custodian YAML file.
        website_url: Verified website URL to record.
        title: Title/snippet associated with the URL.
        search_query: Query that produced the hit; defaults to "unknown"
            for backward compatibility with existing callers.

    Returns:
        True when the file was updated, False on any read/parse/write error.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)
        if not entry:
            logger.error(f"Invalid file: {filepath}")
            return False
        # Add website discovery
        entry['website_discovery'] = {
            'website_url': website_url,
            # Timezone-aware UTC timestamp so records are comparable.
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'duckduckgo_instant_answer',
            'search_query': search_query,
            'confidence_score': 1.0,
            'title': title
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        logger.info(f"Updated: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False
def main():
    """Process up to --limit JP-* custodian files, discovering websites.

    Parses an optional ``--limit N`` command-line flag (default 10), then
    for each matching YAML file searches for the custodian's website and
    writes any verified hit back into the file.
    """
    import argparse

    # argparse replaces the hand-rolled sys.argv scan, which raised
    # IndexError when '--limit' was the last argument and ValueError on a
    # non-numeric value.
    parser = argparse.ArgumentParser(
        description="Discover custodian websites via DuckDuckGo.")
    parser.add_argument('--limit', type=int, default=10,
                        help="maximum number of custodian files to process")
    args = parser.parse_args()

    files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:args.limit]
    logger.info(f"Processing {len(files)} custodian files...")
    for filepath in files:
        # Read custodian YAML to get actual name
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)
        if not entry:
            logger.warning(f"Invalid file: {filepath}")
            continue
        # Extract custodian name for search (fallback to filename stem);
        # glob() already yields Path objects, so .stem is available directly.
        name = entry.get('custodian_name', {}).get('claim_value') or filepath.stem.replace('_', ' ')
        logger.info(f"Processing: {name}")
        result = discover_websites(name, None, 'JP')
        if result and result.get('website_url'):
            if update_custodian_file(filepath, result['website_url'], result.get('title', '')):
                logger.info(f" → Discovered: {result['website_url']}")
        else:
            logger.info("No website found")
        # Honor the configured politeness delay between requests
        # (REQUEST_DELAY was previously declared but never used).
        time.sleep(REQUEST_DELAY)
    logger.info("Done!")
# Script entry point: run discovery only when executed directly, not on import.
if __name__ == '__main__':
    main()