#!/usr/bin/env python3
"""
Simplified Website Discovery for Custodians using DuckDuckGo Instant Answer API.

Discovers websites by searching and updating YAML files.
"""
|
|
import logging
import re
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote_plus

import httpx
import yaml
|
|
|
|
# Logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration
|
|
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
|
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
|
|
REQUEST_DELAY = 1.0 # seconds between requests
|
|
|
|
# DuckDuckGo Instant Answer API
|
|
DDG_API_URL = "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1"
|
|
|
|
|
|
def discover_websites(name, city, country):
|
|
"""Search DuckDuckGo Instant Answer API and verify websites."""
|
|
logger.info(f"Searching for: {name}")
|
|
|
|
# Simple search query
|
|
query = f"{name} {city}" if city else f"{name}"
|
|
search_url = DDG_API_URL.format(query)
|
|
|
|
try:
|
|
with httpx.Client(follow_redirects=True, timeout=30.0) as client:
|
|
response = client.get(search_url)
|
|
|
|
if response.status_code not in (200, 201, 202, 301, 302):
|
|
logger.warning(f"Search failed: {response.status_code}")
|
|
return None
|
|
|
|
try:
|
|
data = response.json()
|
|
except Exception as e:
|
|
logger.error(f"Failed to parse JSON: {e}")
|
|
return None
|
|
|
|
# Extract URLs from DuckDuckGo results
|
|
results = []
|
|
|
|
# Extract URLs from DuckDuckGo results
|
|
results = []
|
|
|
|
# Check for instant answer with URL
|
|
if 'Abstract' in data and 'AbstractText' in data['Abstract']:
|
|
abstract = data['Abstract']['AbstractText']
|
|
urls = re.findall(r'https?://[^\s<>"\'()]+', abstract)
|
|
for url in urls[:5]: # Take first 5 URLs
|
|
results.append({
|
|
'url': url,
|
|
'title': abstract[:50],
|
|
'source': 'instant_answer'
|
|
})
|
|
|
|
# Check for related topics (but skip if empty list)
|
|
if 'RelatedTopics' in data and isinstance(data['RelatedTopics'], dict):
|
|
# Related topics may contain URLs
|
|
for topic in data['RelatedTopics'].get('Topics', [])[:3]:
|
|
if 'FirstURL' in topic:
|
|
results.append({
|
|
'url': topic['FirstURL'],
|
|
'title': topic.get('Text', '')[:50],
|
|
'source': 'related_topic'
|
|
})
|
|
|
|
# Remove duplicates while preserving order
|
|
seen = set()
|
|
unique_results = []
|
|
for r in results:
|
|
if r['url'] not in seen:
|
|
seen.add(r['url'])
|
|
unique_results.append(r)
|
|
|
|
if not unique_results:
|
|
logger.info(f"No results found")
|
|
return None
|
|
|
|
logger.info(f"Found {len(unique_results)} candidates, verifying...")
|
|
|
|
# Verify candidates
|
|
for result in unique_results:
|
|
try:
|
|
with httpx.Client(timeout=10.0, follow_redirects=True) as verify_client:
|
|
verify_response = verify_client.get(result['url'])
|
|
|
|
if verify_response.status_code == 200:
|
|
logger.info(f"Verified: {result['url']}")
|
|
return {
|
|
'status': 'found',
|
|
'website_url': result['url'],
|
|
'title': result.get('title', ''),
|
|
'source': result.get('source', 'search')
|
|
}
|
|
else:
|
|
logger.debug(f"Verification failed: {result['url']} - {verify_response.status_code}")
|
|
|
|
except Exception as e:
|
|
logger.debug(f"Verification error for {result['url']}: {e}")
|
|
|
|
logger.info(f"No valid websites found")
|
|
return None
|
|
|
|
except Exception as e:
|
|
logger.error(f"Search error: {e}")
|
|
return None
|
|
|
|
|
|
def update_custodian_file(filepath, website_url, title):
|
|
"""Update custodian YAML file with discovered website."""
|
|
try:
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
entry = yaml.safe_load(f)
|
|
if not entry:
|
|
logger.error(f"Invalid file: {filepath}")
|
|
return False
|
|
|
|
# Add website discovery
|
|
entry['website_discovery'] = {
|
|
'website_url': website_url,
|
|
'discovery_date': datetime.now(timezone.utc).isoformat(),
|
|
'discovery_method': 'duckduckgo_instant_answer',
|
|
'search_query': "unknown",
|
|
'confidence_score': 1.0,
|
|
'title': title
|
|
}
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
|
|
|
logger.info(f"Updated: {filepath}")
|
|
return True
|
|
except Exception as e:
|
|
logger.error(f"Failed to update {filepath}: {e}")
|
|
return False
|
|
|
|
|
|
def main():
|
|
import sys
|
|
limit = int(sys.argv[sys.argv.index('--limit') + 1]) if '--limit' in sys.argv else 10
|
|
|
|
files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:limit]
|
|
|
|
logger.info(f"Processing {len(files)} custodian files...")
|
|
|
|
for filepath in files:
|
|
# Read custodian YAML to get actual name
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
entry = yaml.safe_load(f)
|
|
if not entry:
|
|
logger.warning(f"Invalid file: {filepath}")
|
|
continue
|
|
|
|
# Extract custodian name for search (fallback to filename stem)
|
|
name = entry.get('custodian_name', {}).get('claim_value') or Path(filepath).stem.replace('_', ' ')
|
|
|
|
logger.info(f"Processing: {name}")
|
|
|
|
result = discover_websites(name, None, 'JP')
|
|
|
|
if result and result.get('website_url'):
|
|
if update_custodian_file(filepath, result['website_url'], result.get('title', '')):
|
|
logger.info(f" → Discovered: {result['website_url']}")
|
|
else:
|
|
logger.info(f"No website found")
|
|
|
|
logger.info("Done!")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|