#!/usr/bin/env python3
"""
Discover website URLs for custodian YAML files that are missing them.

This script uses web search (via DuckDuckGo or Google) to find official websites
for heritage institutions based on their name and location.

Search strategy:
1. Search for institution name + city + country
2. Search for institution name + "official website"
3. Search for institution name + institution type (museum, library, archive)

Output:
- Updates custodian YAML files with discovered website URLs
- Stores provenance for discovered URLs

Usage:
    python scripts/discover_custodian_websites.py [options]

Options:
    --dry-run       Show what would be discovered without modifying files
    --limit N       Process only first N files (for testing)
    --file PATH     Process a single specific file
    --country CODE  Filter by country code (e.g., JP, CZ)
    --resume        Resume from last checkpoint

Requirements:
    pip install duckduckgo-search pyyaml httpx
"""

import argparse
import asyncio
import json
import logging
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

# The two optional third-party clients are imported defensively so a missing
# package produces an install hint instead of a bare traceback.
try:
    from duckduckgo_search import DDGS
except ImportError:
    print("Please install duckduckgo-search: pip install duckduckgo-search")
    sys.exit(1)

try:
    import httpx
except ImportError:
    print("Please install httpx: pip install httpx")
    sys.exit(1)

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_checkpoint.json"
REQUEST_DELAY = 3.0  # seconds between searches (be nice to search engines)

# Domain blacklist (not actual institution websites)
DOMAIN_BLACKLIST = {
    'wikipedia.org', 'wikidata.org', 'wikimedia.org',
    'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
    'youtube.com', 'tiktok.com', 'pinterest.com',
    'tripadvisor.com', 'tripadvisor.jp', 'yelp.com',
    'google.com', 'google.co.jp', 'maps.google.com',
    'amazon.com', 'amazon.co.jp', 'ebay.com',
    'booking.com', 'expedia.com', 'hotels.com',
    'foursquare.com', 'bing.com', 'yahoo.com',
    'findagrave.com', 'ancestry.com', 'familysearch.org',
    'academia.edu', 'researchgate.net',
    'timeanddate.com', 'weather.com',
}
'tripadvisor.jp', 'yelp.com', 'google.com', 'google.co.jp', 'maps.google.com', 'amazon.com', 'amazon.co.jp', 'ebay.com', 'booking.com', 'expedia.com', 'hotels.com', 'foursquare.com', 'bing.com', 'yahoo.com', 'findagrave.com', 'ancestry.com', 'familysearch.org', 'academia.edu', 'researchgate.net', 'timeanddate.com', 'weather.com', } # Domain preferences (prefer these TLDs for official sites) PREFERRED_TLDS = { 'JP': ['.go.jp', '.lg.jp', '.ac.jp', '.or.jp', '.jp'], 'CZ': ['.cz', '.gov.cz'], 'NL': ['.nl', '.gov.nl'], 'BE': ['.be', '.gov.be'], 'DE': ['.de', '.gov.de'], 'AT': ['.at', '.gv.at'], 'CH': ['.ch', '.admin.ch'], } def get_custodian_name(entry: dict) -> str | None: """Extract institution name from entry.""" # Priority 1: Emic name (native language official name) if entry.get('custodian_name', {}).get('emic_name'): return entry['custodian_name']['emic_name'] # Priority 2: Wikidata native language label (ja, zh, ko, etc.) wikidata = entry.get('wikidata_enrichment', {}) country = get_country_from_entry(entry) # Map country to preferred label language country_lang_map = { 'JP': 'ja', 'CN': 'zh', 'KR': 'ko', 'TW': 'zh', 'TH': 'th', 'VN': 'vi', 'RU': 'ru', 'GR': 'el', 'IL': 'he', 'SA': 'ar', 'IR': 'fa', } if country in country_lang_map: lang = country_lang_map[country] native_label = wikidata.get(f'wikidata_label_{lang}') or wikidata.get('wikidata_labels', {}).get(lang) if native_label: return native_label # Priority 3: Claim value if entry.get('custodian_name', {}).get('claim_value'): return entry['custodian_name']['claim_value'] # Priority 4: Original entry name if entry.get('original_entry', {}).get('name'): return entry['original_entry']['name'] # Priority 5: Organisatie (Dutch) if entry.get('original_entry', {}).get('organisatie'): return entry['original_entry']['organisatie'] return None def get_country_from_entry(entry: dict) -> str | None: """Extract country code from entry.""" # Check location.country if entry.get('location', {}).get('country'): return 
entry['location']['country'] # Check original_entry.locations if entry.get('original_entry', {}).get('locations'): loc = entry['original_entry']['locations'][0] if loc.get('country'): return loc['country'] return None def get_location_info(entry: dict) -> dict: """Extract location information from entry.""" location = {} # Check original_entry.locations if entry.get('original_entry', {}).get('locations'): loc = entry['original_entry']['locations'][0] location['city'] = loc.get('city') location['region'] = loc.get('region') location['country'] = loc.get('country') location['street_address'] = loc.get('street_address') # Check original_entry directly if not location.get('city'): orig = entry.get('original_entry', {}) location['city'] = orig.get('city') or orig.get('plaats') location['country'] = orig.get('country') return location def get_institution_type(entry: dict) -> str | None: """Get institution type for search refinement.""" inst_type = entry.get('original_entry', {}).get('institution_type') if inst_type: type_map = { 'LIBRARY': 'library', 'MUSEUM': 'museum', 'ARCHIVE': 'archive', 'GALLERY': 'gallery', 'RESEARCH_CENTER': 'research center', 'EDUCATION_PROVIDER': 'university', } return type_map.get(inst_type) return None def has_website(entry: dict) -> bool: """Check if entry already has a website.""" # Check various website fields if entry.get('original_entry', {}).get('webadres_organisatie'): return True # Check identifiers for ident in entry.get('original_entry', {}).get('identifiers', []): if ident.get('identifier_scheme') == 'Website': return True # Check enrichment fields if entry.get('website_discovery', {}).get('website_url'): return True if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'): return True if entry.get('google_maps_enrichment', {}).get('website'): return True return False def is_valid_website(url: str, country: str | None = None) -> bool: """Check if URL is a valid institutional website.""" if not url: return False try: 
parsed = urlparse(url) domain = parsed.netloc.lower() # Remove www prefix if domain.startswith('www.'): domain = domain[4:] # Check blacklist for blacklisted in DOMAIN_BLACKLIST: if blacklisted in domain: return False return True except Exception: return False def score_website(url: str, country: str, name: str) -> int: """Score a website URL based on likelihood of being official site.""" score = 0 try: parsed = urlparse(url) domain = parsed.netloc.lower() # Prefer country-specific TLDs preferred = PREFERRED_TLDS.get(country, []) for i, tld in enumerate(preferred): if domain.endswith(tld): score += (len(preferred) - i) * 10 break # Prefer HTTPS if parsed.scheme == 'https': score += 5 # Prefer shorter paths (homepage vs deep link) path_depth = len([p for p in parsed.path.split('/') if p]) score -= path_depth * 2 # Check if institution name words appear in domain name_words = set(re.findall(r'\w+', name.lower())) domain_words = set(re.findall(r'\w+', domain)) common_words = name_words & domain_words score += len(common_words) * 5 except Exception: pass return score def search_for_website(name: str, location: dict, inst_type: str | None = None) -> list[dict]: """Search for institution website using DuckDuckGo.""" results = [] # Build search queries queries = [] city = location.get('city', '') country = location.get('country', '') # Primary query: name + city if city: queries.append(f'"{name}" {city}') # Secondary query: name + country + institution type if inst_type: queries.append(f'"{name}" {country} {inst_type} official') # Tertiary: just the name with "official website" queries.append(f'"{name}" official website') ddgs = DDGS() for query in queries[:2]: # Limit to 2 queries per institution try: search_results = list(ddgs.text(query, max_results=5)) for r in search_results: url = r.get('href') or r.get('url') if url and is_valid_website(url, country): results.append({ 'url': url, 'title': r.get('title', ''), 'snippet': r.get('body', ''), 'query': query, 'score': 
score_website(url, country, name) }) time.sleep(1) # Rate limit between queries except Exception as e: logger.warning(f"Search error for '{query}': {e}") time.sleep(2) # Sort by score and deduplicate seen_domains = set() unique_results = [] for r in sorted(results, key=lambda x: -x['score']): domain = urlparse(r['url']).netloc.lower() if domain not in seen_domains: seen_domains.add(domain) unique_results.append(r) return unique_results[:3] # Return top 3 unique results async def verify_website(url: str) -> dict: """Verify that a website is accessible and get basic info.""" result = { 'accessible': False, 'final_url': url, 'status_code': None, 'title': None, } try: async with httpx.AsyncClient(follow_redirects=True, timeout=15.0) as client: response = await client.get(url) result['accessible'] = response.status_code == 200 result['status_code'] = response.status_code result['final_url'] = str(response.url) # Extract title if result['accessible']: match = re.search(r'