#!/usr/bin/env python3
"""
Discover website URLs for custodian YAML files that are missing them.

This script uses web search (via DuckDuckGo or Google) to find official
websites for heritage institutions based on their name and location.

Search strategy:
1. Search for institution name + city + country
2. Search for institution name + "official website"
3. Search for institution name + institution type (museum, library, archive)

Output:
- Updates custodian YAML files with discovered website URLs
- Stores provenance for discovered URLs

Usage:
    python scripts/discover_custodian_websites.py [options]

Options:
    --dry-run       Show what would be discovered without modifying files
    --limit N       Process only first N files (for testing)
    --file PATH     Process a single specific file
    --country CODE  Filter by country code (e.g., JP, CZ)
    --resume        Resume from last checkpoint

Requirements:
    pip install duckduckgo-search pyyaml httpx
"""

import argparse
import asyncio
import json
import logging
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

try:
    from duckduckgo_search import DDGS
except ImportError:
    print("Please install duckduckgo-search: pip install duckduckgo-search")
    sys.exit(1)

try:
    import httpx
except ImportError:
    print("Please install httpx: pip install httpx")
    sys.exit(1)

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_checkpoint.json"
REQUEST_DELAY = 3.0  # seconds between searches (be nice to search engines)

# Domain blacklist (not actual institution websites)
DOMAIN_BLACKLIST = {
    'wikipedia.org', 'wikidata.org', 'wikimedia.org',
    'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
    'youtube.com', 'tiktok.com', 'pinterest.com',
    'tripadvisor.com', 'tripadvisor.jp', 'yelp.com',
    'google.com', 'google.co.jp', 'maps.google.com',
    'amazon.com', 'amazon.co.jp', 'ebay.com',
    'booking.com', 'expedia.com', 'hotels.com',
    'foursquare.com', 'bing.com', 'yahoo.com',
    'findagrave.com', 'ancestry.com', 'familysearch.org',
    'academia.edu', 'researchgate.net',
    'timeanddate.com', 'weather.com',
}

# Domain preferences (prefer these TLDs for official sites)
PREFERRED_TLDS = {
    'JP': ['.go.jp', '.lg.jp', '.ac.jp', '.or.jp', '.jp'],
    'CZ': ['.cz', '.gov.cz'],
    'NL': ['.nl', '.gov.nl'],
    'BE': ['.be', '.gov.be'],
    'DE': ['.de', '.gov.de'],
    'AT': ['.at', '.gv.at'],
    'CH': ['.ch', '.admin.ch'],
}


def get_custodian_name(entry: dict) -> str | None:
    """Extract the best available institution name from a custodian entry.

    Tries, in priority order: the emic (native-language) name, a Wikidata
    label in the country's primary language, the claim value, the original
    entry name, and finally the Dutch 'organisatie' field.
    """
    # Priority 1: Emic name (native language official name)
    if entry.get('custodian_name', {}).get('emic_name'):
        return entry['custodian_name']['emic_name']

    # Priority 2: Wikidata native language label (ja, zh, ko, etc.)
    wikidata = entry.get('wikidata_enrichment', {})
    country = get_country_from_entry(entry)

    # Map country to preferred label language
    country_lang_map = {
        'JP': 'ja', 'CN': 'zh', 'KR': 'ko', 'TW': 'zh',
        'TH': 'th', 'VN': 'vi', 'RU': 'ru', 'GR': 'el',
        'IL': 'he', 'SA': 'ar', 'IR': 'fa',
    }
    if country in country_lang_map:
        lang = country_lang_map[country]
        # Labels may live in either a flat key or a nested 'wikidata_labels' dict
        native_label = wikidata.get(f'wikidata_label_{lang}') or wikidata.get('wikidata_labels', {}).get(lang)
        if native_label:
            return native_label

    # Priority 3: Claim value
    if entry.get('custodian_name', {}).get('claim_value'):
        return entry['custodian_name']['claim_value']

    # Priority 4: Original entry name
    if entry.get('original_entry', {}).get('name'):
        return entry['original_entry']['name']

    # Priority 5: Organisatie (Dutch)
    if entry.get('original_entry', {}).get('organisatie'):
        return entry['original_entry']['organisatie']

    return None


def get_country_from_entry(entry: dict) -> str | None:
    """Extract a country code from an entry, or None if absent."""
    # Check location.country
    if entry.get('location', {}).get('country'):
        return entry['location']['country']

    # Check original_entry.locations (first location only)
    if entry.get('original_entry', {}).get('locations'):
        loc = entry['original_entry']['locations'][0]
        if loc.get('country'):
            return loc['country']

    return None


def get_location_info(entry: dict) -> dict:
    """Extract city/region/country/street address for search queries.

    NOTE: values may be None if the source record is sparse.
    """
    location = {}

    # Check original_entry.locations (first location only)
    if entry.get('original_entry', {}).get('locations'):
        loc = entry['original_entry']['locations'][0]
        location['city'] = loc.get('city')
        location['region'] = loc.get('region')
        location['country'] = loc.get('country')
        location['street_address'] = loc.get('street_address')

    # Check original_entry directly ('plaats' is the Dutch city field)
    if not location.get('city'):
        orig = entry.get('original_entry', {})
        location['city'] = orig.get('city') or orig.get('plaats')
        location['country'] = orig.get('country')

    return location


def get_institution_type(entry: dict) -> str | None:
    """Map the entry's institution type code to a search keyword, if known."""
    inst_type = entry.get('original_entry', {}).get('institution_type')
    if inst_type:
        type_map = {
            'LIBRARY': 'library',
            'MUSEUM': 'museum',
            'ARCHIVE': 'archive',
            'GALLERY': 'gallery',
            'RESEARCH_CENTER': 'research center',
            'EDUCATION_PROVIDER': 'university',
        }
        return type_map.get(inst_type)
    return None


def has_website(entry: dict) -> bool:
    """Return True if the entry already records a website in any known field."""
    # Check various website fields
    if entry.get('original_entry', {}).get('webadres_organisatie'):
        return True

    # Check identifiers
    for ident in entry.get('original_entry', {}).get('identifiers', []):
        if ident.get('identifier_scheme') == 'Website':
            return True

    # Check enrichment fields (previous discovery runs or external sources)
    if entry.get('website_discovery', {}).get('website_url'):
        return True
    if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'):
        return True
    if entry.get('google_maps_enrichment', {}).get('website'):
        return True

    return False


def is_valid_website(url: str, country: str | None = None) -> bool:
    """Check if URL looks like an institutional website (not a blacklisted host)."""
    if not url:
        return False

    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Remove www prefix
        if domain.startswith('www.'):
            domain = domain[4:]

        # Check blacklist. FIX: use exact-match or subdomain-suffix match
        # instead of a plain substring test, which wrongly rejected unrelated
        # domains that merely contain a blacklisted string
        # (e.g. 'fakegoogle.com' contains 'google.com').
        for blacklisted in DOMAIN_BLACKLIST:
            if domain == blacklisted or domain.endswith('.' + blacklisted):
                return False

        return True
    except Exception:
        return False


def score_website(url: str, country: str, name: str) -> int:
    """Score a website URL by likelihood of being the official site.

    Higher is better. Signals: country-preferred TLD, HTTPS, shallow path
    (homepage rather than deep link), and institution-name words appearing
    in the domain. Can be negative for deep links. Errors score 0.
    """
    score = 0
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Prefer country-specific TLDs (earlier in the list = stronger preference)
        preferred = PREFERRED_TLDS.get(country, [])
        for i, tld in enumerate(preferred):
            if domain.endswith(tld):
                score += (len(preferred) - i) * 10
                break

        # Prefer HTTPS
        if parsed.scheme == 'https':
            score += 5

        # Prefer shorter paths (homepage vs deep link)
        path_depth = len([p for p in parsed.path.split('/') if p])
        score -= path_depth * 2

        # Check if institution name words appear in domain
        name_words = set(re.findall(r'\w+', name.lower()))
        domain_words = set(re.findall(r'\w+', domain))
        common_words = name_words & domain_words
        score += len(common_words) * 5

    except Exception:
        pass

    return score


def search_for_website(name: str, location: dict, inst_type: str | None = None) -> list[dict]:
    """Search DuckDuckGo for an institution's website.

    Runs up to two queries, filters blacklisted hosts, scores candidates,
    and returns the top 3 results deduplicated by domain (best score first).
    """
    results = []

    # Build search queries
    queries = []
    city = location.get('city', '')
    country = location.get('country', '')

    # Primary query: name + city
    if city:
        queries.append(f'"{name}" {city}')

    # Secondary query: name + country + institution type
    if inst_type:
        queries.append(f'"{name}" {country} {inst_type} official')

    # Tertiary: just the name with "official website"
    queries.append(f'"{name}" official website')

    ddgs = DDGS()

    for query in queries[:2]:  # Limit to 2 queries per institution
        try:
            search_results = list(ddgs.text(query, max_results=5))
            for r in search_results:
                # Result key differs between library versions
                url = r.get('href') or r.get('url')
                if url and is_valid_website(url, country):
                    results.append({
                        'url': url,
                        'title': r.get('title', ''),
                        'snippet': r.get('body', ''),
                        'query': query,
                        'score': score_website(url, country, name)
                    })
            time.sleep(1)  # Rate limit between queries
        except Exception as e:
            logger.warning(f"Search error for '{query}': {e}")
            time.sleep(2)

    # Sort by score and deduplicate by domain
    seen_domains = set()
    unique_results = []
    for r in sorted(results, key=lambda x: -x['score']):
        domain = urlparse(r['url']).netloc.lower()
        if domain not in seen_domains:
            seen_domains.add(domain)
            unique_results.append(r)

    return unique_results[:3]  # Return top 3 unique results


async def verify_website(url: str) -> dict:
    """Verify that a website is accessible (HTTP 200) and extract its title.

    Returns a dict with keys: accessible, final_url (after redirects),
    status_code, title. Network errors leave accessible=False.
    """
    result = {
        'accessible': False,
        'final_url': url,
        'status_code': None,
        'title': None,
    }

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=15.0) as client:
            response = await client.get(url)
            result['accessible'] = response.status_code == 200
            result['status_code'] = response.status_code
            result['final_url'] = str(response.url)

            # Extract title.
            # FIX: the pattern was mangled to r']*>([^<]+)' (matched the text
            # after the first '>' in the page); restored the <title> tag match.
            if result['accessible']:
                match = re.search(r'<title[^>]*>([^<]+)</title>', response.text, re.I)
                if match:
                    result['title'] = match.group(1).strip()
    except Exception as e:
        logger.debug(f"Failed to verify {url}: {e}")

    return result


def load_checkpoint() -> dict:
    """Load the progress checkpoint, or a fresh one if none exists."""
    if CHECKPOINT_FILE.exists():
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    return {'processed_files': [], 'found_count': 0, 'not_found_count': 0}


def save_checkpoint(checkpoint: dict):
    """Persist the progress checkpoint to disk."""
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump(checkpoint, f, indent=2)


def update_custodian_file(filepath: Path, website_url: str, discovery_info: dict) -> bool:
    """Write the discovered website (with provenance) into a custodian YAML file.

    Rewrites the file in place, adding/replacing a 'website_discovery'
    section. Returns True on success, False on any failure (logged).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            return False

        # Add website discovery section.
        # FIX: clamp confidence to [0, 1] -- raw scores can be negative
        # (path-depth penalty), which previously produced a negative confidence.
        entry['website_discovery'] = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'duckduckgo_search',
            'search_query': discovery_info.get('query', ''),
            'confidence_score': max(0.0, min(discovery_info.get('score', 0) / 50, 1.0)),  # Normalize to 0-1
            'verification': {
                'accessible': discovery_info.get('verification', {}).get('accessible', False),
                'page_title': discovery_info.get('verification', {}).get('title'),
                'final_url': discovery_info.get('verification', {}).get('final_url'),
            }
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        return True
    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False


async def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single custodian file: search, verify, and (optionally) update.

    Returns a result dict with 'status' in {'skipped', 'empty', 'has_website',
    'no_name', 'not_found', 'found', 'inaccessible', 'error'} plus 'website'
    and 'discovery_info' when found.
    """
    result = {
        'filename': filepath.name,
        'status': 'skipped',
        'website': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            result['status'] = 'empty'
            return result

        # Skip if already has website
        if has_website(entry):
            result['status'] = 'has_website'
            return result

        # Get institution info
        name = get_custodian_name(entry)
        if not name:
            result['status'] = 'no_name'
            return result

        location = get_location_info(entry)
        inst_type = get_institution_type(entry)
        # FIX: fall back to the filename prefix even when the 'country' key
        # exists with a None value (dict.get's default only covers a missing key).
        country = location.get('country') or filepath.name[:2]

        logger.info(f"Searching for: {name} ({location.get('city', 'unknown city')}, {country})")

        # Search for website
        search_results = search_for_website(name, location, inst_type)

        if not search_results:
            result['status'] = 'not_found'
            return result

        # Verify top result
        best = search_results[0]
        verification = await verify_website(best['url'])
        best['verification'] = verification

        if verification['accessible']:
            result['website'] = verification['final_url']
            result['status'] = 'found'
            result['discovery_info'] = best

            if not dry_run:
                update_custodian_file(filepath, verification['final_url'], best)
            logger.info(f"  → Found: {verification['final_url']}")
        else:
            # Try second result if first is inaccessible
            if len(search_results) > 1:
                second = search_results[1]
                verification2 = await verify_website(second['url'])
                if verification2['accessible']:
                    second['verification'] = verification2
                    result['website'] = verification2['final_url']
                    result['status'] = 'found'
                    result['discovery_info'] = second

                    if not dry_run:
                        update_custodian_file(filepath, verification2['final_url'], second)
                    logger.info(f"  → Found (2nd): {verification2['final_url']}")
                else:
                    result['status'] = 'inaccessible'
            else:
                result['status'] = 'inaccessible'

    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
        logger.error(f"Error processing {filepath}: {e}")

    return result


async def main():
    """CLI entry point: select files, run discovery, and track progress."""
    parser = argparse.ArgumentParser(description='Discover websites for custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be discovered')
    parser.add_argument('--limit', type=int, help='Process only first N files')
    parser.add_argument('--file', type=str, help='Process a single specific file')
    parser.add_argument('--country', type=str, help='Filter by country code (e.g., JP, CZ)')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
    args = parser.parse_args()

    # Get files to process
    if args.file:
        files = [Path(args.file)]
    else:
        pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
        files = sorted(CUSTODIAN_DIR.glob(pattern))
        # Filter out non-custodian files (custodian files are 'CC-...' named)
        files = [f for f in files if f.name[0].isupper() and '-' in f.name]

    # Load checkpoint
    checkpoint = load_checkpoint() if args.resume else {'processed_files': [], 'found_count': 0, 'not_found_count': 0}
    processed_set = set(checkpoint['processed_files'])

    if args.resume:
        files = [f for f in files if f.name not in processed_set]
        logger.info(f"Resuming: {len(processed_set)} files already processed, {len(files)} remaining")

    # Apply limit
    if args.limit:
        files = files[:args.limit]

    logger.info(f"Processing {len(files)} custodian files...")

    # Process files
    found_count = checkpoint.get('found_count', 0)
    not_found_count = checkpoint.get('not_found_count', 0)

    for i, filepath in enumerate(files):
        result = await process_file(filepath, args.dry_run)

        # Update counts
        if result['status'] == 'found':
            found_count += 1
        elif result['status'] in ('not_found', 'inaccessible'):
            not_found_count += 1

        # Update checkpoint
        if not args.dry_run:
            checkpoint['processed_files'].append(filepath.name)
            checkpoint['found_count'] = found_count
            checkpoint['not_found_count'] = not_found_count
            if (i + 1) % 10 == 0:
                save_checkpoint(checkpoint)

        # Progress update
        if (i + 1) % 10 == 0:
            logger.info(f"Progress: {i + 1}/{len(files)} - Found: {found_count}, Not found: {not_found_count}")

        # Rate limiting.
        # FIX: use asyncio.sleep (time.sleep blocked the event loop inside an
        # async function), and only delay after statuses that actually hit the
        # search engine -- skipped/empty/has_website/no_name files do no I/O.
        if result['status'] in ('found', 'not_found', 'inaccessible', 'error'):
            await asyncio.sleep(REQUEST_DELAY)

    # Final checkpoint save
    if not args.dry_run:
        save_checkpoint(checkpoint)

    # Summary
    logger.info(f"\n{'='*50}")
    logger.info("Discovery complete!")
    logger.info(f"  Files processed: {len(files)}")
    logger.info(f"  Websites found: {found_count}")
    logger.info(f"  Not found: {not_found_count}")
    logger.info(f"{'='*50}")


if __name__ == '__main__':
    asyncio.run(main())