#!/usr/bin/env python3
"""
Enrich custodian YAML files with logo images using Playwright.

This script extracts logo URLs from heritage institution websites with proper
provenance, following AGENTS.md Rule 6 (WebObservation Claims MUST Have XPath
Provenance).

Logo extraction looks for:
1. <link rel="icon"> / <link rel="apple-touch-icon"> (favicon/icon)
2. <meta property="og:image"> (Open Graph image)
3. <img> elements with logo/brand in class/id/alt attributes
4. SVG elements with logo class/id

Output format follows WebClaim schema with:
- claim_type: logo_url, favicon_url, og_image_url
- claim_value: The extracted image URL
- source_url: Website where logo was found
- css_selector: CSS selector to the element (for verification)
- retrieved_on: ISO 8601 timestamp

Usage:
    python scripts/enrich_custodian_logos_playwright.py [options]

Options:
    --dry-run       Show what would be enriched without modifying files
    --limit N       Process only first N files (for testing)
    --file PATH     Process a single specific file
    --country CODE  Filter by country code (e.g., NL, BE, DE)
    --resume        Resume from last checkpoint

Requirements:
    pip install playwright pyyaml
    playwright install chromium
"""

import argparse
import asyncio
import json
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin

import yaml

try:
    from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
except ImportError:
    print("Please install playwright: pip install playwright && playwright install chromium")
    sys.exit(1)

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".logo_enrichment_playwright_checkpoint.json"
REQUEST_DELAY = 2.0   # seconds between requests (politeness rate limit)
PAGE_TIMEOUT = 30000  # page navigation timeout, milliseconds (30 seconds)

# JavaScript executed in the page context to extract logo information.
# Returns a plain object so page.evaluate() can serialize it back to Python.
LOGO_EXTRACTION_JS = """
() => {
    const results = { favicons: [], ogImages: [], logos: [], svgLogos: [], primaryLogo: null };

    // Helper to generate a CSS selector for an element
    function getCssSelector(el) {
        if (el.id) return '#' + el.id;
        let path = [];
        while (el && el.nodeType === Node.ELEMENT_NODE) {
            let selector = el.nodeName.toLowerCase();
            if (el.id) {
                selector = '#' + el.id;
                path.unshift(selector);
                break;
            } else if (el.className && typeof el.className === 'string') {
                const classes = el.className.trim().split(/\\s+/).filter(c => c).slice(0, 2);
                if (classes.length > 0) {
                    selector += '.' + classes.join('.');
                }
            }
            // Add index if needed
            let sibling = el;
            let nth = 1;
            while (sibling = sibling.previousElementSibling) {
                if (sibling.nodeName.toLowerCase() === el.nodeName.toLowerCase()) nth++;
            }
            if (nth > 1) selector += ':nth-of-type(' + nth + ')';
            path.unshift(selector);
            el = el.parentNode;
        }
        return path.join(' > ');
    }

    // Get favicons from link elements
    document.querySelectorAll('link[rel*="icon"]').forEach(el => {
        if (el.href) {
            results.favicons.push({
                href: el.href,
                rel: el.rel,
                type: el.type || '',
                sizes: el.sizes?.value || '',
                selector: getCssSelector(el)
            });
        }
    });

    // Get apple-touch-icons
    document.querySelectorAll('link[rel*="apple-touch"]').forEach(el => {
        if (el.href) {
            results.favicons.push({
                href: el.href,
                rel: el.rel,
                type: el.type || '',
                sizes: el.sizes?.value || '',
                selector: getCssSelector(el)
            });
        }
    });

    // Get og:image
    const ogImage = document.querySelector('meta[property="og:image"]');
    if (ogImage && ogImage.content) {
        results.ogImages.push({ content: ogImage.content, selector: getCssSelector(ogImage) });
    }

    // Also check twitter:image
    const twitterImage = document.querySelector('meta[name="twitter:image"]');
    if (twitterImage && twitterImage.content) {
        results.ogImages.push({ content: twitterImage.content, selector: getCssSelector(twitterImage) });
    }

    // Logo detection patterns
    const logoPatterns = /logo|brand|site-icon|masthead|emblem/i;
    const excludePatterns = /sponsor|partner|social|facebook|twitter|instagram|linkedin|youtube|tiktok|footer-logo|cookie/i;

    // Get images with logo indicators (prioritize header/nav)
    const headerNav = document.querySelector('header, nav, [role="banner"]');

    // First check header/nav for primary logo
    if (headerNav) {
        headerNav.querySelectorAll('img').forEach(el => {
            const attrs = `${el.className || ''} ${el.id || ''} ${el.alt || ''} ${el.src || ''}`.toLowerCase();
            if (logoPatterns.test(attrs) && !excludePatterns.test(attrs) && el.src) {
                if (!results.primaryLogo) {
                    results.primaryLogo = {
                        src: el.src,
                        alt: el.alt || '',
                        class: el.className || '',
                        id: el.id || '',
                        selector: getCssSelector(el),
                        location: 'header'
                    };
                }
            }
        });

        // Check for SVG logos in header
        headerNav.querySelectorAll('svg').forEach(el => {
            const attrs = `${el.className?.baseVal || ''} ${el.id || ''}`.toLowerCase();
            if (logoPatterns.test(attrs) && !excludePatterns.test(attrs)) {
                if (!results.primaryLogo) {
                    results.primaryLogo = {
                        src: '[inline-svg]',
                        alt: el.getAttribute('aria-label') || '',
                        class: el.className?.baseVal || '',
                        id: el.id || '',
                        selector: getCssSelector(el),
                        location: 'header',
                        isInlineSvg: true
                    };
                }
                results.svgLogos.push({
                    class: el.className?.baseVal || '',
                    id: el.id || '',
                    selector: getCssSelector(el),
                    ariaLabel: el.getAttribute('aria-label') || ''
                });
            }
        });
    }

    // Then check rest of page for additional logos
    document.querySelectorAll('img').forEach(el => {
        const attrs = `${el.className || ''} ${el.id || ''} ${el.alt || ''} ${el.src || ''}`.toLowerCase();
        if (logoPatterns.test(attrs) && el.src && !excludePatterns.test(attrs)) {
            results.logos.push({
                src: el.src,
                alt: el.alt || '',
                class: el.className || '',
                id: el.id || '',
                selector: getCssSelector(el)
            });
        }
    });

    // Deduplicate logos by src
    const seenSrcs = new Set();
    results.logos = results.logos.filter(l => {
        if (seenSrcs.has(l.src)) return false;
        seenSrcs.add(l.src);
        return true;
    });

    // Deduplicate favicons by href
    const seenHrefs = new Set();
    results.favicons = results.favicons.filter(f => {
        if (seenHrefs.has(f.href)) return false;
        seenHrefs.add(f.href);
        return true;
    });

    return results;
}
"""


def get_website_url(entry: dict) -> str | None:
    """Extract the best website URL from a custodian entry.

    Walks the enrichment sections in fixed priority order and returns the
    first non-empty URL, normalized to include a scheme. Returns None when
    no section provides a usable URL.
    """
    # Priority 1: Original entry webadres
    if entry.get('original_entry', {}).get('webadres_organisatie'):
        url = entry['original_entry']['webadres_organisatie']
        # Source data sometimes contains literal "null"/"none" placeholders.
        if url and url.strip() and url.strip().lower() not in ('null', 'none', ''):
            return normalize_url(url.strip())

    # Priority 2: Museum register website
    if entry.get('museum_register_enrichment', {}).get('website_url'):
        url = entry['museum_register_enrichment']['website_url']
        if url and url.strip():
            return normalize_url(url.strip())

    # Priority 3: Wikidata official website
    if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'):
        url = entry['wikidata_enrichment']['wikidata_official_website']
        if url and url.strip():
            return normalize_url(url.strip())

    # Priority 4: Google Maps website
    if entry.get('google_maps_enrichment', {}).get('website'):
        url = entry['google_maps_enrichment']['website']
        if url and url.strip():
            return normalize_url(url.strip())

    # Priority 5: Web enrichment source URL
    if entry.get('web_enrichment', {}).get('source_url'):
        url = entry['web_enrichment']['source_url']
        if url and url.strip():
            return normalize_url(url.strip())

    return None


def normalize_url(url: str) -> str:
    """Normalize a URL: ensure an https:// scheme and strip trailing slashes."""
    if not url:
        return url
    url = url.strip()
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    url = url.rstrip('/')
    return url


def get_custodian_name(entry: dict) -> str:
    """Get a human-readable display name for a custodian entry.

    Falls back through emic name -> original organisation name ->
    museum-register name -> "Unknown".
    """
    if entry.get('custodian_name', {}).get('emic_name'):
        return entry['custodian_name']['emic_name']
    if entry.get('original_entry', {}).get('organisatie'):
        return entry['original_entry']['organisatie']
    if entry.get('museum_register_enrichment', {}).get('museum_name'):
        return entry['museum_register_enrichment']['museum_name']
    return "Unknown"


def load_checkpoint() -> dict:
    """Load the progress checkpoint, or a fresh one if none exists."""
    if CHECKPOINT_FILE.exists():
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    return {'processed_files': [], 'last_index': 0}


def save_checkpoint(checkpoint: dict):
    """Persist the progress checkpoint to disk."""
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump(checkpoint, f, indent=2)


def _icon_size(sizes: str) -> int:
    """Parse a <link sizes> attribute value into a comparable pixel size.

    Handles "32x32", multi-size lists like "16x16 32x32" (largest wins),
    and non-numeric values like "any" (returns 0 instead of raising).

    FIX: the previous inline parse did int(sizes.split('x')[0]), which raised
    ValueError on multi-token values such as "any 32x32".
    """
    best = 0
    for token in sizes.split():
        width = token.lower().partition('x')[0]
        if width.isdigit():
            best = max(best, int(width))
    return best


def select_best_favicon(favicons: list[dict]) -> dict | None:
    """Select the best favicon from available options.

    Priority: SVG > largest apple-touch-icon > largest PNG icon > ICO >
    first available. Returns None for an empty list.
    """
    if not favicons:
        return None

    # Priority: SVG > largest PNG > ICO
    svg_favicons = [f for f in favicons
                    if f['href'].endswith('.svg') or f['type'] == 'image/svg+xml']
    if svg_favicons:
        return svg_favicons[0]

    # Look for apple-touch-icon (high res)
    apple_icons = [f for f in favicons if 'apple-touch' in f['rel']]
    if apple_icons:
        # Sort by size if available
        sized = [f for f in apple_icons if f.get('sizes')]
        if sized:
            sized.sort(key=lambda x: _icon_size(x['sizes']), reverse=True)
            return sized[0]
        return apple_icons[0]

    # Look for standard icon
    icons = [f for f in favicons if f['rel'] == 'icon']
    if icons:
        # Prefer PNG over ICO
        png_icons = [i for i in icons if '.png' in i['href']]
        if png_icons:
            sized = [f for f in png_icons if f.get('sizes')]
            if sized:
                sized.sort(key=lambda x: _icon_size(x['sizes']), reverse=True)
                return sized[0]
            return png_icons[0]
        return icons[0]

    # Return first available
    return favicons[0] if favicons else None


def build_logo_claims(logo_data: dict, source_url: str, timestamp: str) -> list[dict]:
    """Build WebClaim-compatible claims from extracted logo data.

    Emits up to three claims: the primary header logo (image or inline SVG),
    the best favicon, and the first Open Graph / Twitter image.
    """
    claims = []

    # Primary logo (highest priority)
    if logo_data.get('primaryLogo'):
        primary = logo_data['primaryLogo']
        if primary.get('isInlineSvg'):
            claims.append({
                'claim_type': 'logo_url',
                'claim_value': '[inline-svg]',
                'source_url': source_url,
                'css_selector': primary.get('selector', ''),
                'retrieved_on': timestamp,
                'extraction_method': 'playwright_svg_detection',
                'detection_confidence': 'high',
                'is_inline_svg': True,
                'aria_label': primary.get('alt', ''),
            })
        elif primary.get('src'):
            claims.append({
                'claim_type': 'logo_url',
                'claim_value': primary['src'],
                'source_url': source_url,
                'css_selector': primary.get('selector', ''),
                'retrieved_on': timestamp,
                'extraction_method': 'playwright_header_logo',
                'detection_confidence': 'high',
                'alt_text': primary.get('alt', ''),
            })

    # Best favicon
    best_favicon = select_best_favicon(logo_data.get('favicons', []))
    if best_favicon:
        claims.append({
            'claim_type': 'favicon_url',
            'claim_value': best_favicon['href'],
            'source_url': source_url,
            'css_selector': best_favicon.get('selector', ''),
            'retrieved_on': timestamp,
            'extraction_method': 'playwright_link_rel',
            'favicon_type': best_favicon.get('type', ''),
            'favicon_sizes': best_favicon.get('sizes', ''),
        })

    # OG Image
    if logo_data.get('ogImages'):
        og = logo_data['ogImages'][0]
        claims.append({
            'claim_type': 'og_image_url',
            'claim_value': og['content'],
            'source_url': source_url,
            'css_selector': og.get('selector', ''),
            'retrieved_on': timestamp,
            'extraction_method': 'playwright_meta_og',
        })

    return claims


async def extract_logos_from_url(page, url: str) -> dict | None:
    """Navigate to *url* and extract logo information using Playwright.

    Returns the deserialized result of LOGO_EXTRACTION_JS, or None on
    navigation failure / HTTP error / timeout.
    """
    try:
        # Navigate to the page
        response = await page.goto(url, wait_until='domcontentloaded', timeout=PAGE_TIMEOUT)
        if not response or response.status >= 400:
            logger.warning(f"Failed to load {url}: HTTP {response.status if response else 'no response'}")
            return None

        # Wait a bit for JS to execute
        await page.wait_for_timeout(1500)

        # Try to dismiss cookie banners (common patterns)
        cookie_selectors = [
            'button:has-text("Accept")',
            'button:has-text("Accepteren")',
            'button:has-text("Akzeptieren")',
            'button:has-text("Accepter")',
            'button:has-text("OK")',
            '[id*="cookie"] button',
            '[class*="cookie"] button',
            '.consent-banner button',
        ]
        for selector in cookie_selectors:
            try:
                button = page.locator(selector).first
                if await button.is_visible(timeout=500):
                    await button.click(timeout=1000)
                    await page.wait_for_timeout(500)
                    break
            except Exception:
                # Best-effort: a missing/covered banner button is not an error.
                continue

        # Extract logo information
        logo_data = await page.evaluate(LOGO_EXTRACTION_JS)
        return logo_data

    except PlaywrightTimeout:
        logger.warning(f"Timeout loading {url}")
        return None
    except Exception as e:
        logger.error(f"Error extracting logos from {url}: {e}")
        return None


async def enrich_custodian_with_logos(
    filepath: Path,
    page,
    dry_run: bool = False
) -> dict:
    """
    Enrich a single custodian file with logo data.

    Returns dict with:
    - success: bool
    - logos_found: int
    - message: str
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            return {'success': False, 'logos_found': 0, 'message': 'Empty file'}

        # Check if already has logo enrichment (idempotent re-runs)
        if entry.get('logo_enrichment', {}).get('claims'):
            return {
                'success': True,
                'logos_found': len(entry['logo_enrichment']['claims']),
                'message': 'Already enriched (skipped)'
            }

        # Get website URL
        website_url = get_website_url(entry)
        if not website_url:
            return {'success': False, 'logos_found': 0, 'message': 'No website URL'}

        custodian_name = get_custodian_name(entry)
        logger.info(f"Processing: {custodian_name} ({website_url})")

        # Extract logos
        logo_data = await extract_logos_from_url(page, website_url)
        if not logo_data:
            return {'success': False, 'logos_found': 0, 'message': 'Failed to extract logos'}

        # Build claims
        timestamp = datetime.now(timezone.utc).isoformat()
        claims = build_logo_claims(logo_data, website_url, timestamp)

        if not claims:
            return {'success': True, 'logos_found': 0, 'message': 'No logos found'}

        # Prepare enrichment data
        logo_enrichment = {
            'enrichment_timestamp': timestamp,
            'source_url': website_url,
            'extraction_method': 'playwright_browser',
            'claims': claims,
            'summary': {
                'total_claims': len(claims),
                'has_primary_logo': logo_data.get('primaryLogo') is not None,
                'has_favicon': any(c['claim_type'] == 'favicon_url' for c in claims),
                'has_og_image': any(c['claim_type'] == 'og_image_url' for c in claims),
                'favicon_count': len(logo_data.get('favicons', [])),
            }
        }

        if dry_run:
            logger.info(f"  [DRY RUN] Would add {len(claims)} logo claims")
            for claim in claims:
                logger.info(f"    - {claim['claim_type']}: {claim['claim_value'][:80]}...")
            return {'success': True, 'logos_found': len(claims), 'message': 'Dry run'}

        # Update entry
        entry['logo_enrichment'] = logo_enrichment

        # Save updated entry
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        return {
            'success': True,
            'logos_found': len(claims),
            'message': f'Added {len(claims)} logo claims'
        }

    except Exception as e:
        logger.error(f"Error processing {filepath}: {e}")
        return {'success': False, 'logos_found': 0, 'message': str(e)}


async def main():
    parser = argparse.ArgumentParser(description='Enrich custodian files with logo data using Playwright')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, default=0, help='Process only N files')
    parser.add_argument('--file', type=str, help='Process a single file')
    parser.add_argument('--country', type=str, help='Filter by country code')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
    # NOTE: --headless is accepted for CLI compatibility but headless mode is
    # actually decided by the absence of --visible (see below).
    parser.add_argument('--headless', action='store_true', default=True,
                        help='Run browser headless (default)')
    parser.add_argument('--visible', action='store_true', help='Show browser window')

    args = parser.parse_args()
    headless = not args.visible

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        context = await browser.new_context(
            viewport={'width': 1280, 'height': 720},
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = await context.new_page()

        try:
            # Single file mode
            if args.file:
                filepath = Path(args.file)
                if not filepath.exists():
                    logger.error(f"File not found: {filepath}")
                    sys.exit(1)
                result = await enrich_custodian_with_logos(filepath, page, args.dry_run)
                logger.info(f"Result: {result['message']} ({result['logos_found']} logos)")
                return

            # Batch mode
            checkpoint = load_checkpoint() if args.resume else {'processed_files': [], 'last_index': 0}

            # Get all custodian files
            files = sorted(CUSTODIAN_DIR.glob('*.yaml'))

            # Apply country filter
            if args.country:
                files = [f for f in files if f.name.startswith(f"{args.country}-")]

            # Apply limit
            if args.limit > 0:
                files = files[:args.limit]

            # Skip already processed
            if args.resume:
                files = [f for f in files if f.name not in checkpoint['processed_files']]

            logger.info(f"Processing {len(files)} custodian files...")

            stats = {
                'processed': 0,
                'success': 0,
                'failed': 0,
                'skipped': 0,
                'logos_found': 0,
            }

            for i, filepath in enumerate(files):
                try:
                    result = await enrich_custodian_with_logos(filepath, page, args.dry_run)
                    stats['processed'] += 1

                    if result['success']:
                        if 'skipped' in result['message'].lower():
                            stats['skipped'] += 1
                        else:
                            stats['success'] += 1
                            stats['logos_found'] += result['logos_found']
                    else:
                        stats['failed'] += 1

                    # Update checkpoint
                    checkpoint['processed_files'].append(filepath.name)
                    checkpoint['last_index'] = i

                    # Persist periodically so a crash loses at most 10 files
                    if (i + 1) % 10 == 0:
                        save_checkpoint(checkpoint)
                        logger.info(f"Progress: {i+1}/{len(files)} - {stats['logos_found']} logos found")

                    # Rate limiting
                    await asyncio.sleep(REQUEST_DELAY)

                except KeyboardInterrupt:
                    logger.info("Interrupted - saving checkpoint...")
                    save_checkpoint(checkpoint)
                    break

            # Final checkpoint
            save_checkpoint(checkpoint)

            # Summary
            logger.info("\n" + "="*60)
            logger.info("LOGO ENRICHMENT SUMMARY (Playwright)")
            logger.info("="*60)
            logger.info(f"Total processed: {stats['processed']}")
            logger.info(f"Successful: {stats['success']}")
            logger.info(f"Failed: {stats['failed']}")
            logger.info(f"Skipped (already enriched): {stats['skipped']}")
            logger.info(f"Total logos found: {stats['logos_found']}")
            logger.info("="*60)

        finally:
            await browser.close()


if __name__ == '__main__':
    asyncio.run(main())