#!/usr/bin/env python3
"""
Batch enrich custodians with Google Maps data using Playwright.

This script reads custodian YAML files and enriches them with comprehensive
Google Maps data scraped via Playwright, including:
- Full review breakdown (all star ratings)
- Review topics (what people talk about)
- Popular times / live busyness
- Related places
- Detailed reviews (more than API's 5 limit)

Usage:
    # Enrich a single custodian by GHCID
    python scripts/enrich_custodians_google_maps_playwright.py --ghcid NL-NH-AMS-M-RM

    # Enrich multiple custodians
    python scripts/enrich_custodians_google_maps_playwright.py --limit 10

    # Dry run (show what would be done)
    python scripts/enrich_custodians_google_maps_playwright.py --dry-run --limit 5

    # Re-process custodians that already carry enrichment data
    python scripts/enrich_custodians_google_maps_playwright.py --include-enriched

Note: This script requires MCP Playwright to be running. It generates commands
that should be executed via MCP tools, or can be run with local Playwright.
"""

import argparse
import asyncio
import json
import logging
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple

import yaml

# Add scripts to path so the sibling scraper module resolves when this file
# is executed directly as a script.
sys.path.insert(0, str(Path(__file__).parent))

from scrape_google_maps_reviews import (
    parse_snapshot_for_reviews,
    to_json_serializable,
    get_search_url,
    GoogleMapsScrapedData,
)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
OUTPUT_DIR = Path(__file__).parent.parent / "data" / "google_maps_enrichment"
DELAY_BETWEEN_REQUESTS = 3  # seconds

# Key under which merged enrichment data lives inside a custodian YAML file.
_ENRICHMENT_KEY = 'google_maps_playwright_enrichment'

# Country code -> English country name, used to build a clearer search query.
_COUNTRY_NAMES = {
    'NL': 'Netherlands',
    'BE': 'Belgium',
    'DE': 'Germany',
    'FR': 'France',
    'GB': 'United Kingdom',
    'US': 'United States',
    'IT': 'Italy',
    'ES': 'Spain',
    'JP': 'Japan',
    'CN': 'China',
}


def _as_mapping(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict.

    YAML keys that are present but null (``custodian_name:``) load as
    ``None``, so ``dict.get(key, {})`` alone does not protect chained lookups.
    """
    return value if isinstance(value, dict) else {}


def load_custodian(ghcid: str) -> Optional[Dict[str, Any]]:
    """Load a custodian YAML file by GHCID.

    Args:
        ghcid: Identifier whose file is ``CUSTODIAN_DIR / "<ghcid>.yaml"``.

    Returns:
        The parsed mapping, or ``None`` when the file is missing, empty, or
        does not contain a YAML mapping at top level.

    Raises:
        OSError, yaml.YAMLError: If the file cannot be read or parsed.
    """
    yaml_path = CUSTODIAN_DIR / f"{ghcid}.yaml"

    if not yaml_path.exists():
        logger.warning(f"Custodian file not found: {yaml_path}")
        return None

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if data is None:
        # Empty document: nothing to enrich.
        return None
    if not isinstance(data, dict):
        # Callers rely on dict access; a list/scalar document is unusable.
        logger.warning(f"Custodian file is not a YAML mapping: {yaml_path}")
        return None
    return data


def get_custodian_search_query(custodian: Dict[str, Any]) -> Tuple[str, str, str]:
    """
    Extract search query components from custodian data.

    Null or non-mapping sections in the custodian data are treated as absent
    instead of raising ``AttributeError``.

    Args:
        custodian: Parsed custodian mapping.

    Returns:
        Tuple of (name, city, country). ``name`` falls back to "Unknown";
        ``city`` and ``country`` fall back to the empty string.
    """
    names = _as_mapping(custodian.get('custodian_name'))
    original_entry = _as_mapping(custodian.get('original_entry'))

    # Get name - try various fields, in priority order
    name = (
        names.get('emic_name')
        or custodian.get('name')
        or names.get('english_name')
        or original_entry.get('organisatie')
        or "Unknown"
    )

    # 'location' may be a single mapping or a list of mappings; use the first.
    location = custodian.get('location')
    if isinstance(location, list):
        location = location[0] if location else None
    location = _as_mapping(location)

    # Get city
    city = location.get('city') or ''

    # Fallback to location resolution
    if not city:
        ghcid_info = _as_mapping(custodian.get('ghcid'))
        location_resolution = _as_mapping(ghcid_info.get('location_resolution'))
        city = location_resolution.get('city_name') or ''

    # Get country and convert a known country code to its name for better search
    country = location.get('country') or ''
    country = _COUNTRY_NAMES.get(country, country)

    return name, city, country


def find_custodians_to_enrich(
    limit: Optional[int] = None,
    skip_enriched: bool = True,
    filter_country: Optional[str] = None,
) -> List[str]:
    """
    Find custodian GHCIDs that need Google Maps enrichment.

    Args:
        limit: Maximum number of custodians to return (``None`` or 0 means
            no limit)
        skip_enriched: Skip custodians that already have Google Maps data
        filter_country: Only include custodians whose GHCID starts with this
            country code followed by "-"

    Returns:
        List of GHCIDs to enrich, in sorted file-name order
    """
    ghcids: List[str] = []

    if not CUSTODIAN_DIR.exists():
        logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
        return ghcids

    yaml_files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    logger.info(f"Found {len(yaml_files)} custodian files")

    # Loop-invariant prefix for the country filter.
    prefix = f"{filter_country}-" if filter_country else None

    for yaml_path in yaml_files:
        ghcid = yaml_path.stem

        # Filter by country if specified
        if prefix and not ghcid.startswith(prefix):
            continue

        # Check if already enriched
        if skip_enriched:
            try:
                with open(yaml_path, 'r', encoding='utf-8') as f:
                    custodian = yaml.safe_load(f)
            except Exception as e:
                # Best effort: an unreadable file is skipped, not fatal.
                logger.warning(f"Error reading {yaml_path}: {e}")
                continue

            if custodian:
                if not isinstance(custodian, dict):
                    logger.warning(
                        f"Error reading {yaml_path}: not a YAML mapping"
                    )
                    continue
                if custodian.get(_ENRICHMENT_KEY):
                    continue

        ghcids.append(ghcid)

        if limit and len(ghcids) >= limit:
            break

    return ghcids


def save_enrichment(ghcid: str, data: GoogleMapsScrapedData, custodian_path: Optional[Path] = None):
    """
    Save Google Maps enrichment data.

    The data is always written to ``OUTPUT_DIR / "<ghcid>_google_maps.json"``.
    When ``custodian_path`` is given and exists, the same data is also merged
    into that custodian YAML file under ``google_maps_playwright_enrichment``.

    Args:
        ghcid: Custodian identifier, used for the JSON file name.
        data: Scraped data; converted with ``to_json_serializable``.
        custodian_path: Optional custodian YAML file to merge into.

    The merge step is best effort: failures are logged, not raised.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Save to JSON. Serialize before opening the file so a serialization
    # error cannot leave a truncated file behind.
    json_data = to_json_serializable(data)
    json_text = json.dumps(json_data, indent=2, ensure_ascii=False)
    json_path = OUTPUT_DIR / f"{ghcid}_google_maps.json"

    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json_text)

    logger.info(f"Saved enrichment to: {json_path}")

    # Optionally merge into custodian YAML
    if not (custodian_path and custodian_path.exists()):
        return

    try:
        with open(custodian_path, 'r', encoding='utf-8') as f:
            custodian = yaml.safe_load(f)

        if not isinstance(custodian, dict):
            # Empty or non-mapping file: there is nothing to merge into.
            logger.error(
                f"Error merging into custodian: {custodian_path} "
                "is not a YAML mapping"
            )
            return

        custodian[_ENRICHMENT_KEY] = json_data

        # Render the full document first; only then overwrite the original,
        # so a dump failure never truncates the custodian file.
        yaml_text = yaml.dump(
            custodian,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )

        with open(custodian_path, 'w', encoding='utf-8') as f:
            f.write(yaml_text)

        logger.info(f"Merged enrichment into: {custodian_path}")
    except Exception as e:
        logger.error(f"Error merging into custodian: {e}")


def generate_mcp_commands(ghcids: List[str]) -> List[Dict[str, Any]]:
    """
    Generate MCP Playwright commands for batch processing.

    Custodians whose file is missing, empty, or unreadable are skipped.

    Args:
        ghcids: GHCIDs to generate commands for.

    Returns:
        A list of command dictionaries that can be executed via MCP. Each has
        the keys "ghcid", "name", "city", "country", "url" and "commands"
        (navigate, wait, snapshot).
    """
    commands = []

    for ghcid in ghcids:
        try:
            custodian = load_custodian(ghcid)
        except (OSError, UnicodeError, yaml.YAMLError) as e:
            # One broken file should not abort the whole batch.
            logger.warning(f"Skipping {ghcid}: could not load custodian: {e}")
            continue

        if not custodian:
            continue

        name, city, country = get_custodian_search_query(custodian)
        url = get_search_url(name, city, country)

        commands.append({
            "ghcid": ghcid,
            "name": name,
            "city": city,
            "country": country,
            "url": url,
            "commands": [
                {"tool": "playwright_browser_navigate", "args": {"url": url}},
                {"tool": "playwright_browser_wait_for", "args": {"time": DELAY_BETWEEN_REQUESTS}},
                {"tool": "playwright_browser_snapshot", "args": {}},
            ]
        })

    return commands


def main():
    """Parse CLI arguments and print (or dump as JSON) the MCP commands.

    Returns:
        Process exit code (always 0).
    """
    parser = argparse.ArgumentParser(
        description="Batch enrich custodians with Google Maps data via Playwright"
    )
    parser.add_argument("--ghcid", help="Enrich a single custodian by GHCID")
    parser.add_argument("--limit", type=int, default=10, help="Limit number of custodians")
    parser.add_argument("--country", help="Filter by country code (e.g., NL, BE, DE)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--output-commands", action="store_true", help="Output MCP commands as JSON")
    parser.add_argument("--skip-enriched", dest="skip_enriched", action="store_true", default=True,
                        help="Skip already enriched")
    # --skip-enriched defaults to True, so on its own it can never be switched
    # off; this flag is the explicit opt-out and writes to the same dest.
    parser.add_argument("--include-enriched", dest="skip_enriched", action="store_false",
                        help="Also process custodians that are already enriched")

    args = parser.parse_args()

    # Find custodians to enrich
    if args.ghcid:
        ghcids = [args.ghcid]
    else:
        ghcids = find_custodians_to_enrich(
            limit=args.limit,
            skip_enriched=args.skip_enriched,
            filter_country=args.country,
        )

    if not ghcids:
        logger.info("No custodians to enrich")
        return 0

    logger.info(f"Found {len(ghcids)} custodians to enrich")

    # Generate MCP commands
    commands = generate_mcp_commands(ghcids)

    if args.output_commands:
        print(json.dumps(commands, indent=2))
        return 0

    # Print instructions for manual MCP execution
    print("\n" + "=" * 70)
    print("MCP Playwright Enrichment Commands")
    print("=" * 70)

    for i, cmd in enumerate(commands):
        print(f"\n{i+1}. {cmd['ghcid']} - {cmd['name']}")
        print(f" City: {cmd['city']}, Country: {cmd['country']}")
        print(f" URL: {cmd['url']}")

        if args.dry_run:
            print(" [DRY RUN - would execute MCP commands]")
        else:
            print(" Commands:")
            for c in cmd['commands']:
                print(f" {c['tool']}({c['args']})")

    print("\n" + "=" * 70)
    print("To execute these commands:")
    print("1. Use MCP Playwright tools in your IDE")
    print("2. Navigate to each URL, wait, and take snapshot")
    print("3. Parse snapshot with parse_snapshot_for_reviews()")
    print("4. Save result with save_enrichment()")
    print("=" * 70)

    return 0


if __name__ == "__main__":
    sys.exit(main())