#!/usr/bin/env python3
"""
Scrape Google Maps reviews using Playwright.

This script uses Playwright to scrape comprehensive review data from Google
Maps, overcoming the Google Places API limitation of only 5 reviews per place.

Features:
- Full review extraction (100+ reviews with scroll pagination)
- Review breakdown by star rating
- Popular times/busyness data
- Related places
- Review topics/themes
- Individual review metadata (author, date, photos, text)

Usage:
    python scripts/scrape_google_maps_reviews.py --name "Rijksmuseum" --city "Amsterdam"
    python scripts/scrape_google_maps_reviews.py --url "https://www.google.com/maps/place/..."
    python scripts/scrape_google_maps_reviews.py --ghcid "NL-NH-AMS-M-RM"

Output:
    JSON file with comprehensive Google Maps data

Note:
    This script is designed to be run interactively with MCP Playwright tools.
    For batch processing, use the GoogleMapsPlaywrightScraper class directly.
"""

import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


@dataclass
class ReviewData:
    """Individual review scraped from the Google Maps reviews panel."""
    author_name: str
    author_url: Optional[str] = None
    rating: Optional[int] = None            # 1-5 stars
    text: Optional[str] = None
    relative_time: Optional[str] = None     # e.g. "2 months ago"
    absolute_time: Optional[str] = None
    photos_count: int = 0                   # photos attached to this review
    review_url: Optional[str] = None
    local_guide: bool = False               # author is a "Local Guide"
    reviews_by_author: Optional[int] = None
    photos_by_author: Optional[int] = None


@dataclass
class ReviewBreakdown:
    """Star rating breakdown (review counts per star level)."""
    five_star: int = 0
    four_star: int = 0
    three_star: int = 0
    two_star: int = 0
    one_star: int = 0


@dataclass
class PopularTimes:
    """Popular times data for a day."""
    day: str
    hours: Dict[int, int] = field(default_factory=dict)  # hour -> busyness percentage


@dataclass
class RelatedPlace:
    """Related/nearby place."""
    name: str
    rating: Optional[float] = None
    review_count: Optional[int] = None
    place_type: Optional[str] = None
    url: Optional[str] = None


@dataclass
class GoogleMapsScrapedData:
    """Complete scraped data from Google Maps."""
    # Basic info
    name: str
    place_id: Optional[str] = None
    google_maps_url: Optional[str] = None
    # Location
    address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    plus_code: Optional[str] = None
    # Contact
    phone: Optional[str] = None
    website: Optional[str] = None
    # Business info
    business_status: Optional[str] = None   # OPERATIONAL, CLOSED_TEMPORARILY, etc.
    price_level: Optional[str] = None
    place_types: List[str] = field(default_factory=list)
    # Hours
    hours_text: Optional[str] = None
    is_open_now: Optional[bool] = None
    opening_hours: Optional[Dict[str, str]] = None
    # Ratings
    rating: Optional[float] = None
    total_reviews: Optional[int] = None
    review_breakdown: Optional[ReviewBreakdown] = None
    # Reviews
    reviews: List[ReviewData] = field(default_factory=list)
    review_topics: Dict[str, int] = field(default_factory=dict)  # topic -> count
    # Popular times
    popular_times: List[PopularTimes] = field(default_factory=list)
    live_busyness: Optional[int] = None     # current busyness percentage
    typical_visit_duration: Optional[str] = None
    # Related
    related_places: List[RelatedPlace] = field(default_factory=list)
    # Photos
    photo_count: Optional[int] = None
    photo_categories: Dict[str, int] = field(default_factory=dict)
    # Accessibility
    accessibility_features: List[str] = field(default_factory=list)
    # Metadata
    scrape_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    scrape_method: str = "playwright"


# Maps a star count to the matching ReviewBreakdown attribute name.
_BREAKDOWN_ATTRS = {5: 'five_star', 4: 'four_star', 3: 'three_star',
                    2: 'two_star', 1: 'one_star'}


def parse_snapshot_for_reviews(snapshot_text: str) -> GoogleMapsScrapedData:
    """
    Parse a Playwright accessibility snapshot for Google Maps data.

    This parser is designed to work with the YAML-like accessibility tree
    format returned by MCP Playwright browser_snapshot tool.

    Args:
        snapshot_text: The accessibility tree text from Playwright

    Returns:
        Parsed GoogleMapsScrapedData
    """
    data = GoogleMapsScrapedData(name="Unknown")
    lines = snapshot_text.split('\n')

    for line in lines:
        # Place name from the level-1 heading; first match wins.
        if 'heading "' in line and 'level=1' in line:
            name_match = re.search(r'heading "([^"]+)"', line)
            if name_match and data.name == "Unknown":
                data.name = name_match.group(1)

        # Rating and review counts are exposed as img alt text.
        if 'img "' in line:
            # Overall rating - e.g. "4.7 stars"
            rating_img = re.search(r'img "(\d+\.?\d*)\s*stars?"', line)
            if rating_img and data.rating is None:
                data.rating = float(rating_img.group(1))

            # Total review count - e.g. "108,805 reviews"
            reviews_img = re.search(r'img "[^"]*?([\d,]+)\s*[Rr]eviews?"', line)
            if reviews_img and data.total_reviews is None:
                # Strip both ',' and '.' thousands separators (locale-dependent).
                data.total_reviews = int(reviews_img.group(1).replace(',', '').replace('.', ''))

            # Per-star breakdown - e.g. "5 stars, 81,221 reviews"
            breakdown_match = re.search(r'img "(\d)\s*stars?,?\s*([\d,\.]+)\s*reviews?"', line)
            if breakdown_match:
                stars = int(breakdown_match.group(1))
                count = int(breakdown_match.group(2).replace(',', '').replace('.', ''))
                if data.review_breakdown is None:
                    data.review_breakdown = ReviewBreakdown()
                if stars in _BREAKDOWN_ATTRS:
                    setattr(data.review_breakdown, _BREAKDOWN_ATTRS[stars], count)

        # Live busyness - "Currently 28% busy, usually 41% busy"
        busyness_match = re.search(r'Currently (\d+)% busy', line)
        if busyness_match:
            data.live_busyness = int(busyness_match.group(1))

        # Address - 'button "Address: Museumstraat 1, 1071 XX Amsterdam"'
        if 'Address:' in line:
            addr_match = re.search(r'Address:\s*([^"]+)"', line)
            if addr_match:
                data.address = addr_match.group(1).strip()

        # Plus code - 'button "Plus code: 9V5P+X3 Amsterdam"'
        if 'Plus code:' in line:
            plus_match = re.search(r'Plus code:\s*([^"]+)"', line)
            if plus_match:
                data.plus_code = plus_match.group(1).strip()

        # Phone - 'button "Phone: 020 674 7000"'
        if 'Phone:' in line:
            phone_match = re.search(r'Phone:\s*([^"]+)"', line)
            if phone_match:
                data.phone = phone_match.group(1).strip()

        # Website - 'link "Website: rijksmuseum.nl"'
        if 'Website:' in line:
            website_match = re.search(r'Website:\s*([^"]+)"', line)
            if website_match:
                data.website = website_match.group(1).strip()

        # Hours - 'button "Open · Closes 5 pm"'. The time pattern accepts
        # optional minutes ("5:30 pm"), which Google uses for many places.
        if 'Open · Closes' in line or 'Closed · Opens' in line:
            hours_match = re.search(
                r'(Open|Closed)\s*·\s*(Closes|Opens)\s+(\d+(?::\d+)?\s*(?:am|pm))',
                line, re.IGNORECASE)
            if hours_match:
                data.hours_text = f"{hours_match.group(1)} · {hours_match.group(2)} {hours_match.group(3)}"
                data.is_open_now = hours_match.group(1).lower() == 'open'

        # Review topics - 'radio "rembrandt, mentioned in 2,834 reviews"'
        topic_match = re.search(r'radio "([^,]+),\s*mentioned in ([\d,]+) reviews"', line)
        if topic_match:
            topic = topic_match.group(1).strip().lower()
            count = int(topic_match.group(2).replace(',', ''))
            data.review_topics[topic] = count

        # Related places - 'link "Van Gogh Museum · 4.6 stars · 101,806 reviews · Art museum"'
        related_match = re.search(
            r'link "([^·]+)·\s*(\d+\.?\d*)\s*stars?\s*·\s*([\d,]+)\s*reviews?\s*·\s*([^"]+)"',
            line)
        if related_match:
            data.related_places.append(RelatedPlace(
                name=related_match.group(1).strip(),
                rating=float(related_match.group(2)),
                review_count=int(related_match.group(3).replace(',', '')),
                place_type=related_match.group(4).strip(),
            ))

    # Coordinates are embedded in the maps URL as "@lat,lng".
    url_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', snapshot_text)
    if url_match:
        data.latitude = float(url_match.group(1))
        data.longitude = float(url_match.group(2))

    # Canonical Google Maps URL, if present anywhere in the snapshot.
    maps_url_match = re.search(r'https://www\.google\.com/maps/place/[^\s\'"]+', snapshot_text)
    if maps_url_match:
        data.google_maps_url = maps_url_match.group(0)

    # Individual reviews appear as "generic" tree nodes keyed by author name.
    review_blocks = re.findall(
        r'generic "([^"]+)" \[ref=e\d+\]:\s*\n((?:.*?\n)*?)(?=generic "[^"]+" \[ref=e\d+\]:|$)',
        snapshot_text)

    for author_name, block in review_blocks:
        # Only blocks mentioning author stats ("Local Guide" / "... reviews")
        # look like reviews; everything else is UI chrome and is ignored.
        if 'Local Guide' in block or 'reviews' in block.lower():
            review = ReviewData(author_name=author_name)

            # Star rating from the review's img alt text.
            rating_match = re.search(r'img "(\d)\s*stars?"', block)
            if rating_match:
                review.rating = int(rating_match.group(1))

            # Relative timestamp, e.g. "2 months ago".
            time_match = re.search(
                r'generic \[ref=e\d+\]: (a week ago|a month ago|a year ago|\d+\s*(?:day|week|month|year)s?\s*ago)',
                block)
            if time_match:
                review.relative_time = time_match.group(1)

            if 'Local Guide' in block:
                review.local_guide = True

            # Author stats, e.g. "52 reviews · 10 photos".
            stats_match = re.search(r'(\d+)\s*reviews?\s*·\s*(\d+)\s*photos?', block)
            if stats_match:
                review.reviews_by_author = int(stats_match.group(1))
                review.photos_by_author = int(stats_match.group(2))

            # The review body is assumed to be the longest "text:" line.
            text_lines = re.findall(r'text: (.+)', block)
            if text_lines:
                review.text = max(text_lines, key=len)

            review.photos_count = len(re.findall(r'Photo \d+ on .+\'s review', block))

            # A rating is the minimum evidence that this block is a review.
            if review.rating is not None:
                data.reviews.append(review)

    return data


def extract_review_from_snapshot_block(block: str) -> Optional[ReviewData]:
    """Extract a single review from a snapshot text block.

    Returns:
        A ReviewData, or None when no author name could be identified.
    """
    review = ReviewData(author_name="Unknown")
    lines = block.strip().split('\n')

    for line in lines:
        # Author name - typically the first link text in the block.
        if 'link' in line.lower() and review.author_name == "Unknown":
            name_match = re.search(r'"([^"]+)"', line)
            if name_match:
                review.author_name = name_match.group(1)

        # Rating - "5 stars"
        rating_match = re.search(r'(\d)\s*stars?', line)
        if rating_match:
            review.rating = int(rating_match.group(1))

        # Relative time - "3 weeks ago" / "a month ago"
        time_match = re.search(
            r'(\d+\s*(?:day|week|month|year)s?\s*ago|a\s+(?:day|week|month|year)\s+ago)',
            line, re.IGNORECASE)
        if time_match:
            review.relative_time = time_match.group(1)

        if 'local guide' in line.lower():
            review.local_guide = True

        # Author stats - "52 reviews · 10 photos"
        author_stats = re.search(r'(\d+)\s*reviews?.*?(\d+)\s*photos?', line, re.IGNORECASE)
        if author_stats:
            review.reviews_by_author = int(author_stats.group(1))
            review.photos_by_author = int(author_stats.group(2))

    # Review text: the longest sufficiently-long line that is not a
    # rating/review-count line. Collected once, after the field loop.
    text_candidates = [
        candidate for candidate in lines
        if len(candidate) > 50
        and 'stars' not in candidate.lower()
        and 'reviews' not in candidate.lower()
    ]
    if text_candidates:
        review.text = max(text_candidates, key=len)

    return review if review.author_name != "Unknown" else None


def to_json_serializable(data: GoogleMapsScrapedData) -> Dict[str, Any]:
    """Convert the scraped dataclass to a JSON-serializable dict.

    Only fields that were actually scraped are emitted. Numeric fields are
    tested with "is not None" so legitimate zero values (0.0 coordinates,
    0% busyness, 0 photos) are not silently dropped.
    """
    result: Dict[str, Any] = {
        "name": data.name,
        "scrape_timestamp": data.scrape_timestamp,
        "scrape_method": data.scrape_method,
    }

    if data.place_id:
        result["place_id"] = data.place_id
    if data.google_maps_url:
        result["google_maps_url"] = data.google_maps_url
    if data.address:
        result["address"] = data.address
    # "is not None" keeps 0.0 coordinates (equator/prime meridian) intact.
    if data.latitude is not None and data.longitude is not None:
        result["coordinates"] = {"latitude": data.latitude, "longitude": data.longitude}
    if data.phone:
        result["phone"] = data.phone
    if data.website:
        result["website"] = data.website
    if data.rating is not None:
        result["rating"] = data.rating
    if data.total_reviews is not None:
        result["total_reviews"] = data.total_reviews
    if data.review_breakdown:
        result["review_breakdown"] = {
            "5_star": data.review_breakdown.five_star,
            "4_star": data.review_breakdown.four_star,
            "3_star": data.review_breakdown.three_star,
            "2_star": data.review_breakdown.two_star,
            "1_star": data.review_breakdown.one_star,
        }
    if data.hours_text:
        result["hours_text"] = data.hours_text
    if data.is_open_now is not None:
        result["is_open_now"] = data.is_open_now
    if data.review_topics:
        result["review_topics"] = data.review_topics
    if data.reviews:
        result["reviews"] = [
            {
                "author_name": r.author_name,
                "rating": r.rating,
                "text": r.text,
                "relative_time": r.relative_time,
                "local_guide": r.local_guide,
                "reviews_by_author": r.reviews_by_author,
                "photos_by_author": r.photos_by_author,
            }
            for r in data.reviews
        ]
    if data.related_places:
        result["related_places"] = [
            {"name": p.name, "rating": p.rating, "review_count": p.review_count}
            for p in data.related_places
        ]
    if data.photo_count is not None:
        result["photo_count"] = data.photo_count
    if data.live_busyness is not None:
        result["live_busyness_percent"] = data.live_busyness

    return result


# MCP Playwright interaction functions
# These generate the commands to be run via MCP tools

def get_search_url(name: str, city: str, country: str = "") -> str:
    """Generate a Google Maps search URL.

    Uses quote_plus so queries containing '&', '/', '#', etc. are encoded
    safely; plain spaces still become '+' exactly as before.
    """
    query = f"{name} {city} {country}".strip()
    return f"https://www.google.com/maps/search/{quote_plus(query)}"


def get_place_url_from_place_id(place_id: str) -> str:
    """Generate Google Maps URL from place ID."""
    return f"https://www.google.com/maps/place/?q=place_id:{place_id}"


# Instructions for MCP Playwright usage
MCP_INSTRUCTIONS = """
## How to scrape Google Maps with MCP Playwright

1. Navigate to the place:
   playwright_browser_navigate(url="https://www.google.com/maps/search/Rijksmuseum+Amsterdam")

2. Wait for the page to load:
   playwright_browser_wait_for(time=3)

3. Take a snapshot to get the accessibility tree:
   playwright_browser_snapshot()

4. To get more reviews, click the reviews section:
   playwright_browser_click(element="Reviews", ref="")

5. Scroll to load more reviews:
   playwright_browser_evaluate(function="() => { document.querySelector('[role=main]').scrollTo(0, 10000) }")

6. Take another snapshot to get loaded reviews:
   playwright_browser_snapshot()

7. Parse the snapshot using parse_snapshot_for_reviews()

## Data Available:
- Rating and total review count
- Review breakdown by stars (5★, 4★, 3★, 2★, 1★)
- Individual reviews with author, date, text, photos
- Review topics (e.g., "rembrandt (2,834)")
- Popular times / live busyness
- Related places
- Hours, phone, website
- Address
"""


def main() -> int:
    """CLI entry point: print the MCP Playwright navigation target.

    Returns:
        Process exit code (0 on success, 1 when no target was given).
    """
    parser = argparse.ArgumentParser(description="Scrape Google Maps reviews with Playwright")
    parser.add_argument("--name", help="Institution name")
    parser.add_argument("--city", help="City name")
    parser.add_argument("--country", default="", help="Country name (optional)")
    parser.add_argument("--url", help="Direct Google Maps URL")
    parser.add_argument("--place-id", help="Google Place ID")
    # NOTE(review): --ghcid and --output are accepted but not yet acted on here.
    parser.add_argument("--ghcid", help="GHCID to look up institution")
    parser.add_argument("--output", help="Output JSON file path")
    parser.add_argument("--instructions", action="store_true", help="Show MCP Playwright instructions")
    args = parser.parse_args()

    if args.instructions:
        print(MCP_INSTRUCTIONS)
        return 0

    if args.url:
        print(f"Use MCP Playwright to navigate to: {args.url}")
    elif args.place_id:
        url = get_place_url_from_place_id(args.place_id)
        print(f"Use MCP Playwright to navigate to: {url}")
    elif args.name and args.city:
        url = get_search_url(args.name, args.city, args.country)
        print(f"Use MCP Playwright to navigate to: {url}")
    else:
        print("Please provide --url, --place-id, or --name and --city")
        print("\nFor MCP Playwright usage instructions, run with --instructions")
        return 1

    print("\nRun the following MCP Playwright commands:")
    print("1. playwright_browser_navigate(url=)")
    print("2. playwright_browser_wait_for(time=3)")
    print("3. playwright_browser_snapshot()")
    print("\nThen parse the snapshot with this script's parse_snapshot_for_reviews() function")
    return 0


if __name__ == "__main__":
    sys.exit(main())