#!/usr/bin/env python3
"""
Scrape Google Maps reviews using Playwright.

This script uses Playwright to scrape comprehensive review data from Google Maps,
overcoming the Google Places API limitation of only 5 reviews per place.

Features:
- Full review extraction (100+ reviews with scroll pagination)
- Review breakdown by star rating
- Popular times/busyness data
- Related places
- Review topics/themes
- Individual review metadata (author, date, photos, text)

Usage:
    python scripts/scrape_google_maps_reviews.py --name "Rijksmuseum" --city "Amsterdam"
    python scripts/scrape_google_maps_reviews.py --url "https://www.google.com/maps/place/..."
    python scripts/scrape_google_maps_reviews.py --ghcid "NL-NH-AMS-M-RM"

Output:
    JSON file with comprehensive Google Maps data

Note:
    This script is designed to be run interactively with MCP Playwright tools.
    For batch processing, use the GoogleMapsPlaywrightScraper class directly.
"""

import json
|
|
import re
|
|
import asyncio
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Optional, List, Dict, Any
|
|
import logging
|
|
import argparse
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ReviewData:
    """A single scraped review plus metadata about its author."""

    author_name: str
    author_url: Optional[str] = None
    rating: Optional[int] = None            # star rating, 1-5
    text: Optional[str] = None              # review body text
    relative_time: Optional[str] = None     # e.g. "2 weeks ago"
    absolute_time: Optional[str] = None
    photos_count: int = 0                   # photos attached to this review
    review_url: Optional[str] = None
    local_guide: bool = False               # author shows a Local Guide badge
    reviews_by_author: Optional[int] = None  # author's total review count
    photos_by_author: Optional[int] = None   # author's total photo count

@dataclass
class ReviewBreakdown:
    """Number of reviews at each star level, from five down to one."""

    five_star: int = 0
    four_star: int = 0
    three_star: int = 0
    two_star: int = 0
    one_star: int = 0

@dataclass
class PopularTimes:
    """Busyness data for one day of the week."""

    day: str
    # Maps hour of day to busyness percentage for that hour.
    hours: Dict[int, int] = field(default_factory=dict)

@dataclass
class RelatedPlace:
    """A nearby or related place suggested by Google Maps."""

    name: str
    rating: Optional[float] = None
    review_count: Optional[int] = None
    place_type: Optional[str] = None   # e.g. "Art museum"
    url: Optional[str] = None

@dataclass
class GoogleMapsScrapedData:
    """Everything scraped from a single Google Maps place page."""

    # --- Basic identity ---
    name: str
    place_id: Optional[str] = None
    google_maps_url: Optional[str] = None

    # --- Location ---
    address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    plus_code: Optional[str] = None

    # --- Contact ---
    phone: Optional[str] = None
    website: Optional[str] = None

    # --- Business info ---
    business_status: Optional[str] = None  # OPERATIONAL, CLOSED_TEMPORARILY, etc.
    price_level: Optional[str] = None
    place_types: List[str] = field(default_factory=list)

    # --- Opening hours ---
    hours_text: Optional[str] = None
    is_open_now: Optional[bool] = None
    opening_hours: Optional[Dict[str, str]] = None

    # --- Aggregate ratings ---
    rating: Optional[float] = None
    total_reviews: Optional[int] = None
    review_breakdown: Optional[ReviewBreakdown] = None

    # --- Individual reviews ---
    reviews: List[ReviewData] = field(default_factory=list)
    review_topics: Dict[str, int] = field(default_factory=dict)  # topic -> mention count

    # --- Popular times ---
    popular_times: List[PopularTimes] = field(default_factory=list)
    live_busyness: Optional[int] = None  # current busyness percentage
    typical_visit_duration: Optional[str] = None

    # --- Related places ---
    related_places: List[RelatedPlace] = field(default_factory=list)

    # --- Photos ---
    photo_count: Optional[int] = None
    photo_categories: Dict[str, int] = field(default_factory=dict)

    # --- Accessibility ---
    accessibility_features: List[str] = field(default_factory=list)

    # --- Scrape metadata ---
    scrape_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    scrape_method: str = "playwright"

def parse_snapshot_for_reviews(snapshot_text: str) -> GoogleMapsScrapedData:
    """
    Parse a Playwright accessibility snapshot for Google Maps data.

    This parser is designed to work with the YAML-like accessibility tree format
    returned by MCP Playwright browser_snapshot tool.

    Args:
        snapshot_text: The accessibility tree text from Playwright

    Returns:
        Parsed GoogleMapsScrapedData
    """
    data = GoogleMapsScrapedData(name="Unknown")

    for line in snapshot_text.split('\n'):
        # Place name from the level-1 heading; keep only the first one found.
        if 'heading "' in line and 'level=1' in line:
            name_match = re.search(r'heading "([^"]+)"', line)
            if name_match and data.name == "Unknown":
                data.name = name_match.group(1)

        # Rating and review data appears as img alt text.
        if 'img "' in line:
            # Overall rating - e.g. 'img "4.7 stars"'
            rating_img = re.search(r'img "(\d+\.?\d*)\s*stars?"', line)
            if rating_img and data.rating is None:
                data.rating = float(rating_img.group(1))

            # Total review count - e.g. 'img "108,805 reviews"'
            reviews_img = re.search(r'img "[^"]*?([\d,]+)\s*[Rr]eviews?"', line)
            if reviews_img and data.total_reviews is None:
                # Strip both ',' and '.' thousands separators (locale variants).
                data.total_reviews = int(reviews_img.group(1).replace(',', '').replace('.', ''))

            # Per-star breakdown - e.g. 'img "5 stars, 81,221 reviews"'
            breakdown_match = re.search(r'img "(\d)\s*stars?,?\s*([\d,\.]+)\s*reviews?"', line)
            if breakdown_match:
                stars = int(breakdown_match.group(1))
                count = int(breakdown_match.group(2).replace(',', '').replace('.', ''))
                if data.review_breakdown is None:
                    data.review_breakdown = ReviewBreakdown()
                # Map the star value onto the matching breakdown field.
                if stars == 5:
                    data.review_breakdown.five_star = count
                elif stars == 4:
                    data.review_breakdown.four_star = count
                elif stars == 3:
                    data.review_breakdown.three_star = count
                elif stars == 2:
                    data.review_breakdown.two_star = count
                elif stars == 1:
                    data.review_breakdown.one_star = count

        # Live busyness - "Currently 28% busy, usually 41% busy"
        busyness_match = re.search(r'Currently (\d+)% busy', line)
        if busyness_match:
            data.live_busyness = int(busyness_match.group(1))

        # Address from button - 'button "Address: Museumstraat 1, 1071 XX Amsterdam"'
        if 'Address:' in line:
            addr_match = re.search(r'Address:\s*([^"]+)"', line)
            if addr_match:
                data.address = addr_match.group(1).strip()

        # Plus code - 'button "Plus code: 9V5P+X3 Amsterdam"'
        if 'Plus code:' in line:
            plus_match = re.search(r'Plus code:\s*([^"]+)"', line)
            if plus_match:
                data.plus_code = plus_match.group(1).strip()

        # Phone from button - 'button "Phone: 020 674 7000"'
        if 'Phone:' in line:
            phone_match = re.search(r'Phone:\s*([^"]+)"', line)
            if phone_match:
                data.phone = phone_match.group(1).strip()

        # Website from link text - 'link "Website: rijksmuseum.nl"'
        if 'Website:' in line:
            website_match = re.search(r'Website:\s*([^"]+)"', line)
            if website_match:
                data.website = website_match.group(1).strip()

        # Opening status - 'button "Open · Closes 5 pm"' / '"Closed · Opens 9 am"'
        if 'Open · Closes' in line or 'Closed · Opens' in line:
            hours_match = re.search(
                r'(Open|Closed)\s*·\s*(Closes|Opens)\s+(\d+\s*(?:am|pm))', line, re.IGNORECASE)
            if hours_match:
                data.hours_text = f"{hours_match.group(1)} · {hours_match.group(2)} {hours_match.group(3)}"
                data.is_open_now = hours_match.group(1).lower() == 'open'

        # Review topics from radio buttons - 'radio "rembrandt, mentioned in 2,834 reviews"'
        topic_match = re.search(r'radio "([^,]+),\s*mentioned in ([\d,]+) reviews"', line)
        if topic_match:
            topic = topic_match.group(1).strip().lower()
            count = int(topic_match.group(2).replace(',', ''))
            data.review_topics[topic] = count

        # Related places - 'link "Van Gogh Museum · 4.6 stars · 101,806 reviews · Art museum"'
        related_match = re.search(
            r'link "([^·]+)·\s*(\d+\.?\d*)\s*stars?\s*·\s*([\d,]+)\s*reviews?\s*·\s*([^"]+)"', line)
        if related_match:
            data.related_places.append(RelatedPlace(
                name=related_match.group(1).strip(),
                rating=float(related_match.group(2)),
                review_count=int(related_match.group(3).replace(',', '')),
                place_type=related_match.group(4).strip(),
            ))

    # Coordinates embedded in a map URL anywhere in the snapshot.
    url_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', snapshot_text)
    if url_match:
        data.latitude = float(url_match.group(1))
        data.longitude = float(url_match.group(2))

    # Canonical Google Maps place URL, if one appears in the snapshot.
    maps_url_match = re.search(r'https://www\.google\.com/maps/place/[^\s\'"]+', snapshot_text)
    if maps_url_match:
        data.google_maps_url = maps_url_match.group(0)

    # Parse individual reviews from generic blocks keyed by author names.
    review_blocks = re.findall(
        r'generic "([^"]+)" \[ref=e\d+\]:\s*\n((?:.*?\n)*?)(?=generic "[^"]+" \[ref=e\d+\]:|$)',
        snapshot_text,
    )
    for author_name, block in review_blocks:
        # Heuristic inclusion filter: real review blocks mention "Local Guide"
        # or a review count; blocks without either marker are UI chrome.
        if 'Local Guide' in block or 'reviews' in block.lower():
            review = ReviewData(author_name=author_name)

            # Star rating - 'img "5 stars"'
            rating_match = re.search(r'img "(\d)\s*stars?"', block)
            if rating_match:
                review.rating = int(rating_match.group(1))

            # Relative timestamp - "a week ago", "3 months ago", ...
            time_match = re.search(
                r'generic \[ref=e\d+\]: (a week ago|a month ago|a year ago|\d+\s*(?:day|week|month|year)s?\s*ago)',
                block)
            if time_match:
                review.relative_time = time_match.group(1)

            # Local Guide badge
            if 'Local Guide' in block:
                review.local_guide = True

            # Author activity stats - "12 reviews · 34 photos"
            stats_match = re.search(r'(\d+)\s*reviews?\s*·\s*(\d+)\s*photos?', block)
            if stats_match:
                review.reviews_by_author = int(stats_match.group(1))
                review.photos_by_author = int(stats_match.group(2))

            # Review text heuristic: the longest "text:" line in the block.
            text_lines = re.findall(r'text: (.+)', block)
            if text_lines:
                review.text = max(text_lines, key=len)

            # Photos attached to this review.
            review.photos_count = len(re.findall(r'Photo \d+ on .+\'s review', block))

            # A parsed rating is the minimum evidence that this is a real review.
            if review.rating is not None:
                data.reviews.append(review)

    return data

def extract_review_from_snapshot_block(block: str) -> Optional[ReviewData]:
    """Extract a single review from a snapshot text block.

    Args:
        block: One review-sized chunk of accessibility-tree text.

    Returns:
        A ReviewData, or None when no author name (the first link text)
        could be found in the block.
    """
    review = ReviewData(author_name="Unknown")
    lines = block.strip().split('\n')

    for line in lines:
        # Author name - typically the first link's quoted text.
        if 'link' in line.lower() and review.author_name == "Unknown":
            name_match = re.search(r'"([^"]+)"', line)
            if name_match:
                review.author_name = name_match.group(1)

        # Star rating - "4 stars"
        rating_match = re.search(r'(\d)\s*stars?', line)
        if rating_match:
            review.rating = int(rating_match.group(1))

        # Relative timestamp - "3 weeks ago" / "a month ago"
        time_match = re.search(
            r'(\d+\s*(?:day|week|month|year)s?\s*ago|a\s+(?:day|week|month|year)\s+ago)',
            line, re.IGNORECASE)
        if time_match:
            review.relative_time = time_match.group(1)

        # Local Guide badge
        if 'local guide' in line.lower():
            review.local_guide = True

        # Author activity stats - "12 reviews ... 34 photos"
        author_stats = re.search(r'(\d+)\s*reviews?.*?(\d+)\s*photos?', line, re.IGNORECASE)
        if author_stats:
            review.reviews_by_author = int(author_stats.group(1))
            review.photos_by_author = int(author_stats.group(2))

    # Review text heuristic: the longest line that isn't rating/stats metadata.
    # (Separate loop variable name avoids shadowing `line` from the pass above.)
    text_candidates = [
        candidate for candidate in lines
        if len(candidate) > 50
        and 'stars' not in candidate.lower()
        and 'reviews' not in candidate.lower()
    ]
    if text_candidates:
        review.text = max(text_candidates, key=len)

    return review if review.author_name != "Unknown" else None

def to_json_serializable(data: GoogleMapsScrapedData) -> Dict[str, Any]:
    """Convert a GoogleMapsScrapedData instance to a JSON-serializable dict.

    Only fields that were actually scraped are emitted. Numeric fields are
    compared against None rather than truthiness so that legitimate zero
    values - e.g. a latitude/longitude of 0.0 or 0% live busyness - are
    not silently dropped.
    """
    result: Dict[str, Any] = {
        "name": data.name,
        "scrape_timestamp": data.scrape_timestamp,
        "scrape_method": data.scrape_method,
    }

    # Optional string fields: empty strings are treated as "not scraped".
    if data.place_id:
        result["place_id"] = data.place_id
    if data.google_maps_url:
        result["google_maps_url"] = data.google_maps_url
    if data.address:
        result["address"] = data.address
    # Explicit None checks: coordinates of exactly 0.0 are valid locations.
    if data.latitude is not None and data.longitude is not None:
        result["coordinates"] = {"latitude": data.latitude, "longitude": data.longitude}
    if data.phone:
        result["phone"] = data.phone
    if data.website:
        result["website"] = data.website
    if data.rating is not None:
        result["rating"] = data.rating
    if data.total_reviews is not None:
        result["total_reviews"] = data.total_reviews

    if data.review_breakdown:
        result["review_breakdown"] = {
            "5_star": data.review_breakdown.five_star,
            "4_star": data.review_breakdown.four_star,
            "3_star": data.review_breakdown.three_star,
            "2_star": data.review_breakdown.two_star,
            "1_star": data.review_breakdown.one_star,
        }

    if data.hours_text:
        result["hours_text"] = data.hours_text
    if data.is_open_now is not None:
        result["is_open_now"] = data.is_open_now

    if data.review_topics:
        result["review_topics"] = data.review_topics

    if data.reviews:
        result["reviews"] = [
            {
                "author_name": r.author_name,
                "rating": r.rating,
                "text": r.text,
                "relative_time": r.relative_time,
                "local_guide": r.local_guide,
                "reviews_by_author": r.reviews_by_author,
                "photos_by_author": r.photos_by_author,
            }
            for r in data.reviews
        ]

    if data.related_places:
        result["related_places"] = [
            {
                "name": p.name,
                "rating": p.rating,
                "review_count": p.review_count,
            }
            for p in data.related_places
        ]

    # photo_count of 0 is meaningful (place confirmed to have no photos).
    if data.photo_count is not None:
        result["photo_count"] = data.photo_count

    # 0% busy is meaningful data; check against None, not truthiness.
    if data.live_busyness is not None:
        result["live_busyness_percent"] = data.live_busyness

    return result

# MCP Playwright interaction functions
# These generate the commands to be run via MCP tools

def get_search_url(name: str, city: str, country: str = "") -> str:
    """Generate a Google Maps search URL for a place.

    Args:
        name: Institution name.
        city: City name.
        country: Optional country name appended to the query.

    Returns:
        A Google Maps search URL with the query URL-encoded.
    """
    from urllib.parse import quote_plus  # local import keeps module imports untouched

    query = f"{name} {city} {country}".strip()
    # quote_plus percent-encodes reserved characters ('&', '?', non-ASCII, ...)
    # instead of only swapping spaces for '+', which broke queries containing them.
    return f"https://www.google.com/maps/search/{quote_plus(query)}"

def get_place_url_from_place_id(place_id: str) -> str:
    """Build the Google Maps place URL for a given Google Place ID."""
    base = "https://www.google.com/maps/place/"
    return f"{base}?q=place_id:{place_id}"

# Instructions for MCP Playwright usage.
# Printed verbatim when the script is run with --instructions.
MCP_INSTRUCTIONS = """
## How to scrape Google Maps with MCP Playwright

1. Navigate to the place:
   playwright_browser_navigate(url="https://www.google.com/maps/search/Rijksmuseum+Amsterdam")

2. Wait for the page to load:
   playwright_browser_wait_for(time=3)

3. Take a snapshot to get the accessibility tree:
   playwright_browser_snapshot()

4. To get more reviews, click the reviews section:
   playwright_browser_click(element="Reviews", ref="<ref from snapshot>")

5. Scroll to load more reviews:
   playwright_browser_evaluate(function="() => { document.querySelector('[role=main]').scrollTo(0, 10000) }")

6. Take another snapshot to get loaded reviews:
   playwright_browser_snapshot()

7. Parse the snapshot using parse_snapshot_for_reviews()

## Data Available:
- Rating and total review count
- Review breakdown by stars (5★, 4★, 3★, 2★, 1★)
- Individual reviews with author, date, text, photos
- Review topics (e.g., "rembrandt (2,834)")
- Popular times / live busyness
- Related places
- Hours, phone, website
- Address
"""

def main():
    """CLI entry point: print the MCP Playwright steps for the requested place."""
    parser = argparse.ArgumentParser(description="Scrape Google Maps reviews with Playwright")
    parser.add_argument("--name", help="Institution name")
    parser.add_argument("--city", help="City name")
    parser.add_argument("--country", default="", help="Country name (optional)")
    parser.add_argument("--url", help="Direct Google Maps URL")
    parser.add_argument("--place-id", help="Google Place ID")
    parser.add_argument("--ghcid", help="GHCID to look up institution")
    parser.add_argument("--output", help="Output JSON file path")
    parser.add_argument("--instructions", action="store_true", help="Show MCP Playwright instructions")
    args = parser.parse_args()

    if args.instructions:
        print(MCP_INSTRUCTIONS)
        return 0

    # Resolve the navigation target from whichever identifier was supplied.
    if args.url:
        target = args.url
    elif args.place_id:
        target = get_place_url_from_place_id(args.place_id)
    elif args.name and args.city:
        target = get_search_url(args.name, args.city, args.country)
    else:
        print("Please provide --url, --place-id, or --name and --city")
        print("\nFor MCP Playwright usage instructions, run with --instructions")
        return 1

    print(f"Use MCP Playwright to navigate to: {target}")
    print("\nRun the following MCP Playwright commands:")
    print("1. playwright_browser_navigate(url=<url above>)")
    print("2. playwright_browser_wait_for(time=3)")
    print("3. playwright_browser_snapshot()")
    print("\nThen parse the snapshot with this script's parse_snapshot_for_reviews() function")
    return 0

if __name__ == "__main__":
    # raise SystemExit instead of the site-module exit() helper, which is
    # intended for interactive use and may be absent when run with -S.
    raise SystemExit(main())