glam/scripts/scrape_google_maps_reviews.py
2025-12-09 10:46:43 +01:00

515 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Scrape Google Maps reviews using Playwright.

This script uses Playwright to scrape comprehensive review data from Google Maps,
overcoming the Google Places API limitation of only 5 reviews per place.

Features:
    - Full review extraction (100+ reviews with scroll pagination)
    - Review breakdown by star rating
    - Popular times/busyness data
    - Related places
    - Review topics/themes
    - Individual review metadata (author, date, photos, text)

Usage:
    python scripts/scrape_google_maps_reviews.py --name "Rijksmuseum" --city "Amsterdam"
    python scripts/scrape_google_maps_reviews.py --url "https://www.google.com/maps/place/..."
    python scripts/scrape_google_maps_reviews.py --ghcid "NL-NH-AMS-M-RM"

Output:
    JSON file with comprehensive Google Maps data

Note:
    This script is designed to be run interactively with MCP Playwright tools.
    For batch processing, use the GoogleMapsPlaywrightScraper class directly.
"""
import json
import re
import asyncio
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass, field, asdict
from typing import Optional, List, Dict, Any
import logging
import argparse
# Configure the root logger when this module is imported (INFO level, with
# timestamp and level name on each line), then create the module's own logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@dataclass
class ReviewData:
    """One review of a place, plus what is known about its author.

    ``author_name`` is the only required value; every other field starts out
    as ``None``, ``0`` or ``False`` until a parser fills it in.
    """

    # --- author identity ---
    author_name: str
    author_url: Optional[str] = None

    # --- review content ---
    rating: Optional[int] = None  # star rating given in this review
    text: Optional[str] = None
    relative_time: Optional[str] = None  # e.g. "a week ago"
    absolute_time: Optional[str] = None
    photos_count: int = 0  # photos attached to this review
    review_url: Optional[str] = None

    # --- author statistics ---
    local_guide: bool = False
    reviews_by_author: Optional[int] = None
    photos_by_author: Optional[int] = None
@dataclass
class ReviewBreakdown:
    """Number of reviews per star value, from five stars down to one.

    Every counter starts at zero.
    """

    five_star: int = 0
    four_star: int = 0
    three_star: int = 0
    two_star: int = 0
    one_star: int = 0
@dataclass
class PopularTimes:
    """Busyness figures for a single day.

    Each instance gets its own empty ``hours`` dict unless one is supplied.
    """

    day: str
    # hour -> busyness percentage
    hours: Dict[int, int] = field(default_factory=dict)
@dataclass
class RelatedPlace:
    """A related or nearby place listed alongside the scraped one.

    Only ``name`` is required; the remaining fields default to ``None``.
    """

    name: str
    rating: Optional[float] = None
    review_count: Optional[int] = None
    place_type: Optional[str] = None  # e.g. "Art museum"
    url: Optional[str] = None
@dataclass
class GoogleMapsScrapedData:
    """Everything collected about one place from Google Maps.

    Only ``name`` is required.  Scalar fields default to ``None``, collection
    fields to a fresh empty list/dict per instance, and ``scrape_timestamp``
    to the current UTC time in ISO 8601 form at construction.
    """

    # ---- Basic info ----
    name: str
    place_id: Optional[str] = None
    google_maps_url: Optional[str] = None

    # ---- Location ----
    address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    plus_code: Optional[str] = None

    # ---- Contact ----
    phone: Optional[str] = None
    website: Optional[str] = None

    # ---- Business info ----
    business_status: Optional[str] = None  # OPERATIONAL, CLOSED_TEMPORARILY, etc.
    price_level: Optional[str] = None
    place_types: List[str] = field(default_factory=list)

    # ---- Hours ----
    hours_text: Optional[str] = None
    is_open_now: Optional[bool] = None
    opening_hours: Optional[Dict[str, str]] = None

    # ---- Ratings ----
    rating: Optional[float] = None
    total_reviews: Optional[int] = None
    review_breakdown: Optional[ReviewBreakdown] = None

    # ---- Reviews ----
    reviews: List[ReviewData] = field(default_factory=list)
    review_topics: Dict[str, int] = field(default_factory=dict)  # topic -> count

    # ---- Popular times ----
    popular_times: List[PopularTimes] = field(default_factory=list)
    live_busyness: Optional[int] = None  # current busyness percentage
    typical_visit_duration: Optional[str] = None

    # ---- Related ----
    related_places: List[RelatedPlace] = field(default_factory=list)

    # ---- Photos ----
    photo_count: Optional[int] = None
    photo_categories: Dict[str, int] = field(default_factory=dict)

    # ---- Accessibility ----
    accessibility_features: List[str] = field(default_factory=list)

    # ---- Metadata ----
    scrape_timestamp: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    scrape_method: str = "playwright"
# Grouped integer as rendered in the snapshot ("108,805" or "108.805").  The
# mandatory leading digit keeps stray punctuation from matching as a number,
# which would otherwise end in int('') raising ValueError.
_GROUPED_INT = r'\d[\d,.]*'

# Star value -> ReviewBreakdown attribute holding that row's count.
_STAR_ATTRS = {
    5: "five_star",
    4: "four_star",
    3: "three_star",
    2: "two_star",
    1: "one_star",
}

_HEADING_RE = re.compile(r'heading "([^"]+)"')
_OVERALL_RATING_RE = re.compile(r'img "(\d+\.?\d*)\s*stars?"')
_TOTAL_REVIEWS_RE = re.compile(r'img "[^"]*?(' + _GROUPED_INT + r')\s*[Rr]eviews?"')
_BREAKDOWN_RE = re.compile(r'img "(\d)\s*stars?,?\s*(' + _GROUPED_INT + r')\s*reviews?"')
_BUSYNESS_RE = re.compile(r'Currently (\d+)% busy')
# Accepts "5 pm", "5:30 pm" and 24-hour "17:00".
_HOURS_RE = re.compile(
    r'(Open|Closed)\s*·\s*(Closes|Opens)\s+'
    r'(\d{1,2}(?::\d{2})?\s*(?:am|pm)|\d{1,2}:\d{2})',
    re.IGNORECASE,
)
_TOPIC_RE = re.compile(r'radio "([^",]+),\s*mentioned in (' + _GROUPED_INT + r') reviews"')
_RELATED_RE = re.compile(
    r'link "([^·"]+)·\s*(\d+\.?\d*)\s*stars?\s*·\s*('
    + _GROUPED_INT
    + r')\s*reviews?\s*·\s*([^"]+)"'
)
# 'button "Address: Museumstraat 1, 1071 XX Amsterdam"' style labels and the
# GoogleMapsScrapedData attribute each one feeds.
_LABELLED_FIELD_RES = (
    (re.compile(r'Address:\s*([^"]+)"'), "address"),
    (re.compile(r'Plus code:\s*([^"]+)"'), "plus_code"),
    (re.compile(r'Phone:\s*([^"]+)"'), "phone"),
    (re.compile(r'Website:\s*([^"]+)"'), "website"),
)

# A line opening a *named* generic node, e.g. '  - generic "Jane Doe" [ref=e42]:'.
# Indentation and the list marker are optional; group 2 is whatever follows
# the colon on the same line.
_NAMED_GENERIC_RE = re.compile(r'^\s*(?:-\s*)?generic "([^"]+)" \[ref=e\d+\]:(.*)$')

_REVIEW_RATING_RE = re.compile(r'img "(\d)\s*stars?"')
_REVIEW_TIME_RE = re.compile(
    r'generic \[ref=e\d+\]: '
    r'((?:an?\s+|\d+\s*)(?:minute|hour|day|week|month|year)s?\s*ago)'
)
_AUTHOR_STATS_RE = re.compile(
    r'(' + _GROUPED_INT + r')\s*reviews?\s*·\s*(' + _GROUPED_INT + r')\s*photos?'
)
_REVIEW_TEXT_RE = re.compile(r'text: (.+)')
_REVIEW_PHOTO_RE = re.compile(r'Photo \d+ on .+\'s review')


def _grouped_int(raw: str) -> int:
    """Convert a string matched by ``_GROUPED_INT`` ("81,221") to an int."""
    return int(raw.replace(',', '').replace('.', ''))


def _parse_place_line(data: GoogleMapsScrapedData, line: str) -> None:
    """Update the place-level fields of *data* from one snapshot line."""
    # Place name: first level-1 heading only.
    if data.name == "Unknown" and 'heading "' in line and 'level=1' in line:
        heading = _HEADING_RE.search(line)
        if heading:
            data.name = heading.group(1)

    if 'img "' in line:
        # Overall rating, e.g. img "4.7 stars" (first occurrence wins).
        overall = _OVERALL_RATING_RE.search(line)
        if overall and data.rating is None:
            data.rating = float(overall.group(1))

        # Breakdown rows ('img "5 stars, 81,221 reviews"') also end in
        # "... reviews", so they are tested first; only a non-breakdown line
        # may supply the total, otherwise the 5-star count would be recorded
        # as the total review count.
        breakdown = _BREAKDOWN_RE.search(line)
        if breakdown:
            if data.review_breakdown is None:
                data.review_breakdown = ReviewBreakdown()
            attr = _STAR_ATTRS.get(int(breakdown.group(1)))
            if attr is not None:
                setattr(data.review_breakdown, attr, _grouped_int(breakdown.group(2)))
        elif data.total_reviews is None:
            total = _TOTAL_REVIEWS_RE.search(line)
            if total:
                data.total_reviews = _grouped_int(total.group(1))

    # Live busyness, e.g. "Currently 28% busy, usually 41% busy".
    busy = _BUSYNESS_RE.search(line)
    if busy:
        data.live_busyness = int(busy.group(1))

    # Address / plus code / phone / website labels.
    for pattern, attr in _LABELLED_FIELD_RES:
        labelled = pattern.search(line)
        if labelled:
            setattr(data, attr, labelled.group(1).strip())

    # Opening state, e.g. 'button "Open · Closes 5 pm'.
    hours = _HOURS_RE.search(line)
    if hours:
        data.hours_text = f"{hours.group(1)} · {hours.group(2)} {hours.group(3)}"
        data.is_open_now = hours.group(1).lower() == 'open'

    # Review topic, e.g. 'radio "rembrandt, mentioned in 2,834 reviews"'.
    topic = _TOPIC_RE.search(line)
    if topic:
        data.review_topics[topic.group(1).strip().lower()] = _grouped_int(topic.group(2))

    # Related place, e.g.
    # 'link "Van Gogh Museum · 4.6 stars · 101,806 reviews · Art museum"'.
    related = _RELATED_RE.search(line)
    if related:
        data.related_places.append(
            RelatedPlace(
                name=related.group(1).strip(),
                rating=float(related.group(2)),
                review_count=_grouped_int(related.group(3)),
                place_type=related.group(4).strip(),
            )
        )


def _split_named_generic_blocks(lines: List[str]) -> List[tuple]:
    """Group snapshot lines into ``(name, body)`` pairs per named generic node.

    A block starts at a line that consists only of a named generic header and
    runs until the next named generic line at any indentation.  Working line
    by line (rather than with a lookahead anchored at column 0) lets an
    indented tree split into one block per author.
    """
    blocks: List[tuple] = []
    current_name: Optional[str] = None
    body: List[str] = []
    for line in lines:
        header = _NAMED_GENERIC_RE.match(line)
        if header is None:
            if current_name is not None:
                body.append(line)
            continue
        if current_name is not None:
            blocks.append((current_name, "\n".join(body)))
        # A named generic with inline text ends the current block but does
        # not open a new one.
        current_name = header.group(1) if not header.group(2).strip() else None
        body = []
    if current_name is not None:
        blocks.append((current_name, "\n".join(body)))
    return blocks


def _review_from_block(author_name: str, block: str) -> Optional[ReviewData]:
    """Build a ReviewData from one author block, or None if it is not a review."""
    # Skip non-review blocks.
    if 'Local Guide' not in block and 'reviews' not in block.lower():
        return None
    rating = _REVIEW_RATING_RE.search(block)
    if rating is None:  # only blocks carrying a star rating count as reviews
        return None

    review = ReviewData(author_name=author_name, rating=int(rating.group(1)))

    posted = _REVIEW_TIME_RE.search(block)
    if posted:
        review.relative_time = posted.group(1)

    review.local_guide = 'Local Guide' in block

    stats = _AUTHOR_STATS_RE.search(block)
    if stats:
        review.reviews_by_author = _grouped_int(stats.group(1))
        review.photos_by_author = _grouped_int(stats.group(2))

    # The review body is taken to be the longest "text:" line in the block.
    text_lines = _REVIEW_TEXT_RE.findall(block)
    if text_lines:
        review.text = max(text_lines, key=len)

    review.photos_count = len(_REVIEW_PHOTO_RE.findall(block))
    return review


def parse_snapshot_for_reviews(snapshot_text: str) -> GoogleMapsScrapedData:
    """
    Parse a Playwright accessibility snapshot for Google Maps data.

    This parser is designed to work with the YAML-like accessibility tree format
    returned by MCP Playwright browser_snapshot tool.

    Place-level fields (name, rating, review counts, star breakdown, busyness,
    address, plus code, phone, website, hours, topics, related places) are
    read line by line.  Coordinates and the place URL are searched in the whole
    text.  Individual reviews are read from the blocks that follow each named
    ``generic "<author>" [ref=eN]:`` line; a block is kept only when it
    mentions "Local Guide" or "reviews" and contains a star rating.

    Args:
        snapshot_text: The accessibility tree text from Playwright

    Returns:
        Parsed GoogleMapsScrapedData; ``name`` stays "Unknown" when no
        level-1 heading is found.
    """
    data = GoogleMapsScrapedData(name="Unknown")
    lines = snapshot_text.split('\n')

    for line in lines:
        _parse_place_line(data, line)

    # Extract coordinates from URL if present ("@52.3599976,4.8852188").
    coords = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', snapshot_text)
    if coords:
        data.latitude = float(coords.group(1))
        data.longitude = float(coords.group(2))

    # Extract Google Maps URL.
    maps_url = re.search(r'https://www\.google\.com/maps/place/[^\s\'"]+', snapshot_text)
    if maps_url:
        data.google_maps_url = maps_url.group(0)

    # Parse individual reviews from generic blocks with author names.
    for author_name, block in _split_named_generic_blocks(lines):
        review = _review_from_block(author_name, block)
        if review is not None:
            data.reviews.append(review)

    return data
def extract_review_from_snapshot_block(block: str) -> Optional[ReviewData]:
    """Extract a single review from a snapshot text block.

    For every field the first matching line wins.  Later lines (review text or
    an owner reply mentioning "5 stars" or "3 days ago") therefore cannot
    overwrite values already taken from the review header.

    Args:
        block: Snapshot lines belonging to one review.

    Returns:
        The populated ReviewData, or None when no author name (first quoted
        string on a line containing "link") could be found.
    """
    review = ReviewData(author_name="Unknown")
    lines = block.strip().split('\n')

    for line in lines:
        lowered = line.lower()

        # Author name - typically first link text
        if review.author_name == "Unknown" and 'link' in lowered:
            name_match = re.search(r'"([^"]+)"', line)
            if name_match:
                review.author_name = name_match.group(1)

        # Rating: a lone digit 1-5 before "star(s)".  The look-behind keeps
        # "10 stars" or "4.5 stars" from being read as 0 or 5.
        if review.rating is None:
            rating_match = re.search(r'(?<![\d.,])([1-5])\s*stars?', line)
            if rating_match:
                review.rating = int(rating_match.group(1))

        # Relative time such as "3 weeks ago", "a month ago", "an hour ago"
        if review.relative_time is None:
            time_match = re.search(
                r'((?:\d+\s*|an?\s+)(?:minute|hour|day|week|month|year)s?\s*ago)',
                line,
                re.IGNORECASE,
            )
            if time_match:
                review.relative_time = time_match.group(1)

        # Local guide
        if 'local guide' in lowered:
            review.local_guide = True

        # Reviews/photos by author
        if review.reviews_by_author is None:
            author_stats = re.search(r'(\d+)\s*reviews?.*?(\d+)\s*photos?', line, re.IGNORECASE)
            if author_stats:
                review.reviews_by_author = int(author_stats.group(1))
                review.photos_by_author = int(author_stats.group(2))

    # Review text: longest line over 50 characters that mentions neither
    # "stars" nor "reviews" (those are metadata lines).
    text_candidates = [
        line
        for line in lines
        if len(line) > 50 and 'stars' not in line.lower() and 'reviews' not in line.lower()
    ]
    if text_candidates:
        review.text = max(text_candidates, key=len)

    return review if review.author_name != "Unknown" else None
def to_json_serializable(data: "GoogleMapsScrapedData") -> Dict[str, Any]:
    """Convert scraped place data to a JSON-serializable dict.

    ``name``, ``scrape_timestamp`` and ``scrape_method`` are always present.
    Every other key is emitted only when its value is known:

    * text fields and collections are skipped when empty;
    * numeric and boolean fields are skipped only when ``None``, so genuine
      zeros (latitude/longitude 0.0, 0% busyness, 0 reviews) are kept.

    Args:
        data: The scraped place record.

    Returns:
        A dict containing only str/int/float/bool/None, lists and dicts.
    """
    result: Dict[str, Any] = {
        "name": data.name,
        "scrape_timestamp": data.scrape_timestamp,
        "scrape_method": data.scrape_method,
    }

    # Optional text fields: emitted when non-empty.
    for attr in ("place_id", "google_maps_url", "address"):
        value = getattr(data, attr)
        if value:
            result[attr] = value

    # 0.0 is a valid coordinate, so compare against None rather than truthiness.
    if data.latitude is not None and data.longitude is not None:
        result["coordinates"] = {"latitude": data.latitude, "longitude": data.longitude}

    for attr in ("plus_code", "phone", "website"):
        value = getattr(data, attr)
        if value:
            result[attr] = value

    for attr in ("rating", "total_reviews"):
        value = getattr(data, attr)
        if value is not None:
            result[attr] = value

    breakdown = data.review_breakdown
    if breakdown is not None:
        result["review_breakdown"] = {
            "5_star": breakdown.five_star,
            "4_star": breakdown.four_star,
            "3_star": breakdown.three_star,
            "2_star": breakdown.two_star,
            "1_star": breakdown.one_star,
        }

    if data.hours_text:
        result["hours_text"] = data.hours_text
    if data.is_open_now is not None:
        result["is_open_now"] = data.is_open_now
    if data.review_topics:
        result["review_topics"] = data.review_topics

    if data.reviews:
        result["reviews"] = [
            {
                "author_name": r.author_name,
                "rating": r.rating,
                "text": r.text,
                "relative_time": r.relative_time,
                "photos_count": r.photos_count,
                "local_guide": r.local_guide,
                "reviews_by_author": r.reviews_by_author,
                "photos_by_author": r.photos_by_author,
            }
            for r in data.reviews
        ]

    if data.related_places:
        result["related_places"] = [
            {
                "name": p.name,
                "rating": p.rating,
                "review_count": p.review_count,
                "place_type": p.place_type,
            }
            for p in data.related_places
        ]

    if data.photo_count is not None:
        result["photo_count"] = data.photo_count
    # 0% busy is a real reading, not "missing".
    if data.live_busyness is not None:
        result["live_busyness_percent"] = data.live_busyness

    return result
# MCP Playwright helper functions
# These build the Google Maps URLs that are handed to the MCP navigation tool
def get_search_url(name: str, city: str, country: str = "") -> str:
    """Generate Google Maps search URL.

    The query is "<name> <city> <country>" with runs of whitespace collapsed,
    then percent-encoded so that characters with a meaning in URLs
    (``#``, ``?``, ``/``, ``&``, ``+``, non-ASCII letters) cannot truncate or
    alter it.  Spaces are encoded as ``+``.

    Args:
        name: Place or institution name.
        city: City the place is in.
        country: Optional country, appended when non-empty.

    Returns:
        A ``https://www.google.com/maps/search/...`` URL.
    """
    # Local import so this helper does not depend on a module-level import.
    from urllib.parse import quote_plus

    query = " ".join(f"{name} {city} {country}".split())
    return f"https://www.google.com/maps/search/{quote_plus(query)}"
def get_place_url_from_place_id(place_id: str) -> str:
    """Build the Google Maps place URL that looks a place up by its place ID.

    The ID is appended verbatim after ``place_id:``; no encoding is applied.
    """
    template = "https://www.google.com/maps/place/?q=place_id:{}"
    return template.format(place_id)
# Human-readable walkthrough for driving the scrape with MCP Playwright tools.
# main() prints this text verbatim when the script is run with --instructions.
MCP_INSTRUCTIONS = """
## How to scrape Google Maps with MCP Playwright
1. Navigate to the place:
playwright_browser_navigate(url="https://www.google.com/maps/search/Rijksmuseum+Amsterdam")
2. Wait for the page to load:
playwright_browser_wait_for(time=3)
3. Take a snapshot to get the accessibility tree:
playwright_browser_snapshot()
4. To get more reviews, click the reviews section:
playwright_browser_click(element="Reviews", ref="<ref from snapshot>")
5. Scroll to load more reviews:
playwright_browser_evaluate(function="() => { document.querySelector('[role=main]').scrollTo(0, 10000) }")
6. Take another snapshot to get loaded reviews:
playwright_browser_snapshot()
7. Parse the snapshot using parse_snapshot_for_reviews()
## Data Available:
- Rating and total review count
- Review breakdown by stars (5★, 4★, 3★, 2★, 1★)
- Individual reviews with author, date, text, photos
- Review topics (e.g., "rembrandt (2,834)")
- Popular times / live busyness
- Related places
- Hours, phone, website
- Address
"""
def main(argv: Optional[List[str]] = None) -> int:
    """Print the MCP Playwright steps needed to scrape one place.

    The target is chosen in this order: ``--url``, ``--place-id``, then
    ``--name`` together with ``--city``.  Nothing is fetched and no file is
    written; the function only prints instructions.

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) lets argparse read ``sys.argv[1:]``.

    Returns:
        Exit status: 0 on success, 1 when no usable target was supplied.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    import sys  # local import: only needed for the stderr notes below

    parser = argparse.ArgumentParser(description="Scrape Google Maps reviews with Playwright")
    parser.add_argument("--name", help="Institution name")
    parser.add_argument("--city", help="City name")
    parser.add_argument("--country", default="", help="Country name (optional)")
    parser.add_argument("--url", help="Direct Google Maps URL")
    parser.add_argument("--place-id", help="Google Place ID")
    parser.add_argument("--ghcid", help="GHCID to look up institution")
    parser.add_argument("--output", help="Output JSON file path")
    parser.add_argument("--instructions", action="store_true", help="Show MCP Playwright instructions")
    args = parser.parse_args(argv)

    if args.instructions:
        print(MCP_INSTRUCTIONS)
        return 0

    # Both options are accepted by the parser but nothing below consumes
    # them; say so on stderr instead of dropping them silently.
    if args.ghcid:
        print("Note: --ghcid lookup is not implemented here; the value is ignored.", file=sys.stderr)
    if args.output:
        print("Note: --output is ignored; this command only prints instructions.", file=sys.stderr)

    if args.url:
        url = args.url
    elif args.place_id:
        url = get_place_url_from_place_id(args.place_id)
    elif args.name and args.city:
        url = get_search_url(args.name, args.city, args.country)
    else:
        print("Please provide --url, --place-id, or --name and --city")
        print("\nFor MCP Playwright usage instructions, run with --instructions")
        return 1

    print(f"Use MCP Playwright to navigate to: {url}")
    print("\nRun the following MCP Playwright commands:")
    print("1. playwright_browser_navigate(url=<url above>)")
    print("2. playwright_browser_wait_for(time=3)")
    print("3. playwright_browser_snapshot()")
    print("\nThen parse the snapshot with this script's parse_snapshot_for_reviews() function")
    return 0
if __name__ == "__main__":
    # Raise SystemExit directly: the bare ``exit`` helper is injected by the
    # ``site`` module and is unavailable when Python runs with -S or embedded.
    raise SystemExit(main())