# glam/scripts/scrape_google_maps_reviews.py

#!/usr/bin/env python3
"""
Scrape Google Maps reviews using Playwright.

This script uses Playwright to scrape comprehensive review data from Google Maps,
overcoming the Google Places API limitation of only 5 reviews per place.

Features:
- Full review extraction (100+ reviews with scroll pagination)
- Review breakdown by star rating
- Popular times/busyness data
- Related places
- Review topics/themes
- Individual review metadata (author, date, photos, text)

Usage:
    python scripts/scrape_google_maps_reviews.py --name "Rijksmuseum" --city "Amsterdam"
    python scripts/scrape_google_maps_reviews.py --url "https://www.google.com/maps/place/..."
    python scripts/scrape_google_maps_reviews.py --ghcid "NL-NH-AMS-M-RM"

Output:
    JSON file with comprehensive Google Maps data

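Note:
    This script does not drive a browser itself. It prints the MCP Playwright
    commands to run interactively and parses the resulting accessibility
    snapshot with parse_snapshot_for_reviews().
    For batch processing, use the GoogleMapsPlaywrightScraper class directly
    (that class is not defined in this module).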
"""

import argparse
import asyncio
import json
import logging
import re
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any
from urllib.parse import quote, quote_plus

# Configure the root logger when this module is imported/run: INFO level with a
# "timestamp - level - message" prefix. Per the logging docs, basicConfig() does
# nothing if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@dataclass
class ReviewData:
    """A single user review extracted from a Google Maps snapshot.

    Only ``author_name`` is required; every other field defaults to an
    "unknown" value (``None``, ``0`` or ``False``) and is filled in by the
    parsers in this module when the corresponding text is found.
    """
    # Display name of the reviewer.
    author_name: str
    # Link to the reviewer's profile (not set by the parsers visible in this module).
    author_url: Optional[str] = None
    # Star rating given by the reviewer, parsed from text such as "5 stars".
    rating: Optional[int] = None
    # Body text of the review.
    text: Optional[str] = None
    # Age of the review as displayed, e.g. "2 weeks ago".
    relative_time: Optional[str] = None
    # Absolute timestamp (not set by the parsers visible in this module).
    absolute_time: Optional[str] = None
    # Number of photos attached to this review.
    photos_count: int = 0
    # Direct link to the review (not set by the parsers visible in this module).
    review_url: Optional[str] = None
    # True when the text "Local Guide" appears in the review block.
    local_guide: bool = False
    # Reviewer's total review count, from text like "12 reviews · 34 photos".
    reviews_by_author: Optional[int] = None
    # Reviewer's total photo count, from the same text.
    photos_by_author: Optional[int] = None


@dataclass
class ReviewBreakdown:
    """Number of reviews per star rating (the rating histogram).

    Each field holds the review count for that star value and defaults to 0
    when the corresponding histogram row was not found.
    """
    five_star: int = 0
    four_star: int = 0
    three_star: int = 0
    two_star: int = 0
    one_star: int = 0


@dataclass
class PopularTimes:
    """Popular-times (busyness) data for one day.

    NOTE(review): no parser visible in this module creates PopularTimes
    instances; the exact format of ``day`` is therefore not established here.
    """
    # Day label this record applies to.
    day: str
    hours: Dict[int, int] = field(default_factory=dict)  # hour -> busyness percentage


@dataclass
class RelatedPlace:
    """A related/nearby place listed alongside the main place.

    Built by parse_snapshot_for_reviews() from link text of the form
    "Van Gogh Museum · 4.6 stars · 101,806 reviews · Art museum".
    """
    # Place name (first segment of the link text).
    name: str
    # Average star rating.
    rating: Optional[float] = None
    # Total number of reviews.
    review_count: Optional[int] = None
    # Category label, e.g. "Art museum" (last segment of the link text).
    place_type: Optional[str] = None
    # Link to the place (not set by the parser visible in this module).
    url: Optional[str] = None


@dataclass
class GoogleMapsScrapedData:
    """Complete scraped data for one place on Google Maps.

    Only ``name`` is required. All other fields default to ``None`` or an
    empty container and are filled in as the corresponding information is
    found. Several fields (for example ``place_id``, ``price_level``,
    ``opening_hours``, ``popular_times``) are not populated by the snapshot
    parser visible in this module.
    """
    # Basic info
    name: str
    place_id: Optional[str] = None
    google_maps_url: Optional[str] = None

    # Location
    address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    plus_code: Optional[str] = None

    # Contact
    phone: Optional[str] = None
    website: Optional[str] = None

    # Business info
    business_status: Optional[str] = None  # OPERATIONAL, CLOSED_TEMPORARILY, etc.
    price_level: Optional[str] = None
    place_types: List[str] = field(default_factory=list)

    # Hours
    hours_text: Optional[str] = None  # e.g. "Open · Closes 5 pm"
    is_open_now: Optional[bool] = None
    opening_hours: Optional[Dict[str, str]] = None

    # Ratings
    rating: Optional[float] = None
    total_reviews: Optional[int] = None
    review_breakdown: Optional[ReviewBreakdown] = None

    # Reviews
    reviews: List[ReviewData] = field(default_factory=list)
    review_topics: Dict[str, int] = field(default_factory=dict)  # topic -> count

    # Popular times
    popular_times: List[PopularTimes] = field(default_factory=list)
    live_busyness: Optional[int] = None  # current busyness percentage
    typical_visit_duration: Optional[str] = None

    # Related
    related_places: List[RelatedPlace] = field(default_factory=list)

    # Photos
    photo_count: Optional[int] = None
    photo_categories: Dict[str, int] = field(default_factory=dict)

    # Accessibility
    accessibility_features: List[str] = field(default_factory=list)

    # Metadata
    # UTC timestamp (ISO 8601) taken when the instance is created.
    scrape_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    scrape_method: str = "playwright"


# Maps the star value of a histogram row to the ReviewBreakdown field storing it.
_STAR_FIELDS = {
    5: "five_star",
    4: "four_star",
    3: "three_star",
    2: "two_star",
    1: "one_star",
}

# Labelled buttons/links of the form ``button "Address: Museumstraat 1"``:
# (label searched for in the line, GoogleMapsScrapedData attribute to set).
_LABELLED_FIELDS = (
    ("Address:", "address"),
    ("Plus code:", "plus_code"),
    ("Phone:", "phone"),
    ("Website:", "website"),
)

# A named node such as ``- generic "Jane Doe" [ref=e12]:``. Group 1 is the
# quoted name, group 2 is whatever follows the colon on the same line.
_NAMED_GENERIC_RE = re.compile(r'generic "([^"]+)" \[ref=e\d+\]:(.*)')

# Relative review age inside an unnamed generic node, e.g. "a week ago",
# "an hour ago", "3 months ago".
_REVIEW_TIME_RE = re.compile(
    r'generic \[ref=e\d+\]: '
    r'((?:an?\s+|\d+\s*)(?:minute|hour|day|week|month|year)s?\s*ago)'
)


def _parse_count(raw: str) -> int:
    """Convert a formatted count such as "108,805" or "108.805" to an int.

    Callers capture the text with a pattern that starts with a digit, so the
    string left after removing separators is never empty.
    """
    return int(raw.replace(',', '').replace('.', ''))


def _parse_img_line(line: str, data: GoogleMapsScrapedData) -> None:
    """Update rating, review totals, histogram and busyness from an img line."""
    # Histogram row - "5 stars, 81,221 reviews". Checked first because such a
    # row also satisfies the total-review pattern below and must not be
    # mistaken for the place's overall review count.
    breakdown_match = re.search(
        r'img "([1-5])\s*stars?,?\s*(\d[\d,.]*)\s*reviews?"', line
    )
    if breakdown_match:
        if data.review_breakdown is None:
            data.review_breakdown = ReviewBreakdown()
        setattr(
            data.review_breakdown,
            _STAR_FIELDS[int(breakdown_match.group(1))],
            _parse_count(breakdown_match.group(2)),
        )
    else:
        # Overall rating - "4.7 stars". First match wins.
        rating_match = re.search(r'img "(\d+\.?\d*)\s*stars?"', line)
        if rating_match and data.rating is None:
            data.rating = float(rating_match.group(1))

        # Overall review count - "108,805 reviews". First match wins.
        total_match = re.search(r'img "[^"]*?(\d[\d,.]*)\s*[Rr]eviews?"', line)
        if total_match and data.total_reviews is None:
            data.total_reviews = _parse_count(total_match.group(1))

    # Busyness data - "Currently 28% busy, usually 41% busy"
    busyness_match = re.search(r'Currently (\d+)% busy', line)
    if busyness_match:
        data.live_busyness = int(busyness_match.group(1))


def _split_review_blocks(snapshot_text: str) -> List[tuple]:
    """Split the snapshot into (name, body) pairs, one per named generic node.

    A block starts at a line that ends with ``generic "<name>" [ref=eN]:`` and
    runs until the next line containing a named generic node or the end of
    the text. Matching is done per line, so indentation and the leading
    ``- `` of the YAML-like tree do not prevent a block from being closed.
    """
    blocks = []
    current_name = None
    current_lines: List[str] = []

    for line in snapshot_text.split('\n'):
        header = _NAMED_GENERIC_RE.search(line)
        if header is None:
            if current_name is not None:
                current_lines.append(line)
            continue

        # Any named generic closes the block that was being collected.
        if current_name is not None:
            blocks.append((current_name, '\n'.join(current_lines)))
        # Only a node with nothing after the colon is a container with
        # children; an inline-valued node does not open a new block.
        current_name = header.group(1) if not header.group(2).strip() else None
        current_lines = []

    if current_name is not None:
        blocks.append((current_name, '\n'.join(current_lines)))
    return blocks


def _parse_review_block(author_name: str, block: str) -> Optional[ReviewData]:
    """Build a ReviewData from one named block, or None if it is not a review."""
    # Skip containers that show no sign of reviewer metadata.
    if 'Local Guide' not in block and 'review' not in block.lower():
        return None

    # A block without a star rating is not treated as a review.
    rating_match = re.search(r'img "(\d)\s*stars?"', block)
    if rating_match is None:
        return None

    review = ReviewData(author_name=author_name, rating=int(rating_match.group(1)))

    time_match = _REVIEW_TIME_RE.search(block)
    if time_match:
        review.relative_time = time_match.group(1)

    review.local_guide = 'Local Guide' in block

    # Author stats - "1,234 reviews · 56 photos" (separators allowed).
    stats_match = re.search(
        r'(\d[\d,.]*)\s*reviews?\s*·\s*(\d[\d,.]*)\s*photos?', block
    )
    if stats_match:
        review.reviews_by_author = _parse_count(stats_match.group(1))
        review.photos_by_author = _parse_count(stats_match.group(2))

    # Review text: the longest "text:" line in the block.
    text_lines = re.findall(r'text: (.+)', block)
    if text_lines:
        review.text = max(text_lines, key=len)

    # Photos attached to the review - "Photo 1 on Jane's review".
    review.photos_count = len(re.findall(r'Photo \d+ on .+\'s review', block))

    return review


def parse_snapshot_for_reviews(snapshot_text: str) -> GoogleMapsScrapedData:
    """
    Parse a Playwright accessibility snapshot for Google Maps data.

    This parser is designed to work with the YAML-like accessibility tree format
    returned by MCP Playwright browser_snapshot tool. It makes one pass over
    the lines for place-level fields (name, rating, review totals and
    histogram, address, plus code, phone, website, hours, review topics,
    related places), then extracts coordinates and the place URL from the
    whole text, and finally parses individual reviews from named ``generic``
    blocks.

    Args:
        snapshot_text: The accessibility tree text from Playwright

    Returns:
        Parsed GoogleMapsScrapedData. Fields that were not found keep their
        defaults; ``name`` stays "Unknown" when no level-1 heading is present.
    """
    data = GoogleMapsScrapedData(name="Unknown")

    for line in snapshot_text.split('\n'):
        # Extract place name from the first level-1 heading
        if 'heading "' in line and 'level=1' in line:
            name_match = re.search(r'heading "([^"]+)"', line)
            if name_match and data.name == "Unknown":
                data.name = name_match.group(1)

        # Rating, review counts, histogram and busyness live in img alt text
        if 'img "' in line:
            _parse_img_line(line, data)

        # Address / plus code / phone / website - 'button "Phone: 020 674 7000"'
        for label, attr in _LABELLED_FIELDS:
            if label in line:
                value_match = re.search(re.escape(label) + r'\s*([^"]+)"', line)
                if value_match:
                    setattr(data, attr, value_match.group(1).strip())

        # Hours from button - 'button "Open · Closes 5 pm'. Minutes ("5:30 pm")
        # and 24-hour times ("17:00") are accepted as well.
        if 'Open · Closes' in line or 'Closed · Opens' in line:
            hours_match = re.search(
                r'(Open|Closed)\s*·\s*(Closes|Opens)\s+'
                r'(\d+(?::\d{2})?\s*(?:am|pm)|\d{1,2}:\d{2})',
                line,
                re.IGNORECASE,
            )
            if hours_match:
                data.hours_text = f"{hours_match.group(1)} · {hours_match.group(2)} {hours_match.group(3)}"
                data.is_open_now = hours_match.group(1).lower() == 'open'

        # Review topics from radio buttons - 'radio "rembrandt, mentioned in 2,834 reviews"'
        topic_match = re.search(
            r'radio "([^,]+),\s*mentioned in (\d[\d,.]*) reviews?"', line
        )
        if topic_match:
            topic = topic_match.group(1).strip().lower()
            data.review_topics[topic] = _parse_count(topic_match.group(2))

        # Related places from links - 'link "Van Gogh Museum · 4.6 stars · 101,806 reviews · Art museum"'
        related_match = re.search(
            r'link "([^·]+)·\s*(\d+\.?\d*)\s*stars?\s*·\s*(\d[\d,.]*)\s*reviews?\s*·\s*([^"]+)"',
            line,
        )
        if related_match:
            data.related_places.append(
                RelatedPlace(
                    name=related_match.group(1).strip(),
                    rating=float(related_match.group(2)),
                    review_count=_parse_count(related_match.group(3)),
                    place_type=related_match.group(4).strip(),
                )
            )

    # Extract coordinates from URL if present ("@52.3599976,4.8852188")
    url_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', snapshot_text)
    if url_match:
        data.latitude = float(url_match.group(1))
        data.longitude = float(url_match.group(2))

    # Extract Google Maps URL
    maps_url_match = re.search(r'https://www\.google\.com/maps/place/[^\s\'"]+', snapshot_text)
    if maps_url_match:
        data.google_maps_url = maps_url_match.group(0)

    # Parse individual reviews from generic blocks named after their author
    for author_name, block in _split_review_blocks(snapshot_text):
        review = _parse_review_block(author_name, block)
        if review is not None:
            data.reviews.append(review)

    return data


def extract_review_from_snapshot_block(block: str) -> Optional[ReviewData]:
    """Extract a single review from a snapshot text block.

    The block is scanned line by line. The author is the first quoted string
    on the first line containing "link"; rating, relative time and author
    statistics are taken from the last line on which each pattern matches.
    The review text is the longest line over 50 characters that mentions
    neither "stars" nor "reviews".

    Args:
        block: Snapshot text covering one review.

    Returns:
        The populated ReviewData, or None when no author name was found.
    """
    review = ReviewData(author_name="Unknown")

    lines = block.strip().split('\n')

    for line in lines:
        lowered = line.lower()

        # Author name - typically first link text
        if review.author_name == "Unknown" and 'link' in lowered:
            name_match = re.search(r'"([^"]+)"', line)
            if name_match:
                review.author_name = name_match.group(1)

        # Rating
        rating_match = re.search(r'(\d)\s*stars?', line)
        if rating_match:
            review.rating = int(rating_match.group(1))

        # Time - "3 weeks ago", "a month ago", "an hour ago"
        time_match = re.search(
            r'((?:\d+\s*|an?\s+)(?:minute|hour|day|week|month|year)s?\s*ago)',
            line,
            re.IGNORECASE,
        )
        if time_match:
            review.relative_time = time_match.group(1)

        # Local guide
        if 'local guide' in lowered:
            review.local_guide = True

        # Reviews/photos by author. Thousands separators are part of the
        # capture so "1,234 reviews" is read as 1234 rather than 234.
        author_stats = re.search(
            r'(\d[\d,.]*)\s*reviews?.*?(\d[\d,.]*)\s*photos?', line, re.IGNORECASE
        )
        if author_stats:
            review.reviews_by_author = int(
                author_stats.group(1).replace(',', '').replace('.', '')
            )
            review.photos_by_author = int(
                author_stats.group(2).replace(',', '').replace('.', '')
            )

    # Get review text - usually longest text block
    text_candidates = [
        line
        for line in lines
        if len(line) > 50 and 'stars' not in line.lower() and 'reviews' not in line.lower()
    ]
    if text_candidates:
        review.text = max(text_candidates, key=len)

    return review if review.author_name != "Unknown" else None


def to_json_serializable(data: GoogleMapsScrapedData) -> Dict[str, Any]:
    """Convert scraped place data to a JSON-serializable dict.

    ``name``, ``scrape_timestamp`` and ``scrape_method`` are always present.
    Every other key is emitted only when the corresponding field was found.
    Numeric fields are tested against ``None`` rather than for truthiness so
    that legitimate zero values (a coordinate of 0.0, 0% busyness, 0 reviews)
    are not dropped.

    Args:
        data: The scraped place data.

    Returns:
        A dict containing only str/int/float/bool/list/dict values.
    """
    result: Dict[str, Any] = {
        "name": data.name,
        "scrape_timestamp": data.scrape_timestamp,
        "scrape_method": data.scrape_method,
    }

    # Add optional fields if present
    if data.place_id:
        result["place_id"] = data.place_id
    if data.google_maps_url:
        result["google_maps_url"] = data.google_maps_url
    if data.address:
        result["address"] = data.address
    if data.plus_code:
        result["plus_code"] = data.plus_code
    if data.latitude is not None and data.longitude is not None:
        result["coordinates"] = {"latitude": data.latitude, "longitude": data.longitude}
    if data.phone:
        result["phone"] = data.phone
    if data.website:
        result["website"] = data.website
    if data.rating is not None:
        result["rating"] = data.rating
    if data.total_reviews is not None:
        result["total_reviews"] = data.total_reviews

    if data.review_breakdown is not None:
        breakdown = data.review_breakdown
        result["review_breakdown"] = {
            "5_star": breakdown.five_star,
            "4_star": breakdown.four_star,
            "3_star": breakdown.three_star,
            "2_star": breakdown.two_star,
            "1_star": breakdown.one_star,
        }

    if data.hours_text:
        result["hours_text"] = data.hours_text
    if data.is_open_now is not None:
        result["is_open_now"] = data.is_open_now

    if data.review_topics:
        result["review_topics"] = data.review_topics

    if data.reviews:
        result["reviews"] = [
            {
                "author_name": r.author_name,
                "rating": r.rating,
                "text": r.text,
                "relative_time": r.relative_time,
                "local_guide": r.local_guide,
                "reviews_by_author": r.reviews_by_author,
                "photos_by_author": r.photos_by_author,
                "photos_count": r.photos_count,
            }
            for r in data.reviews
        ]

    if data.related_places:
        result["related_places"] = [
            {
                "name": p.name,
                "rating": p.rating,
                "review_count": p.review_count,
                "place_type": p.place_type,
            }
            for p in data.related_places
        ]

    if data.photo_count is not None:
        result["photo_count"] = data.photo_count

    if data.live_busyness is not None:
        result["live_busyness_percent"] = data.live_busyness

    return result


# MCP Playwright helper functions
# These build the Google Maps URLs to pass to the MCP Playwright navigate tool

def get_search_url(name: str, city: str, country: str = "") -> str:
    """Generate a Google Maps search URL for a place.

    The name, city and optional country are joined into one query, runs of
    whitespace are collapsed, and the result is percent-encoded with spaces
    written as ``+`` so that characters such as ``&``, ``#``, ``/`` or
    non-ASCII letters cannot break the URL.

    Args:
        name: Place or institution name.
        city: City the place is in.
        country: Optional country name; omitted from the query when empty.

    Returns:
        A URL of the form ``https://www.google.com/maps/search/<query>``.
    """
    query = " ".join(f"{name} {city} {country}".split())
    return f"https://www.google.com/maps/search/{quote_plus(query)}"


def get_place_url_from_place_id(place_id: str) -> str:
    """Generate a Google Maps URL from a place ID.

    The ID is percent-encoded before being placed in the query string, so a
    value containing reserved characters cannot alter the URL structure.
    IDs made only of letters, digits, ``-``, ``_``, ``.`` and ``~`` are
    passed through unchanged.

    Args:
        place_id: Google Place ID.

    Returns:
        A URL of the form ``https://www.google.com/maps/place/?q=place_id:<id>``.
    """
    return f"https://www.google.com/maps/place/?q=place_id:{quote(place_id, safe='')}"


# Step-by-step instructions for scraping with the MCP Playwright tools.
# main() prints this text verbatim when the script is run with --instructions.
MCP_INSTRUCTIONS = """
## How to scrape Google Maps with MCP Playwright

1. Navigate to the place:
   playwright_browser_navigate(url="https://www.google.com/maps/search/Rijksmuseum+Amsterdam")

2. Wait for the page to load:
   playwright_browser_wait_for(time=3)

3. Take a snapshot to get the accessibility tree:
   playwright_browser_snapshot()

4. To get more reviews, click the reviews section:
   playwright_browser_click(element="Reviews", ref="<ref from snapshot>")

5. Scroll to load more reviews:
   playwright_browser_evaluate(function="() => { document.querySelector('[role=main]').scrollTo(0, 10000) }")

6. Take another snapshot to get loaded reviews:
   playwright_browser_snapshot()

7. Parse the snapshot using parse_snapshot_for_reviews()

## Data Available:
- Rating and total review count
- Review breakdown by stars (5★, 4★, 3★, 2★, 1★)
- Individual reviews with author, date, text, photos
- Review topics (e.g., "rembrandt (2,834)")
- Popular times / live busyness
- Related places
- Hours, phone, website
- Address
"""


def main() -> int:
    """Command-line entry point.

    Parses the arguments, works out which Google Maps URL to open (an explicit
    --url, a --place-id, or a --name/--city search, in that order of
    precedence) and prints the MCP Playwright commands to run. The script does
    not drive a browser or write any file itself.

    --ghcid and --output are accepted by the parser but no lookup or file
    output is implemented here, so both are reported explicitly instead of
    being ignored silently.

    Returns:
        0 on success (including --instructions), 1 when no usable target
        was given.
    """
    parser = argparse.ArgumentParser(description="Scrape Google Maps reviews with Playwright")
    parser.add_argument("--name", help="Institution name")
    parser.add_argument("--city", help="City name")
    parser.add_argument("--country", default="", help="Country name (optional)")
    parser.add_argument("--url", help="Direct Google Maps URL")
    parser.add_argument("--place-id", help="Google Place ID")
    parser.add_argument("--ghcid", help="GHCID to look up institution")
    parser.add_argument("--output", help="Output JSON file path")
    parser.add_argument("--instructions", action="store_true", help="Show MCP Playwright instructions")

    args = parser.parse_args()

    if args.instructions:
        print(MCP_INSTRUCTIONS)
        return 0

    if args.url:
        url = args.url
    elif args.place_id:
        url = get_place_url_from_place_id(args.place_id)
    elif args.name and args.city:
        url = get_search_url(args.name, args.city, args.country)
    else:
        if args.ghcid:
            # Tell the user why the advertised option did not produce a URL.
            print(f"GHCID lookup is not implemented in this script (got --ghcid {args.ghcid}).")
        print("Please provide --url, --place-id, or --name and --city")
        print("\nFor MCP Playwright usage instructions, run with --instructions")
        return 1

    print(f"Use MCP Playwright to navigate to: {url}")

    if args.output:
        # Nothing in this script writes a file; make that visible to the user.
        print(f"Note: --output is not written by this script; save the parsed result to {args.output} yourself.")

    print("\nRun the following MCP Playwright commands:")
    print("1. playwright_browser_navigate(url=<url above>)")
    print("2. playwright_browser_wait_for(time=3)")
    print("3. playwright_browser_snapshot()")
    print("\nThen parse the snapshot with this script's parse_snapshot_for_reviews() function")

    return 0


if __name__ == "__main__":
    # Use SystemExit directly: the bare exit() helper is injected by the site
    # module for interactive use and is not available under `python -S`.
    raise SystemExit(main())