glam/scripts/enrich_custodians_google_maps_playwright.py
2025-12-09 10:46:43 +01:00

305 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Batch enrich custodians with Google Maps data using Playwright.
This script reads custodian YAML files and enriches them with comprehensive
Google Maps data scraped via Playwright, including:
- Full review breakdown (all star ratings)
- Review topics (what people talk about)
- Popular times / live busyness
- Related places
- Detailed reviews (more than API's 5 limit)
Usage:
# Enrich a single custodian by GHCID
python scripts/enrich_custodians_google_maps_playwright.py --ghcid NL-NH-AMS-M-RM
# Enrich multiple custodians
python scripts/enrich_custodians_google_maps_playwright.py --limit 10
# Dry run (show what would be done)
python scripts/enrich_custodians_google_maps_playwright.py --dry-run --limit 5
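Note:
This script does not drive a browser itself. It generates Playwright commands
(navigate, wait, snapshot) that should be executed via MCP Playwright tools
or an equivalent local Playwright session; the resulting snapshots are then
parsed and saved as a separate step.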
"""
import json
import yaml
import sys
import time
import asyncio
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, List, Dict, Any, Tuple
import logging
import argparse
# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent))
from scrape_google_maps_reviews import (
parse_snapshot_for_reviews,
to_json_serializable,
get_search_url,
GoogleMapsScrapedData,
)
# Timestamped INFO-level logging for the whole script run.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Configuration: both data locations sit under the directory that contains
# this script's own folder.
_BASE_DIR = Path(__file__).parent.parent
CUSTODIAN_DIR = _BASE_DIR / "data" / "custodian"
OUTPUT_DIR = _BASE_DIR / "data" / "google_maps_enrichment"
DELAY_BETWEEN_REQUESTS = 3  # seconds
def load_custodian(ghcid: str) -> Optional[Dict[str, Any]]:
    """
    Read the custodian record stored under the given GHCID.

    Args:
        ghcid: Identifier used as the YAML file stem inside CUSTODIAN_DIR.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file, or None (after
        logging a warning) when no file exists for this GHCID.
    """
    source = CUSTODIAN_DIR / f"{ghcid}.yaml"
    if source.exists():
        with source.open('r', encoding='utf-8') as handle:
            record = yaml.safe_load(handle)
        return record

    logger.warning(f"Custodian file not found: {source}")
    return None
# ISO 3166-1 alpha-2 codes mapped to the English names used in search queries.
_COUNTRY_NAMES = {
    'NL': 'Netherlands',
    'BE': 'Belgium',
    'DE': 'Germany',
    'FR': 'France',
    'GB': 'United Kingdom',
    'US': 'United States',
    'IT': 'Italy',
    'ES': 'Spain',
    'JP': 'Japan',
    'CN': 'China',
}


def _mapping_or_empty(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def get_custodian_search_query(custodian: Dict[str, Any]) -> Tuple[str, str, str]:
    """
    Extract search query components from custodian data.

    YAML keys that are present but empty load as None, so every nested
    section is normalised to a dict before it is queried; a null or
    unexpectedly-typed section is treated as missing instead of raising.

    Args:
        custodian: Parsed custodian record.

    Returns:
        Tuple of (name, city, country).
        - name: first non-empty value of custodian_name.emic_name, name,
          custodian_name.english_name, original_entry.organisatie;
          "Unknown" when none is set.
        - city: location.city (first entry when location is a list),
          falling back to ghcid.location_resolution.city_name; "" if absent.
        - country: location.country, with known two-letter codes expanded
          to their English name (case-insensitive); "" if absent.
    """
    custodian = _mapping_or_empty(custodian)

    # Get name - try various fields
    names = _mapping_or_empty(custodian.get('custodian_name'))
    original_entry = _mapping_or_empty(custodian.get('original_entry'))
    name = (
        names.get('emic_name')
        or custodian.get('name')
        or names.get('english_name')
        or original_entry.get('organisatie')
        or "Unknown"
    )

    # location may be a single mapping or a list of mappings; only the first
    # list entry is consulted.
    location = custodian.get('location')
    if isinstance(location, list):
        location = location[0] if location else None
    location = _mapping_or_empty(location)

    # Get city, falling back to the GHCID location resolution
    city = location.get('city') or ''
    if not city:
        ghcid_info = _mapping_or_empty(custodian.get('ghcid'))
        resolution = _mapping_or_empty(ghcid_info.get('location_resolution'))
        city = resolution.get('city_name') or ''

    # Get country and convert a country code to a name for better search
    country = location.get('country') or ''
    if isinstance(country, str):
        country = _COUNTRY_NAMES.get(country.upper(), country)

    return name, city, country
def find_custodians_to_enrich(
    limit: Optional[int] = None,
    skip_enriched: bool = True,
    filter_country: Optional[str] = None,
) -> List[str]:
    """
    Collect the GHCIDs of custodian files that still need Google Maps data.

    Files are visited in sorted filename order; the GHCID is the file stem.

    Args:
        limit: Stop once this many GHCIDs were collected (a falsy value
            means no cap).
        skip_enriched: When true, each file is parsed and left out if it
            already has a truthy ``google_maps_playwright_enrichment`` entry;
            files that fail to load are logged and left out as well.
        filter_country: Keep only GHCIDs starting with ``"<code>-"``.

    Returns:
        List of GHCIDs to enrich (empty when CUSTODIAN_DIR does not exist).
    """
    if not CUSTODIAN_DIR.exists():
        logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
        return []

    candidates = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    logger.info(f"Found {len(candidates)} custodian files")

    selected: List[str] = []
    for candidate in candidates:
        stem = candidate.stem

        # Country filter works on the GHCID prefix only.
        if filter_country:
            if not stem.startswith(filter_country + "-"):
                continue

        if skip_enriched:
            try:
                with candidate.open('r', encoding='utf-8') as handle:
                    record = yaml.safe_load(handle)
                has_enrichment = bool(
                    record and record.get('google_maps_playwright_enrichment')
                )
            except Exception as exc:
                logger.warning(f"Error reading {candidate}: {exc}")
                continue
            if has_enrichment:
                continue

        selected.append(stem)
        if limit and len(selected) >= limit:
            break

    return selected
def save_enrichment(
    ghcid: str,
    data: GoogleMapsScrapedData,
    custodian_path: Optional[Path] = None,
) -> None:
    """
    Save Google Maps enrichment data.

    The data is always written to ``<ghcid>_google_maps.json`` inside
    OUTPUT_DIR (created if missing). When ``custodian_path`` points to an
    existing file, the same data is additionally stored in that YAML file
    under the ``google_maps_playwright_enrichment`` key.

    Both documents are serialized in memory before the target file is opened
    for writing, so a serialization failure cannot leave a truncated file
    behind. This matters most for the custodian YAML, which is the source
    record being overwritten.

    Args:
        ghcid: Custodian identifier, used to name the JSON file.
        data: Scraped data; converted with ``to_json_serializable``.
        custodian_path: Optional custodian YAML file to merge into.

    Merge failures are logged and swallowed (best effort); failures while
    writing the JSON file propagate to the caller.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Save to JSON
    json_data = to_json_serializable(data)
    json_path = OUTPUT_DIR / f"{ghcid}_google_maps.json"
    json_text = json.dumps(json_data, indent=2, ensure_ascii=False)
    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json_text)
    logger.info(f"Saved enrichment to: {json_path}")

    # Optionally merge into custodian YAML
    if not (custodian_path and custodian_path.exists()):
        return

    try:
        with open(custodian_path, 'r', encoding='utf-8') as f:
            custodian = yaml.safe_load(f)

        # An empty file loads as None and other documents may be lists or
        # scalars; none of those can take the enrichment key.
        if not isinstance(custodian, dict):
            logger.error(
                f"Error merging into custodian: {custodian_path} does not contain a YAML mapping"
            )
            return

        custodian['google_maps_playwright_enrichment'] = json_data

        # Render first, write second: opening with 'w' truncates the file, so
        # it must not happen until the YAML text is known to be producible.
        yaml_text = yaml.dump(
            custodian,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
        with open(custodian_path, 'w', encoding='utf-8') as f:
            f.write(yaml_text)
        logger.info(f"Merged enrichment into: {custodian_path}")
    except Exception as e:
        # Best effort: the JSON copy above is already on disk.
        logger.error(f"Error merging into custodian: {e}")
def generate_mcp_commands(ghcids: List[str]) -> List[Dict[str, Any]]:
    """
    Build the MCP Playwright command plan for a batch of custodians.

    For every GHCID whose custodian record loads to a truthy value, one
    entry is produced holding the search components, the search URL and the
    tool calls to run against it (navigate, wait, snapshot). GHCIDs without
    a usable record are left out.

    Returns:
        A list of command dictionaries that can be executed via MCP.
    """
    batch: List[Dict[str, Any]] = []

    for identifier in ghcids:
        record = load_custodian(identifier)
        if record:
            name, city, country = get_custodian_search_query(record)
            search_url = get_search_url(name, city, country)

            # Tool calls in execution order.
            steps = [
                {"tool": "playwright_browser_navigate", "args": {"url": search_url}},
                {"tool": "playwright_browser_wait_for", "args": {"time": 3}},
                {"tool": "playwright_browser_snapshot", "args": {}},
            ]
            entry = {
                "ghcid": identifier,
                "name": name,
                "city": city,
                "country": country,
                "url": search_url,
                "commands": steps,
            }
            batch.append(entry)

    return batch
def main() -> int:
    """
    Command-line entry point: select custodians and print their MCP commands.

    Selection is either the single ``--ghcid`` or the result of
    ``find_custodians_to_enrich`` with the ``--limit`` / ``--country`` /
    skip-enriched settings. With ``--output-commands`` the generated commands
    are printed as JSON; otherwise a human-readable listing plus execution
    instructions is printed (``--dry-run`` hides the individual tool calls).

    ``--skip-enriched`` defaults to on, so on its own it could never be
    disabled; ``--no-skip-enriched`` is provided to include custodians that
    already carry enrichment data.

    Returns:
        Process exit status (always 0).
    """
    parser = argparse.ArgumentParser(
        description="Batch enrich custodians with Google Maps data via Playwright"
    )
    parser.add_argument("--ghcid", help="Enrich a single custodian by GHCID")
    parser.add_argument("--limit", type=int, default=10, help="Limit number of custodians")
    parser.add_argument("--country", help="Filter by country code (e.g., NL, BE, DE)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--output-commands", action="store_true", help="Output MCP commands as JSON")
    parser.add_argument("--skip-enriched", action="store_true", default=True, help="Skip already enriched")
    # Shares the destination with --skip-enriched so the default stays True
    # and existing invocations behave exactly as before.
    parser.add_argument(
        "--no-skip-enriched",
        dest="skip_enriched",
        action="store_false",
        default=True,
        help="Also include custodians that already have Google Maps enrichment",
    )
    args = parser.parse_args()

    # Find custodians to enrich
    if args.ghcid:
        ghcids = [args.ghcid]
    else:
        ghcids = find_custodians_to_enrich(
            limit=args.limit,
            skip_enriched=args.skip_enriched,
            filter_country=args.country,
        )

    if not ghcids:
        logger.info("No custodians to enrich")
        return 0

    logger.info(f"Found {len(ghcids)} custodians to enrich")

    # Generate MCP commands
    commands = generate_mcp_commands(ghcids)

    if args.output_commands:
        print(json.dumps(commands, indent=2))
        return 0

    # Print instructions for manual MCP execution
    print("\n" + "=" * 70)
    print("MCP Playwright Enrichment Commands")
    print("=" * 70)

    for position, cmd in enumerate(commands, start=1):
        print(f"\n{position}. {cmd['ghcid']} - {cmd['name']}")
        print(f" City: {cmd['city']}, Country: {cmd['country']}")
        print(f" URL: {cmd['url']}")
        if args.dry_run:
            print(" [DRY RUN - would execute MCP commands]")
        else:
            print(" Commands:")
            for c in cmd['commands']:
                print(f" {c['tool']}({c['args']})")

    print("\n" + "=" * 70)
    print("To execute these commands:")
    print("1. Use MCP Playwright tools in your IDE")
    print("2. Navigate to each URL, wait, and take snapshot")
    print("3. Parse snapshot with parse_snapshot_for_reviews()")
    print("4. Save result with save_enrichment()")
    print("=" * 70)

    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())