305 lines
9.6 KiB
Python
305 lines
9.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Batch enrich custodians with Google Maps data using Playwright.
|
|
|
|
This script reads custodian YAML files and enriches them with comprehensive
|
|
Google Maps data scraped via Playwright, including:
|
|
- Full review breakdown (all star ratings)
|
|
- Review topics (what people talk about)
|
|
- Popular times / live busyness
|
|
- Related places
|
|
- Detailed reviews (more than API's 5 limit)
|
|
|
|
Usage:
|
|
# Enrich a single custodian by GHCID
|
|
python scripts/enrich_custodians_google_maps_playwright.py --ghcid NL-NH-AMS-M-RM
|
|
|
|
# Enrich multiple custodians
|
|
python scripts/enrich_custodians_google_maps_playwright.py --limit 10
|
|
|
|
# Dry run (show what would be done)
|
|
python scripts/enrich_custodians_google_maps_playwright.py --dry-run --limit 5
|
|
|
|
Note:
    This script does not drive a browser itself. It only generates the
    Playwright commands (navigate, wait, snapshot) for each custodian. Execute
    them via MCP Playwright tools (which must be running) or with a local
    Playwright installation.
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
import sys
|
|
import time
|
|
import asyncio
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, List, Dict, Any, Tuple
|
|
import logging
|
|
import argparse
|
|
|
|
# Add scripts to path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from scrape_google_maps_reviews import (
|
|
parse_snapshot_for_reviews,
|
|
to_json_serializable,
|
|
get_search_url,
|
|
GoogleMapsScrapedData,
|
|
)
|
|
|
|
# Emit timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Configuration
# Both data directories live under "data" in the parent of this file's directory.
_DATA_DIR = Path(__file__).parent.parent / "data"
CUSTODIAN_DIR = _DATA_DIR / "custodian"  # custodian records, one <GHCID>.yaml each
OUTPUT_DIR = _DATA_DIR / "google_maps_enrichment"  # JSON enrichment output
DELAY_BETWEEN_REQUESTS = 3  # seconds
|
|
|
|
|
|
def load_custodian(ghcid: str) -> Optional[Dict[str, Any]]:
    """Read and parse the YAML record of a single custodian.

    Args:
        ghcid: Custodian identifier; the record is looked up at
            ``CUSTODIAN_DIR/<ghcid>.yaml``.

    Returns:
        The parsed YAML content, or ``None`` when the file does not exist
        (a warning is logged in that case).
    """
    record_path = CUSTODIAN_DIR / f"{ghcid}.yaml"

    if record_path.exists():
        with open(record_path, 'r', encoding='utf-8') as handle:
            return yaml.safe_load(handle)

    logger.warning(f"Custodian file not found: {record_path}")
    return None
|
|
|
|
|
|
def _as_mapping(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def get_custodian_search_query(custodian: Dict[str, Any]) -> Tuple[str, str, str]:
    """
    Extract search query components from custodian data.

    Sections that are present but null or of an unexpected type (for example
    ``custodian_name: null`` in the YAML) are treated as missing rather than
    raising ``AttributeError``.

    Args:
        custodian: Parsed custodian record.

    Returns:
        Tuple of (name, city, country). ``name`` falls back to "Unknown";
        ``city`` and ``country`` fall back to an empty string. Known
        two-letter country codes are expanded to English country names;
        any other value is returned unchanged.
    """
    # Get name - try various fields, in order of preference
    names = _as_mapping(custodian.get('custodian_name'))
    name = (
        names.get('emic_name') or
        custodian.get('name') or
        names.get('english_name') or
        _as_mapping(custodian.get('original_entry')).get('organisatie') or
        "Unknown"
    )

    # 'location' may be a single mapping or a list of mappings; when it is
    # a list only its first entry is used.
    location = custodian.get('location')
    if isinstance(location, list) and location:
        location = location[0]
    location = _as_mapping(location)

    # Get city, falling back to the GHCID location resolution
    city = location.get('city') or ''
    if not city:
        ghcid_info = _as_mapping(custodian.get('ghcid'))
        resolution = _as_mapping(ghcid_info.get('location_resolution'))
        city = resolution.get('city_name') or ''

    # Get country
    country = location.get('country') or ''

    # Convert country code to name for better search
    country_names = {
        'NL': 'Netherlands',
        'BE': 'Belgium',
        'DE': 'Germany',
        'FR': 'France',
        'GB': 'United Kingdom',
        'US': 'United States',
        'IT': 'Italy',
        'ES': 'Spain',
        'JP': 'Japan',
        'CN': 'China',
    }
    country = country_names.get(country, country)

    return name, city, country
|
|
|
|
|
|
def find_custodians_to_enrich(
    limit: Optional[int] = None,
    skip_enriched: bool = True,
    filter_country: Optional[str] = None,
) -> List[str]:
    """
    Collect the GHCIDs of custodian files that still need enrichment.

    Custodian files are the ``*.yaml`` files in ``CUSTODIAN_DIR``, visited in
    sorted order; the GHCID is the file name without its extension.

    Args:
        limit: Stop after this many GHCIDs; a falsy value means no limit.
        skip_enriched: Leave out custodians whose YAML already has a
            non-empty ``google_maps_playwright_enrichment`` entry. Files that
            cannot be read or parsed are skipped with a warning.
        filter_country: Keep only GHCIDs starting with ``<filter_country>-``.

    Returns:
        List of GHCIDs to enrich (empty if the custodian directory is missing).
    """
    selected: List[str] = []

    if not CUSTODIAN_DIR.exists():
        logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
        return selected

    candidates = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    logger.info(f"Found {len(candidates)} custodian files")

    for path in candidates:
        identifier = path.stem

        # Country filter: the GHCID must start with "<code>-".
        if filter_country and not identifier.startswith(filter_country + "-"):
            continue

        # Enrichment check: requires reading the record itself.
        if skip_enriched:
            try:
                with open(path, 'r', encoding='utf-8') as handle:
                    record = yaml.safe_load(handle)
                already_enriched = bool(
                    record and record.get('google_maps_playwright_enrichment')
                )
            except Exception as e:
                logger.warning(f"Error reading {path}: {e}")
                continue
            if already_enriched:
                continue

        selected.append(identifier)

        if limit and len(selected) >= limit:
            break

    return selected
|
|
|
|
|
|
def save_enrichment(ghcid: str, data: GoogleMapsScrapedData, custodian_path: Optional[Path] = None) -> None:
    """
    Save Google Maps enrichment data.

    The data is always written to ``OUTPUT_DIR/<ghcid>_google_maps.json``.
    If ``custodian_path`` is given and exists, the same data is additionally
    merged into that custodian YAML file under the
    ``google_maps_playwright_enrichment`` key.

    Args:
        ghcid: Custodian identifier, used to name the JSON output file.
        data: Scraped data; converted with ``to_json_serializable`` first.
        custodian_path: Optional custodian YAML file to merge the data into.

    Merge failures are logged and not raised; the JSON file is kept.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Save to JSON
    json_data = to_json_serializable(data)
    json_path = OUTPUT_DIR / f"{ghcid}_google_maps.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)
    logger.info(f"Saved enrichment to: {json_path}")

    # Optionally merge into custodian YAML
    if not (custodian_path and custodian_path.exists()):
        return

    try:
        with open(custodian_path, 'r', encoding='utf-8') as f:
            custodian = yaml.safe_load(f)

        # An empty file parses to None and a top-level list or scalar cannot
        # take a key; leave such a file untouched and report it clearly.
        if not isinstance(custodian, dict):
            logger.error(
                f"Error merging into custodian: {custodian_path} does not contain a YAML mapping"
            )
            return

        custodian['google_maps_playwright_enrichment'] = json_data

        # Serialize before opening the file for writing: opening with 'w'
        # truncates it, so a dump failure must not happen after that point.
        serialized = yaml.dump(custodian, allow_unicode=True, default_flow_style=False, sort_keys=False)

        with open(custodian_path, 'w', encoding='utf-8') as f:
            f.write(serialized)
        logger.info(f"Merged enrichment into: {custodian_path}")
    except Exception as e:
        # Best effort: the JSON copy was already saved above.
        logger.error(f"Error merging into custodian: {e}")
|
|
|
|
|
|
def generate_mcp_commands(ghcids: List[str]) -> List[Dict[str, Any]]:
    """
    Build the MCP Playwright command sequence for each custodian.

    Custodians whose record cannot be loaded (or is empty) are left out.

    Args:
        ghcids: GHCIDs of the custodians to process.

    Returns:
        One dictionary per custodian with its ``ghcid``, ``name``, ``city``,
        ``country``, search ``url``, and the ordered ``commands``
        (navigate, wait, snapshot) to execute via MCP.
    """
    batch: List[Dict[str, Any]] = []

    for ghcid in ghcids:
        record = load_custodian(ghcid)
        if record:
            name, city, country = get_custodian_search_query(record)
            search_url = get_search_url(name, city, country)

            # Open the search page, wait, then capture the page snapshot.
            steps = [
                {"tool": "playwright_browser_navigate", "args": {"url": search_url}},
                {"tool": "playwright_browser_wait_for", "args": {"time": 3}},
                {"tool": "playwright_browser_snapshot", "args": {}},
            ]

            batch.append({
                "ghcid": ghcid,
                "name": name,
                "city": city,
                "country": country,
                "url": search_url,
                "commands": steps,
            })

    return batch
|
|
|
|
|
|
def main() -> int:
    """
    Command-line entry point: select custodians and emit their MCP commands.

    Selects either the single custodian given with ``--ghcid`` or a filtered
    batch, generates the Playwright command sequence for each, and prints
    them as JSON (``--output-commands``) or as a human-readable listing
    followed by execution instructions.

    Returns:
        Process exit code (always 0).
    """
    parser = argparse.ArgumentParser(
        description="Batch enrich custodians with Google Maps data via Playwright"
    )
    parser.add_argument("--ghcid", help="Enrich a single custodian by GHCID")
    parser.add_argument("--limit", type=int, default=10, help="Limit number of custodians")
    parser.add_argument("--country", help="Filter by country code (e.g., NL, BE, DE)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--output-commands", action="store_true", help="Output MCP commands as JSON")
    parser.add_argument("--skip-enriched", action="store_true", default=True, help="Skip already enriched")
    # --skip-enriched defaults to True, so on its own it can never be turned
    # off; this companion flag makes re-enrichment possible.
    parser.add_argument(
        "--no-skip-enriched",
        dest="skip_enriched",
        action="store_false",
        help="Include custodians that are already enriched",
    )

    args = parser.parse_args()

    # Find custodians to enrich
    if args.ghcid:
        ghcids = [args.ghcid]
    else:
        ghcids = find_custodians_to_enrich(
            limit=args.limit,
            skip_enriched=args.skip_enriched,
            filter_country=args.country,
        )

    if not ghcids:
        logger.info("No custodians to enrich")
        return 0

    logger.info(f"Found {len(ghcids)} custodians to enrich")

    # Generate MCP commands
    commands = generate_mcp_commands(ghcids)

    if args.output_commands:
        print(json.dumps(commands, indent=2))
        return 0

    # Print instructions for manual MCP execution
    print("\n" + "=" * 70)
    print("MCP Playwright Enrichment Commands")
    print("=" * 70)

    for i, cmd in enumerate(commands):
        print(f"\n{i+1}. {cmd['ghcid']} - {cmd['name']}")
        print(f" City: {cmd['city']}, Country: {cmd['country']}")
        print(f" URL: {cmd['url']}")
        if args.dry_run:
            print(" [DRY RUN - would execute MCP commands]")
        else:
            print(" Commands:")
            for c in cmd['commands']:
                print(f" {c['tool']}({c['args']})")

    print("\n" + "=" * 70)
    print("To execute these commands:")
    print("1. Use MCP Playwright tools in your IDE")
    print("2. Navigate to each URL, wait, and take snapshot")
    print("3. Parse snapshot with parse_snapshot_for_reviews()")
    print("4. Save result with save_enrichment()")
    print("=" * 70)

    return 0
|
|
|
|
|
|
# Run only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|