#!/usr/bin/env python3
"""
Enrich NDE Register entries with Google Maps/Places API (New) data.

This script reads the enriched NDE YAML entries and adds comprehensive data
from Google Places API (New), including:
- Coordinates (latitude/longitude)
- Place ID
- Formatted address
- Phone number
- Website
- Opening hours
- Reviews and ratings
- Photo references
- Business status
- Price level
- Types/categories
- Street View images

Usage:
    python scripts/enrich_nde_google_maps.py [--dry-run] [--limit N] [--force]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API (New) enabled
    GOOGLE_STREETVIEW_STATIC_TOKEN - Optional: API key for Street View Static API

Output:
    Updates YAML files in data/nde/enriched/entries/ with google_maps_enrichment section

API Limitations (Google Places API New):
- Reviews: Maximum 5 "most relevant" reviews per place (no pagination available)
- Photos: Maximum 10 photo references per place (no pagination available)

These are hard limits imposed by the Google Places API (New). The only way to
get more reviews would be the Google My Business API, which requires OAuth2
authentication as the business owner/manager - not applicable for third-party
heritage institutions.

See: https://developers.google.com/maps/documentation/places/web-service/place-details
See: https://featurable.com/blog/google-places-more-than-5-reviews
"""

import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
import logging
import argparse
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
GOOGLE_STREETVIEW_TOKEN = os.getenv("GOOGLE_STREETVIEW_STATIC_TOKEN", "")

if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

if not GOOGLE_STREETVIEW_TOKEN:
    logger.warning("GOOGLE_STREETVIEW_STATIC_TOKEN not set - Street View images will not be fetched")

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
PLACE_DETAILS_URL = "https://places.googleapis.com/v1/places"

# Rate limiting: Google Places API has varying limits depending on your plan
REQUEST_DELAY = 0.2  # 5 requests per second to be safe

# Fields to request from Places API (New)
# See: https://developers.google.com/maps/documentation/places/web-service/place-details
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "currentOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "reviews",
    "priceLevel",
    "photos",
    "googleMapsUri",
    "utcOffsetMinutes",
    "primaryType",
    "primaryTypeDisplayName",
    "shortFormattedAddress",
    "editorialSummary",
    "iconMaskBaseUri",
    "iconBackgroundColor",
]


@dataclass
class GoogleMapsEnrichment:
    """Container for all Google Maps data extracted for an entity."""
    place_id: str
    name: str
    formatted_address: Optional[str] = None
    short_address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    types: List[str] = field(default_factory=list)
    primary_type: Optional[str] = None
    business_status: Optional[str] = None

    # Contact info
    national_phone_number: Optional[str] = None
    international_phone_number: Optional[str] = None
    website: Optional[str] = None

    # Opening hours
    opening_hours: Optional[Dict[str, Any]] = None

    # Ratings and reviews
    rating: Optional[float] = None
    user_ratings_total: Optional[int] = None
    reviews: Optional[List[Dict[str, Any]]] = None
    price_level: Optional[str] = None

    # Photos
    photos: Optional[List[Dict[str, Any]]] = None
    photo_urls: Optional[List[str]] = None

    # Additional
    google_maps_url: Optional[str] = None
    street_view_url: Optional[str] = None
    editorial_summary: Optional[str] = None
    address_components: Optional[List[Dict[str, Any]]] = None
    utc_offset_minutes: Optional[int] = None
    icon_mask_uri: Optional[str] = None
    icon_background_color: Optional[str] = None

    # Metadata
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"


def search_place(
    query: str,
    client: httpx.Client,
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """
    Search for a place using the Text Search API (New).

    Args:
        query: Search query (organization name + address)
        client: HTTP client for making requests
        location_bias: Optional (lat, lng) to bias results toward Netherlands

    Returns:
        First matching place data if found, None otherwise
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        # Field mask must prefix each field with "places." for searchText responses
        "X-Goog-FieldMask": ",".join([f"places.{f}" for f in PLACE_FIELDS]),
    }

    body = {
        "textQuery": query,
        "languageCode": "nl",
        "regionCode": "NL",
        "maxResultCount": 1,
    }

    # Add location bias for Netherlands
    if location_bias:
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {
                    "latitude": lat,
                    "longitude": lng,
                },
                "radius": 50000.0  # 50km radius (max allowed by API)
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        data = response.json()
        places = data.get("places", [])
        if places:
            place = places[0]
            display_name = place.get("displayName", {}).get("text", "Unknown")
            logger.debug(f"Found place: {display_name}")
            return place
        else:
            logger.warning(f"No place found for query: {query}")
            return None
    except httpx.HTTPStatusError as e:
        # Try to surface the structured error body Google returns; fall back
        # to the exception text if the response is not JSON.
        error_data = {}
        try:
            error_data = e.response.json()
        except Exception:
            pass
        error_msg = error_data.get("error", {}).get("message", str(e))
        status = error_data.get("error", {}).get("status", "UNKNOWN")
        if status == "PERMISSION_DENIED":
            logger.error(f"API PERMISSION_DENIED: {error_msg}")
            logger.error("Please ensure the Places API (New) is enabled in your Google Cloud Console:")
            logger.error(" 1. Go to https://console.cloud.google.com/apis/library/places.googleapis.com")
            logger.error(" 2. Enable the 'Places API (New)'")
        elif "RESOURCE_EXHAUSTED" in str(e):
            logger.error("API quota exhausted. Please wait and try again later.")
        else:
            logger.error(f"HTTP error searching for place: {status} - {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None


def parse_place_data(place: Dict[str, Any]) -> GoogleMapsEnrichment:
    """
    Parse place data from Places API (New) response.

    Args:
        place: Place data from API response

    Returns:
        GoogleMapsEnrichment object with all available data
    """
    # Extract location
    location = place.get("location", {})
    lat = location.get("latitude")
    lng = location.get("longitude")

    # Extract display name
    display_name = place.get("displayName", {})
    name = display_name.get("text", "")

    # Extract opening hours
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "open_now": place.get("currentOpeningHours", {}).get("openNow"),
            "periods": opening_hours.get("periods"),
            "weekday_text": opening_hours.get("weekdayDescriptions"),
        }

    # Extract reviews - API hard limit: max 5 "most relevant" reviews (no pagination)
    # See docstring for details on this limitation
    reviews = place.get("reviews", [])
    if reviews:
        reviews = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "author_uri": r.get("authorAttribution", {}).get("uri"),
                "rating": r.get("rating"),
                "relative_time_description": r.get("relativePublishTimeDescription"),
                "text": r.get("text", {}).get("text"),
                "publish_time": r.get("publishTime"),
            }
            for r in reviews
        ]

    # Extract photos - API hard limit: max 10 photo references (no pagination)
    # See docstring for details on this limitation
    photos = place.get("photos", [])
    photo_urls = []
    photos_data = []
    if photos:
        for photo in photos:  # Get all photos
            photo_name = photo.get("name")
            if photo_name:
                # Generate photo URL (maxwidth 800)
                # NOTE(review): the raw API key is embedded in this URL and the
                # URLs are persisted into the output YAML files - consider
                # stripping the key and resolving photos at render time instead.
                photo_url = f"https://places.googleapis.com/v1/{photo_name}/media?maxWidthPx=800&key={GOOGLE_PLACES_TOKEN}"
                photo_urls.append(photo_url)
                photos_data.append({
                    "name": photo_name,
                    "height": photo.get("heightPx"),
                    "width": photo.get("widthPx"),
                    "author_attributions": photo.get("authorAttributions"),
                })

    # Generate Street View URL if coordinates available.
    # Use explicit None checks: 0.0 is a valid coordinate but falsy,
    # so a bare truthiness test would wrongly skip it.
    street_view_url = None
    if lat is not None and lng is not None and GOOGLE_STREETVIEW_TOKEN:
        street_view_url = (
            f"https://maps.googleapis.com/maps/api/streetview"
            f"?size=600x400&location={lat},{lng}&key={GOOGLE_STREETVIEW_TOKEN}"
        )

    # Extract editorial summary
    editorial_summary = None
    if place.get("editorialSummary"):
        editorial_summary = place["editorialSummary"].get("text")

    # Extract address components
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]

    enrichment = GoogleMapsEnrichment(
        place_id=place.get("id", ""),
        name=name,
        formatted_address=place.get("formattedAddress"),
        short_address=place.get("shortFormattedAddress"),
        latitude=lat,
        longitude=lng,
        types=place.get("types", []),
        primary_type=place.get("primaryType"),
        business_status=place.get("businessStatus"),
        national_phone_number=place.get("nationalPhoneNumber"),
        international_phone_number=place.get("internationalPhoneNumber"),
        website=place.get("websiteUri"),
        opening_hours=opening_hours,
        rating=place.get("rating"),
        user_ratings_total=place.get("userRatingCount"),
        reviews=reviews if reviews else None,
        price_level=place.get("priceLevel"),
        photos=photos_data if photos_data else None,
        photo_urls=photo_urls if photo_urls else None,
        google_maps_url=place.get("googleMapsUri"),
        street_view_url=street_view_url,
        editorial_summary=editorial_summary,
        address_components=address_components,
        utc_offset_minutes=place.get("utcOffsetMinutes"),
        icon_mask_uri=place.get("iconMaskBaseUri"),
        icon_background_color=place.get("iconBackgroundColor"),
        api_status="OK",
    )
    return enrichment


def enrichment_to_dict(enrichment: GoogleMapsEnrichment) -> Dict[str, Any]:
    """Convert GoogleMapsEnrichment to a dictionary for YAML output.

    Only populated fields are emitted so the YAML stays compact; keys are
    grouped by theme (location, contact, business, hours, ratings, photos,
    links, timezone).
    """
    result: Dict[str, Any] = {
        "place_id": enrichment.place_id,
        "name": enrichment.name,
        "fetch_timestamp": enrichment.fetch_timestamp,
        "api_status": enrichment.api_status,
    }

    # Location data
    if enrichment.latitude is not None and enrichment.longitude is not None:
        result["coordinates"] = {
            "latitude": enrichment.latitude,
            "longitude": enrichment.longitude,
        }
    if enrichment.formatted_address:
        result["formatted_address"] = enrichment.formatted_address
    if enrichment.short_address:
        result["short_address"] = enrichment.short_address
    if enrichment.address_components:
        result["address_components"] = enrichment.address_components

    # Contact info
    if enrichment.national_phone_number:
        result["phone_local"] = enrichment.national_phone_number
    if enrichment.international_phone_number:
        result["phone_international"] = enrichment.international_phone_number
    if enrichment.website:
        result["website"] = enrichment.website

    # Business info
    if enrichment.types:
        result["google_place_types"] = enrichment.types
    if enrichment.primary_type:
        result["primary_type"] = enrichment.primary_type
    if enrichment.business_status:
        result["business_status"] = enrichment.business_status

    # Opening hours
    if enrichment.opening_hours:
        result["opening_hours"] = enrichment.opening_hours

    # Ratings and reviews
    if enrichment.rating is not None:
        result["rating"] = enrichment.rating
    if enrichment.user_ratings_total is not None:
        result["total_ratings"] = enrichment.user_ratings_total
    if enrichment.reviews:
        result["reviews"] = enrichment.reviews
    if enrichment.price_level:
        result["price_level"] = enrichment.price_level

    # Editorial summary
    if enrichment.editorial_summary:
        result["editorial_summary"] = enrichment.editorial_summary

    # Photos
    if enrichment.photo_urls:
        result["photo_urls"] = enrichment.photo_urls
    if enrichment.photos:
        result["photos_metadata"] = enrichment.photos

    # Links
    if enrichment.google_maps_url:
        result["google_maps_url"] = enrichment.google_maps_url
    if enrichment.street_view_url:
        result["street_view_url"] = enrichment.street_view_url
    if enrichment.icon_mask_uri:
        result["icon_mask_uri"] = enrichment.icon_mask_uri
    if enrichment.icon_background_color:
        result["icon_background_color"] = enrichment.icon_background_color

    # Timezone
    if enrichment.utc_offset_minutes is not None:
        result["utc_offset_minutes"] = enrichment.utc_offset_minutes

    return result


def build_search_query(entry: Dict[str, Any]) -> str:
    """
    Build a search query from an entry's data.

    Args:
        entry: The YAML entry data

    Returns:
        Search query string
    """
    original = entry.get("original_entry", {})

    # Get organization name
    org_name = original.get("organisatie", "")

    # Get address components
    street = original.get("straat_en_huisnummer_bezoekadres", "")
    city = original.get("plaatsnaam_bezoekadres", "")

    # Build query - organization name is most important
    query_parts = [org_name]
    if street:
        query_parts.append(street)
    if city:
        query_parts.append(city)

    # Add Netherlands to disambiguate
    query_parts.append("Netherlands")

    return ", ".join(filter(None, query_parts))


def should_enrich(entry: Dict[str, Any], force: bool = False) -> bool:
    """
    Determine if an entry should be enriched with Google Maps data.

    Args:
        entry: The YAML entry data
        force: If True, re-enrich even if already enriched

    Returns:
        True if the entry should be enriched
    """
    # Skip if already has Google Maps enrichment (unless force)
    if not force and entry.get("google_maps_enrichment"):
        return False

    # Skip entries that were previously marked as not found
    if not force and entry.get("google_maps_status") == "NOT_FOUND":
        return False

    # Need original entry data to build search query
    original = entry.get("original_entry", {})
    if not original:
        return False

    # Need at least an organization name
    if not original.get("organisatie"):
        return False

    return True


def enrich_entry(
    entry: Dict[str, Any],
    client: httpx.Client,
) -> Tuple[Dict[str, Any], bool]:
    """
    Enrich a single entry with Google Maps data.

    Args:
        entry: The YAML entry data
        client: HTTP client for making requests

    Returns:
        Tuple of (enriched_entry, was_modified)
    """
    # Build search query
    query = build_search_query(entry)
    logger.info(f"Searching for: {query}")

    # Netherlands center for location bias
    NL_CENTER = (52.1326, 5.2913)

    # Search for place
    place = search_place(query, client, location_bias=NL_CENTER)

    if not place:
        # Mark as not found so subsequent runs skip it (unless --force)
        entry["google_maps_status"] = "NOT_FOUND"
        entry["google_maps_search_query"] = query
        entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        return entry, True

    # Parse place data
    enrichment = parse_place_data(place)

    # Add enrichment data
    entry["google_maps_enrichment"] = enrichment_to_dict(enrichment)
    entry["google_maps_status"] = "SUCCESS"
    entry["google_maps_search_query"] = query

    logger.info(f" → Found: {enrichment.name} ({enrichment.rating or 'no rating'}★, {enrichment.user_ratings_total or 0} reviews)")

    return entry, True


def process_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
    force: bool = False,
) -> Dict[str, int]:
    """
    Process all entry files in the directory.

    Args:
        entries_dir: Path to entries directory
        dry_run: If True, don't save changes
        limit: Optional limit on number of entries to process
        force: If True, re-enrich even if already enriched

    Returns:
        Statistics dictionary
    """
    stats = {
        "total_files": 0,
        "already_enriched": 0,
        "newly_enriched": 0,
        "not_found": 0,
        "errors": 0,
        "skipped": 0,
    }

    # Get all YAML files
    yaml_files = sorted(entries_dir.glob("*.yaml"))
    stats["total_files"] = len(yaml_files)

    if limit:
        yaml_files = yaml_files[:limit]

    logger.info(f"Processing {len(yaml_files)} files (limit: {limit or 'none'})")

    with httpx.Client(timeout=30.0) as client:
        for yaml_file in yaml_files:
            try:
                # Load entry
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    entry = yaml.safe_load(f)

                if not entry:
                    stats["skipped"] += 1
                    continue

                # Check if should enrich
                if not should_enrich(entry, force=force):
                    if entry.get("google_maps_enrichment"):
                        stats["already_enriched"] += 1
                    else:
                        stats["skipped"] += 1
                    continue

                # Enrich
                logger.info(f"\nProcessing: {yaml_file.name}")
                enriched_entry, was_modified = enrich_entry(entry, client)

                if was_modified:
                    if enriched_entry.get("google_maps_status") == "SUCCESS":
                        stats["newly_enriched"] += 1
                    elif enriched_entry.get("google_maps_status") == "NOT_FOUND":
                        stats["not_found"] += 1
                    else:
                        stats["errors"] += 1

                    # Save if not dry run
                    if not dry_run:
                        with open(yaml_file, 'w', encoding='utf-8') as f:
                            yaml.dump(enriched_entry, f, allow_unicode=True,
                                      default_flow_style=False, sort_keys=False)
                        logger.debug(f"Saved: {yaml_file.name}")

                # Rate limiting
                time.sleep(REQUEST_DELAY)

            except Exception as e:
                logger.error(f"Error processing {yaml_file.name}: {e}")
                stats["errors"] += 1

    return stats


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Enrich NDE entries with Google Maps data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-enrich entries that already have Google Maps data"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")

    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1

    # Process entries
    stats = process_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
        force=args.force,
    )

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total files: {stats['total_files']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"Newly enriched: {stats['newly_enriched']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Errors: {stats['errors']}")
    logger.info(f"Skipped: {stats['skipped']}")

    # Save stats
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"google_maps_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "dry_run": args.dry_run,
                "limit": args.limit,
                "force": args.force,
                **stats
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")

    return 0


if __name__ == "__main__":
    sys.exit(main())