# Listing metadata (not part of the script):
#   glam/scripts/enrich_nde_google_maps.py — 708 lines, 23 KiB, Python
#!/usr/bin/env python3
"""
Enrich NDE Register entries with Google Maps/Places API (New) data.
This script reads the enriched NDE YAML entries and adds comprehensive data from
Google Places API (New), including:
- Coordinates (latitude/longitude)
- Place ID
- Formatted address
- Phone number
- Website
- Opening hours
- Reviews and ratings
- Photo references
- Business status
- Price level
- Types/categories
- Street View images
Usage:
python scripts/enrich_nde_google_maps.py [--dry-run] [--limit N] [--force]
Environment Variables:
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API (New) enabled
GOOGLE_STREETVIEW_STATIC_TOKEN - Optional: API key for Street View Static API
Output:
Updates YAML files in data/nde/enriched/entries/ with google_maps_enrichment section
API Limitations (Google Places API New):
- Reviews: Maximum 5 "most relevant" reviews per place (no pagination available)
- Photos: Maximum 10 photo references per place (no pagination available)
These are hard limits imposed by the Google Places API (New). The only way to get
more reviews would be the Google My Business API, which requires OAuth2 authentication
as the business owner/manager - not applicable for third-party heritage institutions.
See: https://developers.google.com/maps/documentation/places/web-service/place-details
See: https://featurable.com/blog/google-places-more-than-5-reviews
"""
import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
import logging
import argparse
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration — read API keys from the environment (populated by load_dotenv above).
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
GOOGLE_STREETVIEW_TOKEN = os.getenv("GOOGLE_STREETVIEW_STATIC_TOKEN", "")
# Fail fast at import time: every API call in this script requires the Places key.
# NOTE(review): this exits on import, so the module cannot be imported for testing
# without the key set — acceptable for a standalone script.
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)
# Street View key is optional; without it street_view_url is simply omitted.
if not GOOGLE_STREETVIEW_TOKEN:
    logger.warning("GOOGLE_STREETVIEW_STATIC_TOKEN not set - Street View images will not be fetched")

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
PLACE_DETAILS_URL = "https://places.googleapis.com/v1/places"

# Rate limiting: Google Places API has varying limits depending on your plan
REQUEST_DELAY = 0.2  # 5 requests per second to be safe

# Fields to request from Places API (New); used to build the X-Goog-FieldMask header.
# See: https://developers.google.com/maps/documentation/places/web-service/place-details
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "currentOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "reviews",
    "priceLevel",
    "photos",
    "googleMapsUri",
    "utcOffsetMinutes",
    "primaryType",
    "primaryTypeDisplayName",
    "shortFormattedAddress",
    "editorialSummary",
    "iconMaskBaseUri",
    "iconBackgroundColor",
]
@dataclass
class GoogleMapsEnrichment:
    """Container for all Google Maps data extracted for an entity.

    Populated by parse_place_data() from a Places API (New) response and
    serialized to a plain dict by enrichment_to_dict() for YAML output.
    All fields except place_id and name are optional — absent API fields
    stay None and are omitted from the serialized output.
    """
    # Identity
    place_id: str
    name: str
    # Address / location
    formatted_address: Optional[str] = None
    short_address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    # Classification
    types: List[str] = field(default_factory=list)
    primary_type: Optional[str] = None
    business_status: Optional[str] = None
    # Contact info
    national_phone_number: Optional[str] = None
    international_phone_number: Optional[str] = None
    website: Optional[str] = None
    # Opening hours (normalized dict: open_now / periods / weekday_text)
    opening_hours: Optional[Dict[str, Any]] = None
    # Ratings and reviews (API caps reviews at 5 "most relevant")
    rating: Optional[float] = None
    user_ratings_total: Optional[int] = None
    reviews: Optional[List[Dict[str, Any]]] = None
    price_level: Optional[str] = None
    # Photos (API caps photo references at 10)
    photos: Optional[List[Dict[str, Any]]] = None
    photo_urls: Optional[List[str]] = None
    # Additional
    google_maps_url: Optional[str] = None
    street_view_url: Optional[str] = None
    editorial_summary: Optional[str] = None
    address_components: Optional[List[Dict[str, Any]]] = None
    utc_offset_minutes: Optional[int] = None
    icon_mask_uri: Optional[str] = None
    icon_background_color: Optional[str] = None
    # Metadata: when this record was fetched and the API outcome ("OK" on success)
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"
def search_place(
    query: str,
    client: httpx.Client,
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """
    Search for a place using the Text Search API (New).

    Args:
        query: Search query (organization name + address)
        client: HTTP client for making requests
        location_bias: Optional (lat, lng) to bias results toward Netherlands

    Returns:
        First matching place data if found, None otherwise (including on
        any API error — errors are logged, never raised to the caller).
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        # Field mask restricts (and bills for) only the fields we need.
        "X-Goog-FieldMask": ",".join([f"places.{f}" for f in PLACE_FIELDS]),
    }
    body = {
        "textQuery": query,
        "languageCode": "nl",
        "regionCode": "NL",
        "maxResultCount": 1,
    }
    # Add location bias for Netherlands
    if location_bias:
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {
                    "latitude": lat,
                    "longitude": lng,
                },
                "radius": 50000.0  # 50km radius (max allowed by API)
            }
        }
    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        data = response.json()
        places = data.get("places", [])
        if places:
            place = places[0]
            display_name = place.get("displayName", {}).get("text", "Unknown")
            logger.debug(f"Found place: {display_name}")
            return place
        else:
            logger.warning(f"No place found for query: {query}")
            return None
    except httpx.HTTPStatusError as e:
        # Try to extract the structured error payload for a useful message.
        error_data = {}
        try:
            error_data = e.response.json()
        except Exception:
            pass
        error_msg = error_data.get("error", {}).get("message", str(e))
        status = error_data.get("error", {}).get("status", "UNKNOWN")
        if status == "PERMISSION_DENIED":
            logger.error(f"API PERMISSION_DENIED: {error_msg}")
            logger.error("Please ensure the Places API (New) is enabled in your Google Cloud Console:")
            logger.error(" 1. Go to https://console.cloud.google.com/apis/library/places.googleapis.com")
            logger.error(" 2. Enable the 'Places API (New)'")
        # Fix: check the parsed status field like the branch above (the old code
        # only substring-matched str(e), which misses the structured error body).
        elif status == "RESOURCE_EXHAUSTED" or "RESOURCE_EXHAUSTED" in str(e):
            logger.error("API quota exhausted. Please wait and try again later.")
        else:
            logger.error(f"HTTP error searching for place: {status} - {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
def parse_place_data(place: Dict[str, Any]) -> GoogleMapsEnrichment:
    """
    Parse place data from Places API (New) response.

    Args:
        place: Place data from API response

    Returns:
        GoogleMapsEnrichment object with all available data
    """
    # Extract location
    location = place.get("location", {})
    lat = location.get("latitude")
    lng = location.get("longitude")
    # Extract display name
    display_name = place.get("displayName", {})
    name = display_name.get("text", "")
    # Extract opening hours, normalized to snake_case keys for the YAML output
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "open_now": place.get("currentOpeningHours", {}).get("openNow"),
            "periods": opening_hours.get("periods"),
            "weekday_text": opening_hours.get("weekdayDescriptions"),
        }
    # Extract reviews - API hard limit: max 5 "most relevant" reviews (no pagination)
    # See docstring for details on this limitation
    reviews = place.get("reviews", [])
    if reviews:
        reviews = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "author_uri": r.get("authorAttribution", {}).get("uri"),
                "rating": r.get("rating"),
                "relative_time_description": r.get("relativePublishTimeDescription"),
                "text": r.get("text", {}).get("text"),
                "publish_time": r.get("publishTime"),
            }
            for r in reviews
        ]
    # Extract photos - API hard limit: max 10 photo references (no pagination)
    # See docstring for details on this limitation
    photos = place.get("photos", [])
    photo_urls = []
    photos_data = []
    if photos:
        for photo in photos:  # Get all photos
            photo_name = photo.get("name")
            if photo_name:
                # Generate photo URL (maxwidth 800).
                # SECURITY NOTE(review): the API key is embedded in these URLs and
                # they are persisted to the YAML files — confirm the key is
                # referrer/API-restricted before publishing the enriched data.
                photo_url = f"https://places.googleapis.com/v1/{photo_name}/media?maxWidthPx=800&key={GOOGLE_PLACES_TOKEN}"
                photo_urls.append(photo_url)
                photos_data.append({
                    "name": photo_name,
                    "height": photo.get("heightPx"),
                    "width": photo.get("widthPx"),
                    "author_attributions": photo.get("authorAttributions"),
                })
    # Generate Street View URL if coordinates available.
    # Fix: use explicit None checks — 0.0 is a valid coordinate and the old
    # truthiness test (`if lat and lng`) would have skipped it.
    street_view_url = None
    if lat is not None and lng is not None and GOOGLE_STREETVIEW_TOKEN:
        street_view_url = (
            f"https://maps.googleapis.com/maps/api/streetview"
            f"?size=600x400&location={lat},{lng}&key={GOOGLE_STREETVIEW_TOKEN}"
        )
    # Extract editorial summary
    editorial_summary = None
    if place.get("editorialSummary"):
        editorial_summary = place["editorialSummary"].get("text")
    # Extract address components, normalized to the classic long/short_name keys
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]
    enrichment = GoogleMapsEnrichment(
        place_id=place.get("id", ""),
        name=name,
        formatted_address=place.get("formattedAddress"),
        short_address=place.get("shortFormattedAddress"),
        latitude=lat,
        longitude=lng,
        types=place.get("types", []),
        primary_type=place.get("primaryType"),
        business_status=place.get("businessStatus"),
        national_phone_number=place.get("nationalPhoneNumber"),
        international_phone_number=place.get("internationalPhoneNumber"),
        website=place.get("websiteUri"),
        opening_hours=opening_hours,
        rating=place.get("rating"),
        user_ratings_total=place.get("userRatingCount"),
        reviews=reviews if reviews else None,
        price_level=place.get("priceLevel"),
        photos=photos_data if photos_data else None,
        photo_urls=photo_urls if photo_urls else None,
        google_maps_url=place.get("googleMapsUri"),
        street_view_url=street_view_url,
        editorial_summary=editorial_summary,
        address_components=address_components,
        utc_offset_minutes=place.get("utcOffsetMinutes"),
        icon_mask_uri=place.get("iconMaskBaseUri"),
        icon_background_color=place.get("iconBackgroundColor"),
        api_status="OK",
    )
    return enrichment
def enrichment_to_dict(enrichment: GoogleMapsEnrichment) -> Dict[str, Any]:
    """Serialize a GoogleMapsEnrichment into an ordered dict for YAML output.

    Always emits place_id, name, fetch_timestamp and api_status. Every other
    field is emitted only when it carries data: numeric fields that may
    legitimately be zero (rating, total_ratings, utc_offset_minutes) are
    included whenever they are not None; all remaining fields only when
    truthy. Insertion order is fixed so the YAML output is stable.
    """
    result: Dict[str, Any] = {
        "place_id": enrichment.place_id,
        "name": enrichment.name,
        "fetch_timestamp": enrichment.fetch_timestamp,
        "api_status": enrichment.api_status,
    }
    # Coordinates are a paired field: emit only when both halves are present.
    if enrichment.latitude is not None and enrichment.longitude is not None:
        result["coordinates"] = {
            "latitude": enrichment.latitude,
            "longitude": enrichment.longitude,
        }
    # Inclusion policies for the optional fields below.
    NOT_NONE = True   # keep whenever value is not None (zero is meaningful)
    TRUTHY = False    # keep only when value is truthy
    # (output key, value, policy) — the order here defines the YAML key order.
    field_spec = [
        ("formatted_address", enrichment.formatted_address, TRUTHY),
        ("short_address", enrichment.short_address, TRUTHY),
        ("address_components", enrichment.address_components, TRUTHY),
        ("phone_local", enrichment.national_phone_number, TRUTHY),
        ("phone_international", enrichment.international_phone_number, TRUTHY),
        ("website", enrichment.website, TRUTHY),
        ("google_place_types", enrichment.types, TRUTHY),
        ("primary_type", enrichment.primary_type, TRUTHY),
        ("business_status", enrichment.business_status, TRUTHY),
        ("opening_hours", enrichment.opening_hours, TRUTHY),
        ("rating", enrichment.rating, NOT_NONE),
        ("total_ratings", enrichment.user_ratings_total, NOT_NONE),
        ("reviews", enrichment.reviews, TRUTHY),
        ("price_level", enrichment.price_level, TRUTHY),
        ("editorial_summary", enrichment.editorial_summary, TRUTHY),
        ("photo_urls", enrichment.photo_urls, TRUTHY),
        ("photos_metadata", enrichment.photos, TRUTHY),
        ("google_maps_url", enrichment.google_maps_url, TRUTHY),
        ("street_view_url", enrichment.street_view_url, TRUTHY),
        ("icon_mask_uri", enrichment.icon_mask_uri, TRUTHY),
        ("icon_background_color", enrichment.icon_background_color, TRUTHY),
        ("utc_offset_minutes", enrichment.utc_offset_minutes, NOT_NONE),
    ]
    for key, value, keep_if_not_none in field_spec:
        include = (value is not None) if keep_if_not_none else bool(value)
        if include:
            result[key] = value
    return result
def build_search_query(entry: Dict[str, Any]) -> str:
    """
    Build a search query from an entry's data.

    Joins organization name, street address and city (when present) with
    ", " and always appends "Netherlands" to disambiguate the search.

    Args:
        entry: The YAML entry data

    Returns:
        Search query string
    """
    source = entry.get("original_entry", {})
    # Organization name first — it carries the most weight in text search.
    candidates = [
        source.get("organisatie", ""),
        source.get("straat_en_huisnummer_bezoekadres", ""),
        source.get("plaatsnaam_bezoekadres", ""),
        "Netherlands",  # country suffix to disambiguate
    ]
    return ", ".join(part for part in candidates if part)
def should_enrich(entry: Dict[str, Any], force: bool = False) -> bool:
    """
    Determine if an entry should be enriched with Google Maps data.

    An entry qualifies when it has an original_entry with an organization
    name and (unless force) has neither existing enrichment data nor a
    prior NOT_FOUND marker.

    Args:
        entry: The YAML entry data
        force: If True, re-enrich even if already enriched

    Returns:
        True if the entry should be enriched
    """
    if not force:
        # Already enriched, or previously searched and not found — skip.
        if entry.get("google_maps_enrichment"):
            return False
        if entry.get("google_maps_status") == "NOT_FOUND":
            return False
    # A search query needs at least an organization name from the source data.
    original = entry.get("original_entry", {}) or {}
    return bool(original.get("organisatie"))
def enrich_entry(
    entry: Dict[str, Any],
    client: httpx.Client,
) -> Tuple[Dict[str, Any], bool]:
    """
    Enrich a single entry with Google Maps data.

    Mutates the entry in place: on a hit it attaches a
    google_maps_enrichment payload with status SUCCESS; on a miss it
    records a NOT_FOUND status plus the query and timestamp, so the entry
    is not re-searched on later runs. Key insertion order is deliberate —
    it determines the key order in the saved YAML.

    Args:
        entry: The YAML entry data
        client: HTTP client for making requests

    Returns:
        Tuple of (enriched_entry, was_modified)
    """
    query = build_search_query(entry)
    logger.info(f"Searching for: {query}")
    # Bias results toward the geographic center of the Netherlands.
    netherlands_center = (52.1326, 5.2913)
    place = search_place(query, client, location_bias=netherlands_center)
    if not place:
        # Record the miss so should_enrich() skips this entry next time.
        entry["google_maps_status"] = "NOT_FOUND"
        entry["google_maps_search_query"] = query
        entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        return entry, True
    enrichment = parse_place_data(place)
    entry["google_maps_enrichment"] = enrichment_to_dict(enrichment)
    entry["google_maps_status"] = "SUCCESS"
    entry["google_maps_search_query"] = query
    logger.info(f" → Found: {enrichment.name} ({enrichment.rating or 'no rating'}★, {enrichment.user_ratings_total or 0} reviews)")
    return entry, True
def process_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
    force: bool = False,
) -> Dict[str, int]:
    """
    Process all entry files in the directory.

    Loads each *.yaml file, enriches it via enrich_entry() when
    should_enrich() approves, and writes the result back in place
    (unless dry_run). Per-file errors are logged and counted but do
    not abort the run.

    Args:
        entries_dir: Path to entries directory
        dry_run: If True, don't save changes
        limit: Optional limit on number of entries to process
        force: If True, re-enrich even if already enriched

    Returns:
        Statistics dictionary
    """
    stats = {
        "total_files": 0,
        "already_enriched": 0,
        "newly_enriched": 0,
        "not_found": 0,
        "errors": 0,
        "skipped": 0,
    }
    # Get all YAML files; sorted for deterministic order across runs.
    # total_files counts everything found, even past any --limit cutoff.
    yaml_files = sorted(entries_dir.glob("*.yaml"))
    stats["total_files"] = len(yaml_files)
    if limit:
        yaml_files = yaml_files[:limit]
    logger.info(f"Processing {len(yaml_files)} files (limit: {limit or 'none'})")
    # One shared HTTP client for connection reuse across all API calls.
    with httpx.Client(timeout=30.0) as client:
        for yaml_file in yaml_files:
            try:
                # Load entry
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    entry = yaml.safe_load(f)
                # Empty/blank YAML files parse to None — count as skipped.
                if not entry:
                    stats["skipped"] += 1
                    continue
                # Check if should enrich; distinguish "already done" from
                # "unusable entry" in the stats.
                if not should_enrich(entry, force=force):
                    if entry.get("google_maps_enrichment"):
                        stats["already_enriched"] += 1
                    else:
                        stats["skipped"] += 1
                    continue
                # Enrich
                logger.info(f"\nProcessing: {yaml_file.name}")
                enriched_entry, was_modified = enrich_entry(entry, client)
                if was_modified:
                    if enriched_entry.get("google_maps_status") == "SUCCESS":
                        stats["newly_enriched"] += 1
                    elif enriched_entry.get("google_maps_status") == "NOT_FOUND":
                        stats["not_found"] += 1
                    else:
                        stats["errors"] += 1
                    # Save if not dry run — overwrite the source file in place,
                    # preserving key order (sort_keys=False) and unicode text.
                    if not dry_run:
                        with open(yaml_file, 'w', encoding='utf-8') as f:
                            yaml.dump(enriched_entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                        logger.debug(f"Saved: {yaml_file.name}")
                # Rate limiting between API calls (see REQUEST_DELAY).
                time.sleep(REQUEST_DELAY)
            except Exception as e:
                # Keep going on per-file failures; surface them in the summary.
                logger.error(f"Error processing {yaml_file.name}: {e}")
                stats["errors"] += 1
    return stats
def main():
    """Main entry point.

    Parses CLI arguments, runs the enrichment over the entries directory,
    logs a summary, and (unless --dry-run) writes a timestamped stats JSON
    file next to the entries directory.

    Returns:
        Process exit code: 0 on success, 1 if the entries directory is missing.
    """
    parser = argparse.ArgumentParser(
        description="Enrich NDE entries with Google Maps data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-enrich entries that already have Google Maps data"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        # Default resolves relative to the repository layout: <repo>/data/nde/enriched/entries
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )
    args = parser.parse_args()
    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")
    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1
    # Process entries
    stats = process_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
        force=args.force,
    )
    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total files: {stats['total_files']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"Newly enriched: {stats['newly_enriched']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Errors: {stats['errors']}")
    logger.info(f"Skipped: {stats['skipped']}")
    # Save stats
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"google_maps_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        # Fix: explicit encoding — the default is platform-dependent.
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "dry_run": args.dry_run,  # always False inside this guard; kept for schema stability
                "limit": args.limit,
                "force": args.force,
                **stats
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")
    return 0


if __name__ == "__main__":
    sys.exit(main())