#!/usr/bin/env python3
"""
Enrich Palestinian and Lebanese heritage institutions with Google Maps/Places API data.

This script reads the consolidated Palestinian heritage JSON and adds data from
Google Places API (New), including:
- Coordinates (latitude/longitude) - to verify/supplement Wikidata coords
- Place ID
- Formatted address
- Phone number
- Website
- Opening hours
- Reviews and ratings
- Photo references
- Business status

Usage:
    python scripts/enrich_palestinian_google_maps.py [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API (New) enabled

Output:
    Updates data/extracted/palestinian_heritage_consolidated.json with google_maps_enrichment
"""

import os
import sys
import time
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
import logging
import argparse

try:
    import httpx
except ImportError:
    print("Please install httpx: pip install httpx")
    sys.exit(1)

try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv is optional

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds between requests

# Location bias radius (max 50000 meters = 50km according to API)
LOCATION_BIAS_RADIUS = 50000.0

# Fields to request via the X-Goog-FieldMask header (Places API New).
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "reviews",
    "photos",
    "googleMapsUri",
    "editorialSummary",
]

# Region codes for location bias
REGION_COORDS = {
    "PS": (31.9, 35.2),   # Palestine (West Bank center)
    "LB": (33.9, 35.5),   # Lebanon (Beirut)
    "US": (38.9, -77.0),  # USA (Washington DC for Museum of Palestinian People)
}


@dataclass
class GoogleMapsEnrichment:
    """Container for Google Maps data.

    NOTE(review): currently unused — parse_place_data() returns a plain dict
    with matching keys. Kept for a future typed refactor / as schema docs.
    """
    place_id: str
    name: str
    formatted_address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    types: List[str] = field(default_factory=list)
    business_status: Optional[str] = None
    phone_international: Optional[str] = None
    phone_local: Optional[str] = None
    website: Optional[str] = None
    opening_hours: Optional[Dict[str, Any]] = None
    rating: Optional[float] = None
    total_ratings: Optional[int] = None
    reviews: Optional[List[Dict[str, Any]]] = None
    photos: Optional[List[Dict[str, Any]]] = None
    google_maps_url: Optional[str] = None
    editorial_summary: Optional[str] = None
    address_components: Optional[List[Dict[str, Any]]] = None
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"


def search_place(
    query: str,
    client: httpx.Client,
    region_code: str = "PS",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place using the Text Search API (New).

    Args:
        query: Free-text search string (name, city, country).
        client: Shared httpx client (connection reuse across requests).
        region_code: ISO country code; drives the language choice.
        location_bias: Optional (lat, lng) center for a circular search bias.

    Returns:
        The first matching place dict from the API, or None on no match/error.
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        # Places API (New) requires an explicit field mask per request.
        "X-Goog-FieldMask": ",".join([f"places.{f}" for f in PLACE_FIELDS]),
    }

    # Determine language based on region: Arabic for Palestine/Lebanon.
    language = "ar" if region_code in ["PS", "LB"] else "en"

    body = {
        "textQuery": query,
        "languageCode": language,
        "maxResultCount": 1,
    }

    # Add location bias
    if location_bias:
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": LOCATION_BIAS_RADIUS  # 50km max per API spec
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        data = response.json()
        places = data.get("places", [])
        if places:
            return places[0]
        return None
    except httpx.HTTPStatusError as e:
        # Pull the structured error message out of the response body if present.
        error_data = {}
        try:
            error_data = e.response.json()
        except Exception:
            pass
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None


def parse_place_data(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse place data from API response to dictionary.

    Maps the Places API (New) camelCase fields onto the snake_case schema
    stored in the consolidated JSON. Optional fields are only included when
    present in the response.
    """
    location = place.get("location", {})
    display_name = place.get("displayName", {})

    # Extract opening hours
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "weekday_text": opening_hours.get("weekdayDescriptions"),
            "periods": opening_hours.get("periods"),
        }

    # Extract reviews (max 5 from API)
    reviews = place.get("reviews", [])
    if reviews:
        reviews = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "rating": r.get("rating"),
                "text": r.get("text", {}).get("text"),
                "relative_time": r.get("relativePublishTimeDescription"),
            }
            for r in reviews
        ]

    # Extract photos (max 10 from API)
    photos = place.get("photos", [])
    photo_data = []
    if photos:
        for photo in photos[:5]:  # Limit to 5 photos
            photo_name = photo.get("name")
            if photo_name:
                # NOTE(security): the API key is embedded in this URL and then
                # persisted into the output JSON. Keep that file out of public
                # repos, or rebuild the media URL at fetch time instead.
                photo_data.append({
                    "photo_reference": photo_name,
                    "height": photo.get("heightPx"),
                    "width": photo.get("widthPx"),
                    "url": f"https://places.googleapis.com/v1/{photo_name}/media?maxWidthPx=800&key={GOOGLE_PLACES_TOKEN}"
                })

    # Extract address components
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]

    result = {
        "place_id": place.get("id", ""),
        "name": display_name.get("text", ""),
        "formatted_address": place.get("formattedAddress"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # Add coordinates. Use explicit None checks: 0.0 is a valid coordinate
    # but falsy, so a bare truthiness test would silently drop points on the
    # equator or prime meridian.
    if location.get("latitude") is not None and location.get("longitude") is not None:
        result["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }

    # Add optional fields
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]
    if opening_hours:
        result["opening_hours"] = opening_hours
    if place.get("rating"):
        result["rating"] = place["rating"]
    if place.get("userRatingCount"):
        result["total_ratings"] = place["userRatingCount"]
    if reviews:
        result["reviews"] = reviews
    if photo_data:
        result["photos"] = photo_data
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")
    if address_components:
        result["address_components"] = address_components

    return result


def build_search_query(institution: Dict[str, Any]) -> Tuple[str, str]:
    """Build search query from institution data.

    Returns (query, region_code)."""
    name = institution.get("name", "")
    city = institution.get("city", "")
    country = institution.get("country", "PS")
    location = institution.get("location", "")

    # Build query
    query_parts = [name]
    if city:
        query_parts.append(city)

    # Add country name for disambiguation
    country_names = {
        "PS": "Palestine",
        "LB": "Lebanon",
        "US": "United States",
    }
    if country in country_names:
        query_parts.append(country_names[country])
    elif location and "Lebanon" in location:
        # Country code unknown; infer the region from the free-text location.
        country = "LB"
        query_parts.append("Lebanon")
    elif location and "Palestine" in location:
        country = "PS"
        query_parts.append("Palestine")

    return ", ".join(filter(None, query_parts)), country


def should_enrich(institution: Dict[str, Any]) -> bool:
    """Determine if institution should be enriched."""
    # Skip if already has Google Maps enrichment
    if institution.get("google_maps_enrichment"):
        return False
    # Skip digital-only platforms
    if institution.get("location") == "Online":
        return False
    # Skip if no name
    if not institution.get("name"):
        return False
    return True


def enrich_institutions(
    data: Dict[str, Any],
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Enrich all institutions with Google Maps data.

    Mutates `data` in place (adds google_maps_enrichment / google_maps_status
    keys on each institution) and returns a stats counter dict.
    """
    stats = {
        "total": 0,
        "enriched": 0,
        "not_found": 0,
        "skipped": 0,
        "already_enriched": 0,
        "errors": 0,
    }

    institutions = data.get("institutions", [])
    stats["total"] = len(institutions)

    to_process = []
    for inst in institutions:
        if should_enrich(inst):
            to_process.append(inst)
        elif inst.get("google_maps_enrichment"):
            stats["already_enriched"] += 1
        else:
            stats["skipped"] += 1

    if limit:
        to_process = to_process[:limit]

    logger.info(f"Processing {len(to_process)} institutions (of {stats['total']} total)")

    with httpx.Client(timeout=30.0) as client:
        for inst in to_process:
            try:
                query, region = build_search_query(inst)
                location_bias = REGION_COORDS.get(region, REGION_COORDS["PS"])

                logger.info(f"Searching: {query}")

                if dry_run:
                    logger.info(" [DRY RUN] Would search Google Places API")
                    continue

                place = search_place(query, client, region_code=region, location_bias=location_bias)

                if place:
                    enrichment = parse_place_data(place)
                    inst["google_maps_enrichment"] = enrichment
                    stats["enriched"] += 1
                    rating = enrichment.get("rating", "N/A")
                    reviews = enrichment.get("total_ratings", 0)
                    logger.info(f" Found: {enrichment.get('name')} ({rating} stars, {reviews} reviews)")
                else:
                    # Record the failed query so it can be retried/tuned manually.
                    inst["google_maps_status"] = "NOT_FOUND"
                    inst["google_maps_search_query"] = query
                    stats["not_found"] += 1
                    logger.info(" Not found")

                # Throttle to stay within API quota.
                time.sleep(REQUEST_DELAY)

            except Exception as e:
                logger.error(f"Error processing {inst.get('name')}: {e}")
                stats["errors"] += 1

    return stats


def main():
    """CLI entry point: load the JSON, enrich, save, and print a summary."""
    parser = argparse.ArgumentParser(
        description="Enrich Palestinian heritage institutions with Google Maps data"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't save changes")
    parser.add_argument("--limit", type=int, help="Limit number of institutions to process")
    args = parser.parse_args()

    # Load consolidated data
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"
    if not data_file.exists():
        logger.error(f"Data file not found: {data_file}")
        return 1

    logger.info(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Enrich
    stats = enrich_institutions(data, dry_run=args.dry_run, limit=args.limit)

    # Update metadata
    if not args.dry_run and stats["enriched"] > 0:
        metadata = data.setdefault("metadata", {})
        metadata["updated"] = datetime.now(timezone.utc).isoformat()
        metadata["version"] = "2.1.0"

        # BUG FIX: the old code checked `.get("statistics", {})` but then
        # indexed data["metadata"]["statistics"] directly, raising KeyError
        # when the key was missing (and its `= 0` init was immediately
        # overwritten anyway). setdefault creates the dict safely.
        statistics = metadata.setdefault("statistics", {})
        statistics["google_maps_enriched"] = stats["enriched"] + stats["already_enriched"]

        # Save
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved: {data_file}")

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total institutions: {stats['total']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"Newly enriched: {stats['enriched']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Skipped (online/no name): {stats['skipped']}")
    logger.info(f"Errors: {stats['errors']}")

    return 0


if __name__ == "__main__":
    sys.exit(main())