451 lines
14 KiB
Python
451 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Palestinian and Lebanese heritage institutions with Google Maps/Places API data.
|
|
|
|
This script reads the consolidated Palestinian heritage JSON and adds data from
|
|
Google Places API (New), including:
|
|
- Coordinates (latitude/longitude) - to verify/supplement Wikidata coords
|
|
- Place ID
|
|
- Formatted address
|
|
- Phone number
|
|
- Website
|
|
- Opening hours
|
|
- Reviews and ratings
|
|
- Photo references
|
|
- Business status
|
|
|
|
Usage:
|
|
python scripts/enrich_palestinian_google_maps.py [--dry-run] [--limit N]
|
|
|
|
Environment Variables:
|
|
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API (New) enabled
|
|
|
|
Output:
|
|
Updates data/extracted/palestinian_heritage_consolidated.json with google_maps_enrichment
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from dataclasses import dataclass, field
|
|
import logging
|
|
import argparse
|
|
|
|
try:
|
|
import httpx
|
|
except ImportError:
|
|
print("Please install httpx: pip install httpx")
|
|
sys.exit(1)
|
|
|
|
try:
|
|
from dotenv import load_dotenv
|
|
load_dotenv()
|
|
except ImportError:
|
|
pass # dotenv is optional
|
|
|
|
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
# API key for Google Places API (New); read from the environment (a .env file
# is honored when python-dotenv is installed, see the optional import above).
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")

# Fail fast at import time: every request requires the key, so nothing useful
# can run without it.
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds between requests

# Location bias radius (max 50000 meters = 50km according to API)
LOCATION_BIAS_RADIUS = 50000.0

# Fields to request.
# Sent as the X-Goog-FieldMask header, each entry prefixed with "places."
# (see search_place below).
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "reviews",
    "photos",
    "googleMapsUri",
    "editorialSummary",
]

# Region codes for location bias: (latitude, longitude) circle centers used
# with LOCATION_BIAS_RADIUS to steer ambiguous text searches.
REGION_COORDS = {
    "PS": (31.9, 35.2),  # Palestine (West Bank center)
    "LB": (33.9, 35.5),  # Lebanon (Beirut)
    "US": (38.9, -77.0),  # USA (Washington DC for Museum of Palestinian People)
}
|
|
|
|
|
|
@dataclass
class GoogleMapsEnrichment:
    """Container for Google Maps data.

    NOTE(review): this dataclass documents the enrichment schema, but the
    script never instantiates it -- parse_place_data() builds a plain dict
    with matching keys instead. Keep the two in sync (or construct this
    class from the parsed dict) when adding fields.
    """
    # Google Places resource id and localized display name (required).
    place_id: str
    name: str
    # Human-readable address as returned by the API.
    formatted_address: Optional[str] = None
    # WGS84 coordinates from the place's "location" field.
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    # Google place type tags (e.g. "museum", "tourist_attraction").
    types: List[str] = field(default_factory=list)
    # e.g. "OPERATIONAL", "CLOSED_TEMPORARILY", "CLOSED_PERMANENTLY".
    business_status: Optional[str] = None
    phone_international: Optional[str] = None
    phone_local: Optional[str] = None
    website: Optional[str] = None
    # {"weekday_text": [...], "periods": [...]} as built by parse_place_data.
    opening_hours: Optional[Dict[str, Any]] = None
    # Aggregate rating (1.0-5.0) and number of ratings behind it.
    rating: Optional[float] = None
    total_ratings: Optional[int] = None
    # Up to 5 simplified review dicts; up to 5 photo dicts with fetch URLs.
    reviews: Optional[List[Dict[str, Any]]] = None
    photos: Optional[List[Dict[str, Any]]] = None
    google_maps_url: Optional[str] = None
    editorial_summary: Optional[str] = None
    # Structured address parts (long/short name + types per component).
    address_components: Optional[List[Dict[str, Any]]] = None
    # UTC ISO-8601 timestamp of when the record was fetched.
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"
|
|
|
|
|
|
def search_place(
    query: str,
    client: httpx.Client,
    region_code: str = "PS",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place via the Places API (New) Text Search endpoint.

    Args:
        query: Free-text search string (name, city, country).
        client: Reusable httpx client for connection pooling.
        region_code: ISO-like region code; drives the response language
            (Arabic for PS/LB, English otherwise).
        location_bias: Optional (latitude, longitude) circle center used to
            bias results toward a region.

    Returns:
        The first matching place resource as a dict, or None when nothing
        was found or the request failed (failures are logged, not raised).
    """
    field_mask = ",".join(f"places.{field_name}" for field_name in PLACE_FIELDS)
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }

    body: Dict[str, Any] = {
        "textQuery": query,
        # Arabic results for Palestine/Lebanon, English elsewhere.
        "languageCode": "ar" if region_code in ("PS", "LB") else "en",
        "maxResultCount": 1,
    }

    if location_bias is not None:
        bias_lat, bias_lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": bias_lat, "longitude": bias_lng},
                "radius": LOCATION_BIAS_RADIUS  # 50km max per API spec
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        candidates = response.json().get("places", [])
        return candidates[0] if candidates else None
    except httpx.HTTPStatusError as e:
        # Prefer the structured error message from the API body when present.
        try:
            error_data = e.response.json()
        except Exception:
            error_data = {}
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
|
|
|
|
|
|
def parse_place_data(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a Places API (New) place resource into a flat enrichment dict.

    Args:
        place: One entry from the "places" array of a searchText response.

    Returns:
        A dict with snake_case keys: "place_id", "name", "formatted_address",
        "fetch_timestamp", and "api_status" are always present; coordinates,
        phones, hours, rating, reviews, photos, etc. are added only when the
        API returned them.
    """
    location = place.get("location", {})
    display_name = place.get("displayName", {})

    # Normalize opening hours to a stable two-key shape.
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "weekday_text": opening_hours.get("weekdayDescriptions"),
            "periods": opening_hours.get("periods"),
        }

    # Flatten reviews (API returns at most 5).
    reviews = place.get("reviews", [])
    if reviews:
        reviews = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "rating": r.get("rating"),
                "text": r.get("text", {}).get("text"),
                "relative_time": r.get("relativePublishTimeDescription"),
            }
            for r in reviews
        ]

    # Extract photos (API returns up to 10; keep at most 5).
    photos = place.get("photos", [])
    photo_data = []
    if photos:
        for photo in photos[:5]:  # Limit to 5 photos
            photo_name = photo.get("name")
            if photo_name:
                photo_data.append({
                    "photo_reference": photo_name,
                    "height": photo.get("heightPx"),
                    "width": photo.get("widthPx"),
                    # NOTE(review): embedding the API key in a persisted URL
                    # leaks the key into the output JSON -- consider building
                    # these media URLs on demand from photo_reference instead.
                    "url": f"https://places.googleapis.com/v1/{photo_name}/media?maxWidthPx=800&key={GOOGLE_PLACES_TOKEN}"
                })

    # Convert address components to the classic long/short-name shape.
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]

    result = {
        "place_id": place.get("id", ""),
        "name": display_name.get("text", ""),
        "formatted_address": place.get("formattedAddress"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # Add coordinates.
    # BUG FIX: compare against None instead of relying on truthiness so that
    # a legitimate 0.0 latitude or longitude is not silently dropped.
    if location.get("latitude") is not None and location.get("longitude") is not None:
        result["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }

    # Add optional fields; each key is omitted entirely when absent or empty.
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]
    if opening_hours:
        result["opening_hours"] = opening_hours
    if place.get("rating"):
        result["rating"] = place["rating"]
    if place.get("userRatingCount"):
        result["total_ratings"] = place["userRatingCount"]
    if reviews:
        result["reviews"] = reviews
    if photo_data:
        result["photos"] = photo_data
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")
    if address_components:
        result["address_components"] = address_components

    return result
|
|
|
|
|
|
def build_search_query(institution: Dict[str, Any]) -> Tuple[str, str]:
    """Build search query from institution data. Returns (query, region_code).

    The query is "<name>[, <city>][, <country name>]". The region code is the
    institution's explicit "country" value when present; otherwise it is
    inferred from the free-text "location" field, defaulting to "PS".
    """
    name = institution.get("name", "")
    city = institution.get("city", "")
    location = institution.get("location", "")
    # BUG FIX: the original defaulted country to "PS" here, which made the
    # location-based Lebanon/Palestine fallbacks below unreachable whenever
    # the "country" key was simply missing -- a Lebanese institution with no
    # country code was labeled "Palestine".
    country = institution.get("country", "")

    # Build query
    query_parts = [name]
    if city:
        query_parts.append(city)

    # Add country name for disambiguation
    country_names = {
        "PS": "Palestine",
        "LB": "Lebanon",
        "US": "United States",
    }
    if country in country_names:
        query_parts.append(country_names[country])
    elif location and "Lebanon" in location:
        country = "LB"
        query_parts.append("Lebanon")
    elif location and "Palestine" in location:
        country = "PS"
        query_parts.append("Palestine")
    elif not country:
        # No explicit code and no recognizable location: assume Palestine,
        # matching the original default.
        country = "PS"
        query_parts.append("Palestine")
    # Explicit but unrecognized codes fall through unchanged, with no
    # country label appended (original behavior).

    return ", ".join(filter(None, query_parts)), country
|
|
|
|
|
|
def should_enrich(institution: Dict[str, Any]) -> bool:
    """Decide whether an institution needs Google Maps enrichment.

    An institution is enriched when it has a name, is not a digital-only
    platform (location == "Online"), and has no existing enrichment data.
    """
    already_enriched = bool(institution.get("google_maps_enrichment"))
    online_only = institution.get("location") == "Online"
    has_name = bool(institution.get("name"))
    return has_name and not already_enriched and not online_only
|
|
|
|
|
|
def enrich_institutions(
    data: Dict[str, Any],
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Enrich all institutions in *data* with Google Maps data, in place.

    Args:
        data: Consolidated dataset; its "institutions" list is mutated.
        dry_run: When True, log the queries that would run but call no API.
        limit: Optional cap on how many candidates to process.

    Returns:
        Counters: total / enriched / not_found / skipped /
        already_enriched / errors.
    """
    institutions = data.get("institutions", [])
    stats = {
        "total": len(institutions),
        "enriched": 0,
        "not_found": 0,
        "skipped": 0,
        "already_enriched": 0,
        "errors": 0,
    }

    # Triage: collect candidates, counting everything else by reason.
    pending = []
    for record in institutions:
        if should_enrich(record):
            pending.append(record)
        elif record.get("google_maps_enrichment"):
            stats["already_enriched"] += 1
        else:
            stats["skipped"] += 1

    if limit:
        pending = pending[:limit]

    logger.info(f"Processing {len(pending)} institutions (of {stats['total']} total)")

    with httpx.Client(timeout=30.0) as client:
        for record in pending:
            try:
                query, region = build_search_query(record)
                bias = REGION_COORDS.get(region, REGION_COORDS["PS"])

                logger.info(f"Searching: {query}")

                if dry_run:
                    logger.info("  [DRY RUN] Would search Google Places API")
                    continue

                place = search_place(query, client, region_code=region, location_bias=bias)

                if place is None:
                    # Record the miss so reruns can audit the query used.
                    record["google_maps_status"] = "NOT_FOUND"
                    record["google_maps_search_query"] = query
                    stats["not_found"] += 1
                    logger.info(f"  Not found")
                else:
                    enrichment = parse_place_data(place)
                    record["google_maps_enrichment"] = enrichment
                    stats["enriched"] += 1

                    rating = enrichment.get("rating", "N/A")
                    reviews = enrichment.get("total_ratings", 0)
                    logger.info(f"  Found: {enrichment.get('name')} ({rating} stars, {reviews} reviews)")

                # Throttle between live API calls.
                time.sleep(REQUEST_DELAY)

            except Exception as e:
                logger.error(f"Error processing {record.get('name')}: {e}")
                stats["errors"] += 1

    return stats
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: load the consolidated JSON, enrich it, save, report.

    Returns:
        Process exit code: 0 on success, 1 when the data file is missing.
    """
    parser = argparse.ArgumentParser(
        description="Enrich Palestinian heritage institutions with Google Maps data"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't save changes")
    parser.add_argument("--limit", type=int, help="Limit number of institutions to process")

    args = parser.parse_args()

    # Consolidated dataset lives at <repo>/data/extracted/ relative to scripts/.
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"

    if not data_file.exists():
        logger.error(f"Data file not found: {data_file}")
        return 1

    logger.info(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Enrich (mutates data in place)
    stats = enrich_institutions(data, dry_run=args.dry_run, limit=args.limit)

    # Update metadata and save only when something actually changed.
    if not args.dry_run and stats["enriched"] > 0:
        metadata = data.setdefault("metadata", {})
        metadata["updated"] = datetime.now(timezone.utc).isoformat()
        metadata["version"] = "2.1.0"

        # BUG FIX: the original guarded with .get("statistics", {}) but then
        # indexed data["metadata"]["statistics"] directly, raising KeyError
        # when the "statistics" key was absent; setdefault creates it safely.
        statistics = metadata.setdefault("statistics", {})
        statistics["google_maps_enriched"] = stats["enriched"] + stats["already_enriched"]

        # Save
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved: {data_file}")

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total institutions: {stats['total']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"Newly enriched: {stats['enriched']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Skipped (online/no name): {stats['skipped']}")
    logger.info(f"Errors: {stats['errors']}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
|