glam/scripts/enrich_palestinian_google_maps.py
2025-12-06 19:50:04 +01:00

451 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Enrich Palestinian and Lebanese heritage institutions with Google Maps/Places API data.

This script reads the consolidated Palestinian heritage JSON and adds data from
Google Places API (New), including:

- Coordinates (latitude/longitude) - to verify/supplement Wikidata coords
- Place ID
- Formatted address
- Phone number
- Website
- Opening hours
- Reviews and ratings
- Photo references
- Business status

Usage:
    python scripts/enrich_palestinian_google_maps.py [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API (New) enabled

Output:
    Updates data/extracted/palestinian_heritage_consolidated.json with google_maps_enrichment
"""
import os
import sys
import time
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
import logging
import argparse

# httpx is the only hard third-party dependency; fail fast with install advice.
try:
    import httpx
except ImportError:
    print("Please install httpx: pip install httpx")
    sys.exit(1)

# python-dotenv is optional; when available it loads GOOGLE_PLACES_TOKEN from .env.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv is optional

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
# NOTE(review): this check runs at import time, so the script aborts even for
# `--help` or `--dry-run` when the token is unset — consider moving into main().
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds between requests

# Location bias radius (max 50000 meters = 50km according to API)
LOCATION_BIAS_RADIUS = 50000.0

# Fields requested via the X-Goog-FieldMask header (each is sent as "places.<field>")
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "reviews",
    "photos",
    "googleMapsUri",
    "editorialSummary",
]

# Region codes for location bias: ISO alpha-2 code -> (lat, lng) circle center
REGION_COORDS = {
    "PS": (31.9, 35.2),  # Palestine (West Bank center)
    "LB": (33.9, 35.5),  # Lebanon (Beirut)
    "US": (38.9, -77.0),  # USA (Washington DC for Museum of Palestinian People)
}
@dataclass
class GoogleMapsEnrichment:
    """Container for Google Maps data.

    Mirrors the dictionary shape produced by parse_place_data().
    NOTE(review): this class is never instantiated in this script — the
    enrichment payload is built as a plain dict; consider using or removing it.
    """
    place_id: str                                       # Google place "id"
    name: str                                           # displayName.text
    formatted_address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    types: List[str] = field(default_factory=list)      # Google place type strings
    business_status: Optional[str] = None
    phone_international: Optional[str] = None
    phone_local: Optional[str] = None
    website: Optional[str] = None
    # {"weekday_text": [...], "periods": [...]} as built by parse_place_data
    opening_hours: Optional[Dict[str, Any]] = None
    rating: Optional[float] = None
    total_ratings: Optional[int] = None                 # userRatingCount
    reviews: Optional[List[Dict[str, Any]]] = None
    photos: Optional[List[Dict[str, Any]]] = None
    google_maps_url: Optional[str] = None
    editorial_summary: Optional[str] = None
    address_components: Optional[List[Dict[str, Any]]] = None
    # UTC ISO-8601 timestamp of when the data was fetched
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"
def search_place(
    query: str,
    client: httpx.Client,
    region_code: str = "PS",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place using the Text Search API (New).

    Returns the first matching place object, or None when nothing was found
    or the request failed (errors are logged, never raised).
    """
    field_mask = ",".join(f"places.{name}" for name in PLACE_FIELDS)
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }
    # Query in Arabic for Palestine/Lebanon, English otherwise
    body: Dict[str, Any] = {
        "textQuery": query,
        "languageCode": "ar" if region_code in ("PS", "LB") else "en",
        "maxResultCount": 1,
    }
    if location_bias:
        bias_lat, bias_lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": bias_lat, "longitude": bias_lng},
                "radius": LOCATION_BIAS_RADIUS,  # 50km max per API spec
            }
        }
    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        payload = response.json()
    except httpx.HTTPStatusError as e:
        # Try to surface the API's own error message; fall back to the exception
        try:
            error_msg = e.response.json().get("error", {}).get("message", str(e))
        except Exception:
            error_msg = str(e)
        logger.error(f"API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
    results = payload.get("places", [])
    return results[0] if results else None
def parse_place_data(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a Places API (New) place object into a flat enrichment dict.

    Args:
        place: One "places[i]" object from a places:searchText response.

    Returns:
        Dict with snake_case keys (place_id, name, coordinates, rating, ...).
        Optional keys are added only when the API returned a value, keeping
        the stored JSON compact.
    """
    location = place.get("location", {})
    display_name = place.get("displayName", {})

    # Reshape opening hours into the two fields we keep
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "weekday_text": opening_hours.get("weekdayDescriptions"),
            "periods": opening_hours.get("periods"),
        }

    # Extract reviews (the API returns at most 5)
    reviews = place.get("reviews", [])
    if reviews:
        reviews = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "rating": r.get("rating"),
                "text": r.get("text", {}).get("text"),
                "relative_time": r.get("relativePublishTimeDescription"),
            }
            for r in reviews
        ]

    # Extract photo references, limited to 5.
    # SECURITY NOTE(review): the prebuilt "url" embeds the API key and is
    # persisted to the output JSON — anyone with that file can use the key.
    # Prefer storing only photo_reference and building URLs at render time.
    photos = place.get("photos", [])
    photo_data = []
    for photo in photos[:5]:
        photo_name = photo.get("name")
        if photo_name:
            photo_data.append({
                "photo_reference": photo_name,
                "height": photo.get("heightPx"),
                "width": photo.get("widthPx"),
                "url": f"https://places.googleapis.com/v1/{photo_name}/media?maxWidthPx=800&key={GOOGLE_PLACES_TOKEN}"
            })

    # Normalize address components to the classic long/short name shape
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]

    result = {
        "place_id": place.get("id", ""),
        "name": display_name.get("text", ""),
        "formatted_address": place.get("formattedAddress"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # BUGFIX: compare against None instead of truthiness, so a legitimate
    # 0.0 latitude or longitude (equator / prime meridian) is not dropped.
    if location.get("latitude") is not None and location.get("longitude") is not None:
        result["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }

    # Optional fields: stored only when present/non-empty in the response
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]
    if opening_hours:
        result["opening_hours"] = opening_hours
    if place.get("rating"):
        result["rating"] = place["rating"]
    if place.get("userRatingCount"):
        result["total_ratings"] = place["userRatingCount"]
    if reviews:
        result["reviews"] = reviews
    if photo_data:
        result["photos"] = photo_data
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")
    if address_components:
        result["address_components"] = address_components
    return result
def build_search_query(institution: Dict[str, Any]) -> Tuple[str, str]:
    """Build a Places text-search query from institution data.

    Args:
        institution: Record with optional "name", "city", "country" (ISO
            alpha-2) and free-text "location" keys.

    Returns:
        (query, region_code): query is "name, city, country-name"; region_code
        is the alpha-2 code used for location bias (defaults to "PS").
    """
    name = institution.get("name", "")
    city = institution.get("city", "")
    country = institution.get("country", "")
    location = institution.get("location", "")

    query_parts = [name]
    if city:
        query_parts.append(city)

    # Add country name for disambiguation
    country_names = {
        "PS": "Palestine",
        "LB": "Lebanon",
        "US": "United States",
    }
    if country in country_names:
        query_parts.append(country_names[country])
    # BUGFIX: previously country defaulted to "PS" before these checks, so the
    # location-based inference below could never fire for records missing a
    # country code (e.g. Lebanese entries were tagged "Palestine").
    elif location and "Lebanon" in location:
        country = "LB"
        query_parts.append("Lebanon")
    elif location and "Palestine" in location:
        country = "PS"
        query_parts.append("Palestine")
    elif not country:
        # No country code and no location hint: keep the historical default.
        country = "PS"
        query_parts.append("Palestine")
    # Unknown non-empty codes pass through unchanged (caller falls back to PS
    # coordinates for the bias lookup).

    return ", ".join(filter(None, query_parts)), country
def should_enrich(institution: Dict[str, Any]) -> bool:
    """Return True when *institution* is a candidate for enrichment.

    Skips records that already carry google_maps_enrichment data, online-only
    platforms (location == "Online"), and records without a searchable name.
    """
    already_enriched = bool(institution.get("google_maps_enrichment"))
    online_only = institution.get("location") == "Online"
    has_name = bool(institution.get("name"))
    return has_name and not already_enriched and not online_only
def enrich_institutions(
    data: Dict[str, Any],
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Enrich all institutions with Google Maps data.

    Mutates the institution dicts inside *data* in place and returns a
    counter dict (total/enriched/not_found/skipped/already_enriched/errors).
    """
    institutions = data.get("institutions", [])
    stats = {
        "total": len(institutions),
        "enriched": 0,
        "not_found": 0,
        "skipped": 0,
        "already_enriched": 0,
        "errors": 0,
    }

    # Partition: candidates to query vs. already-done vs. skipped
    queue = []
    for inst in institutions:
        if should_enrich(inst):
            queue.append(inst)
        elif inst.get("google_maps_enrichment"):
            stats["already_enriched"] += 1
        else:
            stats["skipped"] += 1
    if limit:
        queue = queue[:limit]

    logger.info(f"Processing {len(queue)} institutions (of {stats['total']} total)")

    with httpx.Client(timeout=30.0) as client:
        for inst in queue:
            try:
                query, region = build_search_query(inst)
                bias = REGION_COORDS.get(region, REGION_COORDS["PS"])
                logger.info(f"Searching: {query}")
                if dry_run:
                    logger.info(" [DRY RUN] Would search Google Places API")
                    continue
                place = search_place(query, client, region_code=region, location_bias=bias)
                if not place:
                    # Record the miss so reruns can inspect the query used
                    inst["google_maps_status"] = "NOT_FOUND"
                    inst["google_maps_search_query"] = query
                    stats["not_found"] += 1
                    logger.info(" Not found")
                else:
                    enrichment = parse_place_data(place)
                    inst["google_maps_enrichment"] = enrichment
                    stats["enriched"] += 1
                    rating = enrichment.get("rating", "N/A")
                    reviews = enrichment.get("total_ratings", 0)
                    logger.info(f" Found: {enrichment.get('name')} ({rating} stars, {reviews} reviews)")
                time.sleep(REQUEST_DELAY)  # be polite to the API
            except Exception as e:
                logger.error(f"Error processing {inst.get('name')}: {e}")
                stats["errors"] += 1
    return stats
def main():
    """CLI entry point: load the consolidated JSON, enrich, save, report.

    Returns:
        Process exit code: 0 on success, 1 when the data file is missing.
    """
    parser = argparse.ArgumentParser(
        description="Enrich Palestinian heritage institutions with Google Maps data"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't save changes")
    parser.add_argument("--limit", type=int, help="Limit number of institutions to process")
    args = parser.parse_args()

    # Load consolidated data (script lives in <repo>/scripts/, data in <repo>/data/)
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"
    if not data_file.exists():
        logger.error(f"Data file not found: {data_file}")
        return 1
    logger.info(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Enrich
    stats = enrich_institutions(data, dry_run=args.dry_run, limit=args.limit)

    # Update metadata and persist (only when something actually changed)
    if not args.dry_run and stats["enriched"] > 0:
        metadata = data.setdefault("metadata", {})
        metadata["updated"] = datetime.now(timezone.utc).isoformat()
        metadata["version"] = "2.1.0"
        # BUGFIX: the old code checked metadata.get("statistics", {}) but then
        # indexed metadata["statistics"] directly, raising KeyError when the
        # "statistics" key was missing; setdefault covers both cases.
        statistics = metadata.setdefault("statistics", {})
        statistics["google_maps_enriched"] = stats["enriched"] + stats["already_enriched"]

        # Save
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved: {data_file}")

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total institutions: {stats['total']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"Newly enriched: {stats['enriched']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Skipped (online/no name): {stats['skipped']}")
    logger.info(f"Errors: {stats['errors']}")
    return 0


if __name__ == "__main__":
    sys.exit(main())