451 lines
14 KiB
Python
451 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Palestinian and Lebanese heritage institutions with Google Maps/Places API data.
|
|
|
|
This script reads the consolidated Palestinian heritage JSON and adds data from
|
|
Google Places API (New), including:
|
|
- Coordinates (latitude/longitude) - to verify/supplement Wikidata coords
|
|
- Place ID
|
|
- Formatted address
|
|
- Phone number
|
|
- Website
|
|
- Opening hours
|
|
- Reviews and ratings
|
|
- Photo references
|
|
- Business status
|
|
|
|
Usage:
|
|
python scripts/enrich_palestinian_google_maps.py [--dry-run] [--limit N]
|
|
|
|
Environment Variables:
|
|
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API (New) enabled
|
|
|
|
Output:
|
|
Updates data/extracted/palestinian_heritage_consolidated.json with google_maps_enrichment
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from dataclasses import dataclass, field
|
|
import logging
|
|
import argparse
|
|
|
|
try:
|
|
import httpx
|
|
except ImportError:
|
|
print("Please install httpx: pip install httpx")
|
|
sys.exit(1)
|
|
|
|
try:
|
|
from dotenv import load_dotenv
|
|
load_dotenv()
|
|
except ImportError:
|
|
pass # dotenv is optional
|
|
|
|
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
# API key for Google Places API (New); read from the environment (a .env file
# is honored when python-dotenv is installed, see the optional import above).
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")

# Fail fast at import time: every request requires the key, so nothing useful
# can run without it.
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds between requests

# Location bias radius (max 50000 meters = 50km according to API)
LOCATION_BIAS_RADIUS = 50000.0

# Fields to request.
# Sent as the X-Goog-FieldMask header, each entry prefixed with "places."
# (see search_place below).
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "reviews",
    "photos",
    "googleMapsUri",
    "editorialSummary",
]

# Region codes for location bias: (latitude, longitude) circle centers used
# with LOCATION_BIAS_RADIUS to steer ambiguous text searches.
REGION_COORDS = {
    "PS": (31.9, 35.2),  # Palestine (West Bank center)
    "LB": (33.9, 35.5),  # Lebanon (Beirut)
    "US": (38.9, -77.0),  # USA (Washington DC for Museum of Palestinian People)
}
|
|
|
|
|
|
@dataclass
class GoogleMapsEnrichment:
    """Container for Google Maps data.

    NOTE(review): this dataclass documents the enrichment schema, but the
    script never instantiates it -- parse_place_data() builds a plain dict
    with matching keys instead. Keep the two in sync (or construct this
    class from the parsed dict) when adding fields.
    """
    # Google Places resource id and localized display name (required).
    place_id: str
    name: str
    # Human-readable address as returned by the API.
    formatted_address: Optional[str] = None
    # WGS84 coordinates from the place's "location" field.
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    # Google place type tags (e.g. "museum", "tourist_attraction").
    types: List[str] = field(default_factory=list)
    # e.g. "OPERATIONAL", "CLOSED_TEMPORARILY", "CLOSED_PERMANENTLY".
    business_status: Optional[str] = None
    phone_international: Optional[str] = None
    phone_local: Optional[str] = None
    website: Optional[str] = None
    # {"weekday_text": [...], "periods": [...]} as built by parse_place_data.
    opening_hours: Optional[Dict[str, Any]] = None
    # Aggregate rating (1.0-5.0) and number of ratings behind it.
    rating: Optional[float] = None
    total_ratings: Optional[int] = None
    # Up to 5 simplified review dicts; up to 5 photo dicts with fetch URLs.
    reviews: Optional[List[Dict[str, Any]]] = None
    photos: Optional[List[Dict[str, Any]]] = None
    google_maps_url: Optional[str] = None
    editorial_summary: Optional[str] = None
    # Structured address parts (long/short name + types per component).
    address_components: Optional[List[Dict[str, Any]]] = None
    # UTC ISO-8601 timestamp of when the record was fetched.
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"
|
|
|
|
|
|
def search_place(
    query: str,
    client: httpx.Client,
    region_code: str = "PS",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place via the Places API (New) Text Search endpoint.

    Args:
        query: Free-text search string (name, city, country).
        client: Reusable httpx client for connection pooling.
        region_code: ISO-like region code; drives the response language
            (Arabic for PS/LB, English otherwise).
        location_bias: Optional (latitude, longitude) circle center used to
            bias results toward a region.

    Returns:
        The first matching place resource as a dict, or None when nothing
        was found or the request failed (failures are logged, not raised).
    """
    field_mask = ",".join(f"places.{field_name}" for field_name in PLACE_FIELDS)
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }

    body: Dict[str, Any] = {
        "textQuery": query,
        # Arabic results for Palestine/Lebanon, English elsewhere.
        "languageCode": "ar" if region_code in ("PS", "LB") else "en",
        "maxResultCount": 1,
    }

    if location_bias is not None:
        bias_lat, bias_lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": bias_lat, "longitude": bias_lng},
                "radius": LOCATION_BIAS_RADIUS  # 50km max per API spec
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        candidates = response.json().get("places", [])
        return candidates[0] if candidates else None
    except httpx.HTTPStatusError as e:
        # Prefer the structured error message from the API body when present.
        try:
            error_data = e.response.json()
        except Exception:
            error_data = {}
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
|
|
|
|
|
|
def parse_place_data(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a Places API (New) place resource into a flat enrichment dict.

    Args:
        place: One entry from the "places" array of a searchText response.

    Returns:
        A dict with snake_case keys: "place_id", "name", "formatted_address",
        "fetch_timestamp", and "api_status" are always present; coordinates,
        phones, hours, rating, reviews, photos, etc. are added only when the
        API returned them.
    """
    location = place.get("location", {})
    display_name = place.get("displayName", {})

    # Normalize opening hours to a stable two-key shape.
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "weekday_text": opening_hours.get("weekdayDescriptions"),
            "periods": opening_hours.get("periods"),
        }

    # Flatten reviews (API returns at most 5).
    reviews = place.get("reviews", [])
    if reviews:
        reviews = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "rating": r.get("rating"),
                "text": r.get("text", {}).get("text"),
                "relative_time": r.get("relativePublishTimeDescription"),
            }
            for r in reviews
        ]

    # Extract photos (API returns up to 10; keep at most 5).
    photos = place.get("photos", [])
    photo_data = []
    if photos:
        for photo in photos[:5]:  # Limit to 5 photos
            photo_name = photo.get("name")
            if photo_name:
                photo_data.append({
                    "photo_reference": photo_name,
                    "height": photo.get("heightPx"),
                    "width": photo.get("widthPx"),
                    # NOTE(review): embedding the API key in a persisted URL
                    # leaks the key into the output JSON -- consider building
                    # these media URLs on demand from photo_reference instead.
                    "url": f"https://places.googleapis.com/v1/{photo_name}/media?maxWidthPx=800&key={GOOGLE_PLACES_TOKEN}"
                })

    # Convert address components to the classic long/short-name shape.
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]

    result = {
        "place_id": place.get("id", ""),
        "name": display_name.get("text", ""),
        "formatted_address": place.get("formattedAddress"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # Add coordinates.
    # BUG FIX: compare against None instead of relying on truthiness so that
    # a legitimate 0.0 latitude or longitude is not silently dropped.
    if location.get("latitude") is not None and location.get("longitude") is not None:
        result["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }

    # Add optional fields; each key is omitted entirely when absent or empty.
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]
    if opening_hours:
        result["opening_hours"] = opening_hours
    if place.get("rating"):
        result["rating"] = place["rating"]
    if place.get("userRatingCount"):
        result["total_ratings"] = place["userRatingCount"]
    if reviews:
        result["reviews"] = reviews
    if photo_data:
        result["photos"] = photo_data
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")
    if address_components:
        result["address_components"] = address_components

    return result
|
|
|
|
|
|
def build_search_query(institution: Dict[str, Any]) -> Tuple[str, str]:
    """Build search query from institution data. Returns (query, region_code).

    The query is "<name>[, <city>][, <country name>]". The region code is the
    institution's explicit "country" value when present; otherwise it is
    inferred from the free-text "location" field, defaulting to "PS".
    """
    name = institution.get("name", "")
    city = institution.get("city", "")
    location = institution.get("location", "")
    # BUG FIX: the original defaulted country to "PS" here, which made the
    # location-based Lebanon/Palestine fallbacks below unreachable whenever
    # the "country" key was simply missing -- a Lebanese institution with no
    # country code was labeled "Palestine".
    country = institution.get("country", "")

    # Build query
    query_parts = [name]
    if city:
        query_parts.append(city)

    # Add country name for disambiguation
    country_names = {
        "PS": "Palestine",
        "LB": "Lebanon",
        "US": "United States",
    }
    if country in country_names:
        query_parts.append(country_names[country])
    elif location and "Lebanon" in location:
        country = "LB"
        query_parts.append("Lebanon")
    elif location and "Palestine" in location:
        country = "PS"
        query_parts.append("Palestine")
    elif not country:
        # No explicit code and no recognizable location: assume Palestine,
        # matching the original default.
        country = "PS"
        query_parts.append("Palestine")
    # Explicit but unrecognized codes fall through unchanged, with no
    # country label appended (original behavior).

    return ", ".join(filter(None, query_parts)), country
|
|
|
|
|
|
def should_enrich(institution: Dict[str, Any]) -> bool:
    """Decide whether an institution needs Google Maps enrichment.

    An institution is enriched when it has a name, is not a digital-only
    platform (location == "Online"), and has no existing enrichment data.
    """
    already_enriched = bool(institution.get("google_maps_enrichment"))
    online_only = institution.get("location") == "Online"
    has_name = bool(institution.get("name"))
    return has_name and not already_enriched and not online_only
|
|
|
|
|
|
def enrich_institutions(
    data: Dict[str, Any],
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Enrich all institutions in *data* with Google Maps data, in place.

    Args:
        data: Consolidated dataset; its "institutions" list is mutated.
        dry_run: When True, log the queries that would run but call no API.
        limit: Optional cap on how many candidates to process.

    Returns:
        Counters: total / enriched / not_found / skipped /
        already_enriched / errors.
    """
    institutions = data.get("institutions", [])
    stats = {
        "total": len(institutions),
        "enriched": 0,
        "not_found": 0,
        "skipped": 0,
        "already_enriched": 0,
        "errors": 0,
    }

    # Triage: collect candidates, counting everything else by reason.
    pending = []
    for record in institutions:
        if should_enrich(record):
            pending.append(record)
        elif record.get("google_maps_enrichment"):
            stats["already_enriched"] += 1
        else:
            stats["skipped"] += 1

    if limit:
        pending = pending[:limit]

    logger.info(f"Processing {len(pending)} institutions (of {stats['total']} total)")

    with httpx.Client(timeout=30.0) as client:
        for record in pending:
            try:
                query, region = build_search_query(record)
                bias = REGION_COORDS.get(region, REGION_COORDS["PS"])

                logger.info(f"Searching: {query}")

                if dry_run:
                    logger.info("  [DRY RUN] Would search Google Places API")
                    continue

                place = search_place(query, client, region_code=region, location_bias=bias)

                if place is None:
                    # Record the miss so reruns can audit the query used.
                    record["google_maps_status"] = "NOT_FOUND"
                    record["google_maps_search_query"] = query
                    stats["not_found"] += 1
                    logger.info(f"  Not found")
                else:
                    enrichment = parse_place_data(place)
                    record["google_maps_enrichment"] = enrichment
                    stats["enriched"] += 1

                    rating = enrichment.get("rating", "N/A")
                    reviews = enrichment.get("total_ratings", 0)
                    logger.info(f"  Found: {enrichment.get('name')} ({rating} stars, {reviews} reviews)")

                # Throttle between live API calls.
                time.sleep(REQUEST_DELAY)

            except Exception as e:
                logger.error(f"Error processing {record.get('name')}: {e}")
                stats["errors"] += 1

    return stats
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: load the consolidated JSON, enrich it, save, report.

    Returns:
        Process exit code: 0 on success, 1 when the data file is missing.
    """
    parser = argparse.ArgumentParser(
        description="Enrich Palestinian heritage institutions with Google Maps data"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't save changes")
    parser.add_argument("--limit", type=int, help="Limit number of institutions to process")

    args = parser.parse_args()

    # Consolidated dataset lives at <repo>/data/extracted/ relative to scripts/.
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"

    if not data_file.exists():
        logger.error(f"Data file not found: {data_file}")
        return 1

    logger.info(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Enrich (mutates data in place)
    stats = enrich_institutions(data, dry_run=args.dry_run, limit=args.limit)

    # Update metadata and save only when something actually changed.
    if not args.dry_run and stats["enriched"] > 0:
        metadata = data.setdefault("metadata", {})
        metadata["updated"] = datetime.now(timezone.utc).isoformat()
        metadata["version"] = "2.1.0"

        # BUG FIX: the original guarded with .get("statistics", {}) but then
        # indexed data["metadata"]["statistics"] directly, raising KeyError
        # when the "statistics" key was absent; setdefault creates it safely.
        statistics = metadata.setdefault("statistics", {})
        statistics["google_maps_enriched"] = stats["enriched"] + stats["already_enriched"]

        # Save
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved: {data_file}")

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total institutions: {stats['total']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"Newly enriched: {stats['enriched']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Skipped (online/no name): {stats['skipped']}")
    logger.info(f"Errors: {stats['errors']}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
|