#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with Google Maps data.

This script reads the KB ISIL library entries and enriches them with
Google Places API data.

Usage:
    python scripts/enrich_kb_libraries_google_maps.py [--dry-run] [--limit N]
"""

import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
import logging
import argparse
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration.
# NOTE(review): this check runs at import time, so the script exits before
# argparse can show --help when the token is missing. Kept as-is to preserve
# existing behavior.
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Rate limiting: seconds to sleep between API requests.
REQUEST_DELAY = 0.3

# Fields to request via the X-Goog-FieldMask header (Places API New names).
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "googleMapsUri",
    "primaryType",
    "shortFormattedAddress",
]


@dataclass
class GoogleMapsEnrichment:
    """Container for Google Maps data parsed from a Places API response."""
    place_id: str
    name: str
    formatted_address: Optional[str] = None
    short_address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    types: List[str] = field(default_factory=list)
    primary_type: Optional[str] = None
    business_status: Optional[str] = None
    national_phone_number: Optional[str] = None
    international_phone_number: Optional[str] = None
    website: Optional[str] = None
    opening_hours: Optional[Dict[str, Any]] = None
    rating: Optional[float] = None
    user_ratings_total: Optional[int] = None
    google_maps_url: Optional[str] = None
    address_components: Optional[List[Dict[str, Any]]] = None
    # UTC timestamp of when this record was fetched from the API.
    fetch_timestamp: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    api_status: str = "OK"


def search_place(
    query: str,
    client: httpx.Client,
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place using the Text Search API (New).

    Args:
        query: Free-text search query (e.g. "Bibliotheek X, City, Netherlands").
        client: Shared httpx client used for the POST request.
        location_bias: Optional (latitude, longitude) to bias results toward,
            applied as a 50 km circle.

    Returns:
        The first matching place dict from the API, or None when nothing is
        found or an error occurs (errors are logged, not raised).
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        # Field mask limits the response to the fields we actually use.
        "X-Goog-FieldMask": ",".join([f"places.{f}" for f in PLACE_FIELDS]),
    }
    body = {
        "textQuery": query,
        "languageCode": "nl",
        "regionCode": "NL",
        # NOTE(review): "maxResultCount" is the deprecated alias for
        # "pageSize" in Places API (New); kept for compatibility with the
        # original behavior — confirm against current API docs.
        "maxResultCount": 1,
    }
    if location_bias:
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": 50000.0
            }
        }
    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        data = response.json()
        places = data.get("places", [])
        if places:
            return places[0]
        else:
            logger.warning("No place found for query: %s", query)
            return None
    except httpx.HTTPStatusError as e:
        # Try to surface the API's own error message; fall back to the
        # exception text when the body is not JSON.
        error_data = {}
        try:
            error_data = e.response.json()
        except Exception:
            pass
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error("HTTP error searching for place: %s", error_msg)
        return None
    except Exception as e:
        logger.error("Error searching for '%s': %s", query, e)
        return None


def parse_place_data(place: Dict[str, Any]) -> GoogleMapsEnrichment:
    """Parse a Places API place dict into a GoogleMapsEnrichment.

    Converts the API's camelCase field names into the snake_case structure
    used in the YAML entries.
    """
    location = place.get("location", {})
    lat = location.get("latitude")
    lng = location.get("longitude")

    display_name = place.get("displayName", {})
    name = display_name.get("text", "")

    # Keep only the periods and human-readable weekday descriptions.
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "periods": opening_hours.get("periods"),
            "weekday_text": opening_hours.get("weekdayDescriptions"),
        }

    # Normalize address components to the classic long/short name keys.
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]

    return GoogleMapsEnrichment(
        place_id=place.get("id", ""),
        name=name,
        formatted_address=place.get("formattedAddress"),
        short_address=place.get("shortFormattedAddress"),
        latitude=lat,
        longitude=lng,
        types=place.get("types", []),
        primary_type=place.get("primaryType"),
        business_status=place.get("businessStatus"),
        national_phone_number=place.get("nationalPhoneNumber"),
        international_phone_number=place.get("internationalPhoneNumber"),
        website=place.get("websiteUri"),
        opening_hours=opening_hours,
        rating=place.get("rating"),
        user_ratings_total=place.get("userRatingCount"),
        google_maps_url=place.get("googleMapsUri"),
        address_components=address_components,
        api_status="OK",
    )


def enrichment_to_dict(enrichment: GoogleMapsEnrichment) -> Dict[str, Any]:
    """Convert GoogleMapsEnrichment to dictionary for YAML.

    Only non-empty fields are emitted, so the YAML entries stay compact.
    Coordinates are grouped under a single "coordinates" key.
    """
    result: Dict[str, Any] = {
        "place_id": enrichment.place_id,
        "name": enrichment.name,
        "fetch_timestamp": enrichment.fetch_timestamp,
        "api_status": enrichment.api_status,
    }
    if enrichment.latitude is not None and enrichment.longitude is not None:
        result["coordinates"] = {
            "latitude": enrichment.latitude,
            "longitude": enrichment.longitude,
        }
    if enrichment.formatted_address:
        result["formatted_address"] = enrichment.formatted_address
    if enrichment.short_address:
        result["short_address"] = enrichment.short_address
    if enrichment.address_components:
        result["address_components"] = enrichment.address_components
    if enrichment.national_phone_number:
        result["phone_local"] = enrichment.national_phone_number
    if enrichment.international_phone_number:
        result["phone_international"] = enrichment.international_phone_number
    if enrichment.website:
        result["website"] = enrichment.website
    if enrichment.types:
        result["google_place_types"] = enrichment.types
    if enrichment.primary_type:
        result["primary_type"] = enrichment.primary_type
    if enrichment.business_status:
        result["business_status"] = enrichment.business_status
    if enrichment.opening_hours:
        result["opening_hours"] = enrichment.opening_hours
    if enrichment.rating is not None:
        result["rating"] = enrichment.rating
    if enrichment.user_ratings_total is not None:
        result["total_ratings"] = enrichment.user_ratings_total
    if enrichment.google_maps_url:
        result["google_maps_url"] = enrichment.google_maps_url
    return result


def build_search_query(entry: Dict[str, Any]) -> str:
    """Build a search query from entry data.

    Prefers the KB enrichment name/city, falling back to the original
    entry's Dutch field names. Prepends "Bibliotheek" when the name does
    not already contain it, and always appends "Netherlands".
    """
    kb_enrichment = entry.get("kb_enrichment", {})
    original = entry.get("original_entry", {})

    # Get organization name from KB enrichment or original entry
    org_name = kb_enrichment.get("name") or original.get("organisatie", "")
    city = kb_enrichment.get("city") or original.get("plaatsnaam_bezoekadres", "")

    # Add "bibliotheek" if not in name
    if org_name and "bibliotheek" not in org_name.lower():
        org_name = f"Bibliotheek {org_name}"

    query_parts = [org_name]
    if city:
        query_parts.append(city)
    query_parts.append("Netherlands")

    # filter(None, ...) drops empty parts (e.g. missing org name).
    return ", ".join(filter(None, query_parts))


def process_kb_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Process all KB ISIL library entries.

    For each ``*_kb_isil.yaml`` file that has no ``google_maps_enrichment``
    yet, searches the Places API and writes the enrichment (or a NOT_FOUND
    marker) back into the file, unless ``dry_run`` is set.

    Args:
        entries_dir: Directory containing the ``*_kb_isil.yaml`` files.
        dry_run: When True, no files are written.
        limit: Maximum number of files to process; None means no limit.

    Returns:
        A stats dict with counts: total_files, already_enriched,
        newly_enriched, not_found, errors.
    """
    stats = {
        "total_files": 0,
        "already_enriched": 0,
        "newly_enriched": 0,
        "not_found": 0,
        "errors": 0,
    }

    # Find all KB ISIL files
    kb_files = sorted(entries_dir.glob("*_kb_isil.yaml"))
    stats["total_files"] = len(kb_files)
    # Fix: compare against None so an explicit --limit 0 processes nothing
    # (a bare truthiness check treated 0 as "no limit").
    if limit is not None:
        kb_files = kb_files[:limit]

    logger.info("Found %s KB library entries", stats['total_files'])
    logger.info("Processing %s files (limit: %s)", len(kb_files), limit or 'none')

    # Netherlands center for location bias
    NL_CENTER = (52.1326, 5.2913)

    with httpx.Client(timeout=30.0) as client:
        for yaml_file in kb_files:
            try:
                # Load entry
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    entry = yaml.safe_load(f)
                if not entry:
                    continue

                # Check if already has Google Maps enrichment
                if entry.get("google_maps_enrichment"):
                    stats["already_enriched"] += 1
                    continue

                # Build search query
                query = build_search_query(entry)
                logger.info("\nProcessing: %s", yaml_file.name)
                logger.info("  Query: %s", query)

                # Search for place
                place = search_place(query, client, location_bias=NL_CENTER)
                if not place:
                    # Record the failed lookup so the attempt is traceable;
                    # these entries will be retried on the next run because
                    # only google_maps_enrichment short-circuits above.
                    entry["google_maps_status"] = "NOT_FOUND"
                    entry["google_maps_search_query"] = query
                    entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                    stats["not_found"] += 1
                else:
                    # Parse place data
                    enrichment = parse_place_data(place)
                    entry["google_maps_enrichment"] = enrichment_to_dict(enrichment)
                    entry["google_maps_status"] = "SUCCESS"
                    entry["google_maps_search_query"] = query
                    logger.info("  -> Found: %s (%s★)", enrichment.name, enrichment.rating or 'no rating')
                    stats["newly_enriched"] += 1

                # Save if not dry run
                if not dry_run:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True,
                                  default_flow_style=False, sort_keys=False)

                # Rate limiting (only after an API call was actually made)
                time.sleep(REQUEST_DELAY)

            except Exception as e:
                # Boundary handler: one bad file must not abort the batch.
                logger.error("Error processing %s: %s", yaml_file.name, e)
                stats["errors"] += 1

    return stats


def main():
    """Main entry point.

    Parses CLI arguments, runs the enrichment, prints a summary, and (when
    not a dry run) writes a timestamped stats JSON next to the entries dir.

    Returns:
        0 on success, 1 when the entries directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Enrich KB library entries with Google Maps data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")

    if not args.entries_dir.exists():
        logger.error("Entries directory not found: %s", args.entries_dir)
        return 1

    # Process entries
    stats = process_kb_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
    )

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("GOOGLE MAPS ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info("Total KB library files: %s", stats['total_files'])
    logger.info("Already enriched: %s", stats['already_enriched'])
    logger.info("Newly enriched: %s", stats['newly_enriched'])
    logger.info("Not found: %s", stats['not_found'])
    logger.info("Errors: %s", stats['errors'])

    # Save stats
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"kb_google_maps_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "dry_run": args.dry_run,
                "limit": args.limit,
                **stats
            }, f, indent=2)
        logger.info("Stats saved to: %s", stats_file)

    return 0


if __name__ == "__main__":
    sys.exit(main())