glam/scripts/enrich_kb_libraries_google_maps.py
kempersc 30162e6526 Add script to validate KB library entries and generate enrichment report
- Implemented a Python script to validate KB library YAML files for required fields and data quality.
- Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
- Created a comprehensive markdown report summarizing validation results and enrichment quality.
- Included error handling for file loading and validation processes.
- Generated JSON statistics for further analysis.
2025-11-28 14:48:33 +01:00

415 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with Google Maps data.

This script reads the KB ISIL library entries and enriches them with
Google Places API data.

Usage:
    python scripts/enrich_kb_libraries_google_maps.py [--dry-run] [--limit N]
"""
import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
import logging
import argparse
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration.
# The API key is required; failing fast at import time keeps the rest of the
# script from issuing unauthenticated requests.
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Rate limiting: seconds to wait between successive API requests.
REQUEST_DELAY = 0.3

# Fields to request via the X-Goog-FieldMask header (Places API "New" names).
PLACE_FIELDS = [
    # Identity
    "id",
    "displayName",
    # Address
    "formattedAddress",
    "addressComponents",
    "location",
    # Classification / status
    "types",
    "businessStatus",
    # Contact details
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    # Ratings and links
    "rating",
    "userRatingCount",
    "googleMapsUri",
    "primaryType",
    "shortFormattedAddress",
]
@dataclass
class GoogleMapsEnrichment:
    """Container for Google Maps data.

    Field order matters: it defines the generated ``__init__`` signature used
    by ``parse_place_data``.
    """

    # Stable Google Place ID and the place's display name.
    place_id: str
    name: str
    # Address forms as returned by the API (all optional).
    formatted_address: Optional[str] = None
    short_address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    # Google place type classification.
    types: List[str] = field(default_factory=list)
    primary_type: Optional[str] = None
    business_status: Optional[str] = None
    # Contact details.
    national_phone_number: Optional[str] = None
    international_phone_number: Optional[str] = None
    website: Optional[str] = None
    # Opening hours dict with "periods" and "weekday_text" keys (see
    # parse_place_data), or None when the API returned none.
    opening_hours: Optional[Dict[str, Any]] = None
    # Aggregate user rating and count.
    rating: Optional[float] = None
    user_ratings_total: Optional[int] = None
    google_maps_url: Optional[str] = None
    address_components: Optional[List[Dict[str, Any]]] = None
    # When this record was fetched (UTC ISO 8601) and the API outcome.
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"
def search_place(
    query: str,
    client: httpx.Client,
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place using the Text Search API (New).

    Args:
        query: Free-text search query (name, city, country).
        client: Reusable httpx client for connection pooling.
        location_bias: Optional (lat, lng) to bias results toward; a 50 km
            circle is used.

    Returns:
        The first matching place dict from the API, or None when nothing was
        found or the request failed (errors are logged, never raised).
    """
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        # Field mask limits the response (and billing) to the fields we use.
        "X-Goog-FieldMask": ",".join(f"places.{field_name}" for field_name in PLACE_FIELDS),
    }
    payload: Dict[str, Any] = {
        "textQuery": query,
        "languageCode": "nl",
        "regionCode": "NL",
        "maxResultCount": 1,
    }
    if location_bias:
        bias_lat, bias_lng = location_bias
        payload["locationBias"] = {
            "circle": {
                "center": {"latitude": bias_lat, "longitude": bias_lng},
                "radius": 50000.0
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=request_headers, json=payload)
        response.raise_for_status()
        matches = response.json().get("places", [])
    except httpx.HTTPStatusError as exc:
        # Prefer the structured error message from the API body when present.
        details: Dict[str, Any] = {}
        try:
            details = exc.response.json()
        except Exception:
            pass
        message = details.get("error", {}).get("message", str(exc))
        logger.error(f"HTTP error searching for place: {message}")
        return None
    except Exception as exc:
        logger.error(f"Error searching for '{query}': {exc}")
        return None

    if not matches:
        logger.warning(f"No place found for query: {query}")
        return None
    return matches[0]
def parse_place_data(place: Dict[str, Any]) -> GoogleMapsEnrichment:
    """Parse a raw Places API response dict into a GoogleMapsEnrichment.

    Maps the API's camelCase keys to the snake_case fields our YAML uses;
    missing keys simply become None/empty defaults.
    """
    coords = place.get("location", {})

    # Normalize opening hours to our own two-key shape.
    hours = place.get("regularOpeningHours")
    if hours:
        hours = {
            "periods": hours.get("periods"),
            "weekday_text": hours.get("weekdayDescriptions"),
        }

    # Rename address-component keys to the classic long_name/short_name form.
    components = place.get("addressComponents")
    if components:
        components = [
            {
                "long_name": part.get("longText"),
                "short_name": part.get("shortText"),
                "types": part.get("types", []),
            }
            for part in components
        ]

    return GoogleMapsEnrichment(
        place_id=place.get("id", ""),
        name=place.get("displayName", {}).get("text", ""),
        formatted_address=place.get("formattedAddress"),
        short_address=place.get("shortFormattedAddress"),
        latitude=coords.get("latitude"),
        longitude=coords.get("longitude"),
        types=place.get("types", []),
        primary_type=place.get("primaryType"),
        business_status=place.get("businessStatus"),
        national_phone_number=place.get("nationalPhoneNumber"),
        international_phone_number=place.get("internationalPhoneNumber"),
        website=place.get("websiteUri"),
        opening_hours=hours,
        rating=place.get("rating"),
        user_ratings_total=place.get("userRatingCount"),
        google_maps_url=place.get("googleMapsUri"),
        address_components=components,
        api_status="OK",
    )
def enrichment_to_dict(enrichment: GoogleMapsEnrichment) -> Dict[str, Any]:
    """Convert GoogleMapsEnrichment to dictionary for YAML.

    Only populated fields are emitted; insertion order is deliberate because
    the YAML is dumped with sort_keys=False.
    """
    result: Dict[str, Any] = {
        "place_id": enrichment.place_id,
        "name": enrichment.name,
        "fetch_timestamp": enrichment.fetch_timestamp,
        "api_status": enrichment.api_status,
    }

    # Coordinates only make sense as a pair.
    if enrichment.latitude is not None and enrichment.longitude is not None:
        result["coordinates"] = {
            "latitude": enrichment.latitude,
            "longitude": enrichment.longitude,
        }

    # Optional fields skipped when falsy (None or empty), in output order.
    optional_fields = (
        ("formatted_address", enrichment.formatted_address),
        ("short_address", enrichment.short_address),
        ("address_components", enrichment.address_components),
        ("phone_local", enrichment.national_phone_number),
        ("phone_international", enrichment.international_phone_number),
        ("website", enrichment.website),
        ("google_place_types", enrichment.types),
        ("primary_type", enrichment.primary_type),
        ("business_status", enrichment.business_status),
        ("opening_hours", enrichment.opening_hours),
    )
    for key, value in optional_fields:
        if value:
            result[key] = value

    # Numeric fields use explicit None checks so 0 / 0.0 are preserved.
    if enrichment.rating is not None:
        result["rating"] = enrichment.rating
    if enrichment.user_ratings_total is not None:
        result["total_ratings"] = enrichment.user_ratings_total
    if enrichment.google_maps_url:
        result["google_maps_url"] = enrichment.google_maps_url

    return result
def build_search_query(entry: Dict[str, Any]) -> str:
    """Build a Places text-search query string from an entry dict.

    Prefers KB-enriched name/city over the raw original entry, prefixes
    "Bibliotheek" when the name does not already mention it, and always
    appends "Netherlands". Empty parts are dropped.
    """
    enriched = entry.get("kb_enrichment", {})
    raw = entry.get("original_entry", {})

    # KB enrichment takes precedence; fall back to the original CSV fields.
    name = enriched.get("name") or raw.get("organisatie", "")
    city = enriched.get("city") or raw.get("plaatsnaam_bezoekadres", "")

    # Make sure the query mentions it is a library.
    if name and "bibliotheek" not in name.lower():
        name = f"Bibliotheek {name}"

    parts = [name, city, "Netherlands"]
    return ", ".join(filter(None, parts))
def process_kb_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Process all KB ISIL library entries.

    For every ``*_kb_isil.yaml`` file that lacks a ``google_maps_enrichment``
    key, searches Google Places and writes the result (or a NOT_FOUND marker)
    back into the file.

    Args:
        entries_dir: Directory containing the YAML entry files.
        dry_run: When True, still queries the API but skips writing files.
        limit: Optional cap on files processed. NOTE(review): the cap is
            applied to the sorted file list before the already-enriched check,
            so already-enriched files count against it.

    Returns:
        Counters: total_files, already_enriched, newly_enriched, not_found,
        errors.
    """
    stats = {
        "total_files": 0,
        "already_enriched": 0,
        "newly_enriched": 0,
        "not_found": 0,
        "errors": 0,
    }
    # Find all KB ISIL files
    kb_files = sorted(entries_dir.glob("*_kb_isil.yaml"))
    stats["total_files"] = len(kb_files)
    if limit:
        kb_files = kb_files[:limit]
    logger.info(f"Found {stats['total_files']} KB library entries")
    logger.info(f"Processing {len(kb_files)} files (limit: {limit or 'none'})")
    # Netherlands center for location bias
    NL_CENTER = (52.1326, 5.2913)
    # One pooled HTTP client for the whole run.
    with httpx.Client(timeout=30.0) as client:
        for yaml_file in kb_files:
            try:
                # Load entry
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    entry = yaml.safe_load(f)
                if not entry:
                    continue
                # Check if already has Google Maps enrichment
                if entry.get("google_maps_enrichment"):
                    stats["already_enriched"] += 1
                    continue
                # Build search query
                query = build_search_query(entry)
                logger.info(f"\nProcessing: {yaml_file.name}")
                logger.info(f"  Query: {query}")
                # Search for place
                place = search_place(query, client, location_bias=NL_CENTER)
                if not place:
                    # Record the miss so the file documents what was tried.
                    entry["google_maps_status"] = "NOT_FOUND"
                    entry["google_maps_search_query"] = query
                    entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                    stats["not_found"] += 1
                else:
                    # Parse place data
                    enrichment = parse_place_data(place)
                    entry["google_maps_enrichment"] = enrichment_to_dict(enrichment)
                    entry["google_maps_status"] = "SUCCESS"
                    entry["google_maps_search_query"] = query
                    logger.info(f"  -> Found: {enrichment.name} ({enrichment.rating or 'no rating'}★)")
                    stats["newly_enriched"] += 1
                # Save if not dry run (both SUCCESS and NOT_FOUND outcomes
                # are written back).
                if not dry_run:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                # Rate limiting (always, since an API call was made above)
                time.sleep(REQUEST_DELAY)
            except Exception as e:
                # Best-effort batch: log, count, and continue with the next file.
                logger.error(f"Error processing {yaml_file.name}: {e}")
                stats["errors"] += 1
    return stats
def main():
    """Main entry point.

    Parses CLI arguments, runs the enrichment pass, logs a summary, and (for
    real runs) persists run statistics as a timestamped JSON file.

    Returns:
        0 on success, 1 when the entries directory does not exist.
    """
    default_entries_dir = Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries"

    arg_parser = argparse.ArgumentParser(
        description="Enrich KB library entries with Google Maps data"
    )
    arg_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done",
    )
    arg_parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process",
    )
    arg_parser.add_argument(
        "--entries-dir",
        type=Path,
        default=default_entries_dir,
        help="Path to entries directory",
    )
    opts = arg_parser.parse_args()

    if opts.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")
    if not opts.entries_dir.exists():
        logger.error(f"Entries directory not found: {opts.entries_dir}")
        return 1

    # Process entries
    stats = process_kb_entries(
        entries_dir=opts.entries_dir,
        dry_run=opts.dry_run,
        limit=opts.limit,
    )

    # Print summary
    separator = "=" * 60
    logger.info("\n" + separator)
    logger.info("GOOGLE MAPS ENRICHMENT COMPLETE")
    logger.info(separator)
    summary_rows = (
        ("Total KB library files", "total_files"),
        ("Already enriched", "already_enriched"),
        ("Newly enriched", "newly_enriched"),
        ("Not found", "not_found"),
        ("Errors", "errors"),
    )
    for label, key in summary_rows:
        logger.info(f"{label}: {stats[key]}")

    # Save stats for real runs only.
    if not opts.dry_run:
        stats_file = opts.entries_dir.parent / f"kb_google_maps_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        run_record = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "dry_run": opts.dry_run,
            "limit": opts.limit,
            **stats
        }
        with open(stats_file, 'w') as f:
            json.dump(run_record, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")

    return 0
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())