# Changelog notes carried over from the commit that landed this file:
# - Implemented a Python script to validate KB library YAML files for required fields and data quality.
# - Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
# - Created a comprehensive markdown report summarizing validation results and enrichment quality.
# - Included error handling for file loading and validation processes.
# - Generated JSON statistics for further analysis.
#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with Google Maps data.

This script reads the KB ISIL library entries and enriches them with
Google Places API data.

Usage:
    python scripts/enrich_kb_libraries_google_maps.py [--dry-run] [--limit N]
"""

import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
import logging
import argparse
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")

# Fail fast at import time: every request below needs the API key.
if not GOOGLE_PLACES_TOKEN:
    logger.error("GOOGLE_PLACES_TOKEN environment variable is required")
    logger.error("Please set it in your .env file or environment")
    sys.exit(1)

# API Endpoints - Using Places API (New)
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds slept between successive API calls

# Fields to request from the Places API; each entry becomes
# "places.<field>" in the X-Goog-FieldMask request header.
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "googleMapsUri",
    "primaryType",
    "shortFormattedAddress",
]
@dataclass
class GoogleMapsEnrichment:
    """Container for Google Maps data.

    Holds the subset of Places API (New) fields requested via PLACE_FIELDS,
    flattened to snake_case. Serialized for YAML by enrichment_to_dict().
    """

    # Stable Google place identifier (from the API's "id" field).
    place_id: str
    # Display-name text as returned by the API.
    name: str
    formatted_address: Optional[str] = None
    short_address: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    # All place types reported by Google (e.g. "library").
    types: List[str] = field(default_factory=list)
    primary_type: Optional[str] = None
    # e.g. OPERATIONAL / CLOSED_* — passed through verbatim from the API.
    business_status: Optional[str] = None
    national_phone_number: Optional[str] = None
    international_phone_number: Optional[str] = None
    website: Optional[str] = None
    # Reduced to {"periods": ..., "weekday_text": ...} by parse_place_data().
    opening_hours: Optional[Dict[str, Any]] = None
    rating: Optional[float] = None
    user_ratings_total: Optional[int] = None
    google_maps_url: Optional[str] = None
    # Normalized list of {"long_name", "short_name", "types"} dicts.
    address_components: Optional[List[Dict[str, Any]]] = None
    # UTC ISO timestamp recorded when this object is created.
    fetch_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    api_status: str = "OK"
def search_place(
    query: str,
    client: httpx.Client,
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place using the Text Search API (New).

    Args:
        query: Free-text search string (name, city, country).
        client: Shared httpx client used for the POST request.
        location_bias: Optional (lat, lng) center for a 50 km bias circle.

    Returns:
        The first place dict from the response, or None when nothing was
        found or the request failed (errors are logged, never raised).
    """
    field_mask = ",".join([f"places.{name}" for name in PLACE_FIELDS])
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }

    payload: Dict[str, Any] = {
        "textQuery": query,
        "languageCode": "nl",
        "regionCode": "NL",
        "maxResultCount": 1,
    }
    if location_bias:
        bias_lat, bias_lng = location_bias
        # 50 km circle steers ambiguous matches toward the given center.
        payload["locationBias"] = {
            "circle": {
                "center": {"latitude": bias_lat, "longitude": bias_lng},
                "radius": 50000.0,
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=request_headers, json=payload)
        response.raise_for_status()
        candidates = response.json().get("places", [])
        if not candidates:
            logger.warning(f"No place found for query: {query}")
            return None
        return candidates[0]

    except httpx.HTTPStatusError as e:
        # Try to surface Google's structured error message; fall back to
        # the exception text when the body is not JSON.
        try:
            error_data = e.response.json()
        except Exception:
            error_data = {}
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"HTTP error searching for place: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
def parse_place_data(place: Dict[str, Any]) -> GoogleMapsEnrichment:
    """Parse place data from API response.

    Flattens the nested Places API (New) response dict into a
    GoogleMapsEnrichment instance, renaming camelCase API fields to
    snake_case along the way.
    """
    coords = place.get("location", {})

    # Keep only the two opening-hours views we care about.
    hours = place.get("regularOpeningHours")
    if hours:
        hours = {
            "periods": hours.get("periods"),
            "weekday_text": hours.get("weekdayDescriptions"),
        }

    # Normalize address components to the classic long/short/types shape.
    components = place.get("addressComponents")
    if components:
        components = [
            {
                "long_name": part.get("longText"),
                "short_name": part.get("shortText"),
                "types": part.get("types", []),
            }
            for part in components
        ]

    return GoogleMapsEnrichment(
        place_id=place.get("id", ""),
        name=place.get("displayName", {}).get("text", ""),
        formatted_address=place.get("formattedAddress"),
        short_address=place.get("shortFormattedAddress"),
        latitude=coords.get("latitude"),
        longitude=coords.get("longitude"),
        types=place.get("types", []),
        primary_type=place.get("primaryType"),
        business_status=place.get("businessStatus"),
        national_phone_number=place.get("nationalPhoneNumber"),
        international_phone_number=place.get("internationalPhoneNumber"),
        website=place.get("websiteUri"),
        opening_hours=hours,
        rating=place.get("rating"),
        user_ratings_total=place.get("userRatingCount"),
        google_maps_url=place.get("googleMapsUri"),
        address_components=components,
        api_status="OK",
    )
def enrichment_to_dict(enrichment: GoogleMapsEnrichment) -> Dict[str, Any]:
    """Convert GoogleMapsEnrichment to dictionary for YAML.

    Always emits place_id, name, fetch_timestamp and api_status; every other
    field is included only when populated. Insertion order is deliberate —
    YAML dumps with sort_keys=False preserve it.
    """
    payload: Dict[str, Any] = {
        "place_id": enrichment.place_id,
        "name": enrichment.name,
        "fetch_timestamp": enrichment.fetch_timestamp,
        "api_status": enrichment.api_status,
    }

    # Coordinates only when both halves are present.
    if enrichment.latitude is not None and enrichment.longitude is not None:
        payload["coordinates"] = {
            "latitude": enrichment.latitude,
            "longitude": enrichment.longitude,
        }

    # Truthiness-gated fields, in the exact output order.
    optional_fields = (
        ("formatted_address", enrichment.formatted_address),
        ("short_address", enrichment.short_address),
        ("address_components", enrichment.address_components),
        ("phone_local", enrichment.national_phone_number),
        ("phone_international", enrichment.international_phone_number),
        ("website", enrichment.website),
        ("google_place_types", enrichment.types),
        ("primary_type", enrichment.primary_type),
        ("business_status", enrichment.business_status),
        ("opening_hours", enrichment.opening_hours),
    )
    for key, value in optional_fields:
        if value:
            payload[key] = value

    # Ratings may legitimately be 0, so gate on None rather than truthiness.
    if enrichment.rating is not None:
        payload["rating"] = enrichment.rating
    if enrichment.user_ratings_total is not None:
        payload["total_ratings"] = enrichment.user_ratings_total

    if enrichment.google_maps_url:
        payload["google_maps_url"] = enrichment.google_maps_url

    return payload
def build_search_query(entry: Dict[str, Any]) -> str:
    """Build a search query from entry data.

    Prefers the KB-enriched name/city over the original ISIL record, adds a
    "Bibliotheek" prefix when the name does not already mention it, and
    always anchors the query with "Netherlands".

    Args:
        entry: Parsed YAML entry. The "kb_enrichment" and "original_entry"
            sub-dicts may each be missing or explicitly null.

    Returns:
        Comma-separated query, e.g. "Bibliotheek X, City, Netherlands";
        falls back to "Netherlands" when no name/city is available.
    """
    # Bug fix: `entry.get(key, {})` still returns None when the key exists
    # with a null value in the YAML, which would crash on `.get` below.
    kb_enrichment = entry.get("kb_enrichment") or {}
    original = entry.get("original_entry") or {}

    # Get organization name from KB enrichment or original entry
    org_name = kb_enrichment.get("name") or original.get("organisatie", "")
    city = kb_enrichment.get("city") or original.get("plaatsnaam_bezoekadres", "")

    # Add "bibliotheek" if not in name
    if org_name and "bibliotheek" not in org_name.lower():
        org_name = f"Bibliotheek {org_name}"

    query_parts = [org_name]
    if city:
        query_parts.append(city)
    query_parts.append("Netherlands")

    # filter(None, ...) drops an empty org_name so we never emit ", City".
    return ", ".join(filter(None, query_parts))
def process_kb_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Process all KB ISIL library entries.

    Iterates over every ``*_kb_isil.yaml`` file in ``entries_dir``, queries
    the Google Places Text Search API for entries that do not yet have a
    ``google_maps_enrichment`` section, and writes the result back into the
    same YAML file (unless ``dry_run``).

    Args:
        entries_dir: Directory containing the per-library YAML files.
        dry_run: When True, search but never write files back.
        limit: Optional cap on the number of files processed this run.

    Returns:
        Counter dict with keys: total_files, already_enriched,
        newly_enriched, not_found, errors. ``total_files`` counts all
        matching files, even those skipped by ``limit``.
    """
    stats = {
        "total_files": 0,
        "already_enriched": 0,
        "newly_enriched": 0,
        "not_found": 0,
        "errors": 0,
    }

    # Find all KB ISIL files (sorted so runs are deterministic/resumable).
    kb_files = sorted(entries_dir.glob("*_kb_isil.yaml"))
    stats["total_files"] = len(kb_files)

    if limit:
        kb_files = kb_files[:limit]

    logger.info(f"Found {stats['total_files']} KB library entries")
    logger.info(f"Processing {len(kb_files)} files (limit: {limit or 'none'})")

    # Netherlands center for location bias
    NL_CENTER = (52.1326, 5.2913)

    with httpx.Client(timeout=30.0) as client:
        for yaml_file in kb_files:
            # Per-file try block: one bad file/request only bumps "errors"
            # and never aborts the whole run.
            try:
                # Load entry
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    entry = yaml.safe_load(f)

                # Skip empty/whitespace-only YAML files.
                if not entry:
                    continue

                # Check if already has Google Maps enrichment
                if entry.get("google_maps_enrichment"):
                    stats["already_enriched"] += 1
                    continue

                # Build search query
                query = build_search_query(entry)
                logger.info(f"\nProcessing: {yaml_file.name}")
                logger.info(f" Query: {query}")

                # Search for place
                place = search_place(query, client, location_bias=NL_CENTER)

                if not place:
                    # NOTE(review): no "google_maps_enrichment" key is set
                    # here, so NOT_FOUND entries are re-queried on every
                    # subsequent run — confirm this is intended.
                    entry["google_maps_status"] = "NOT_FOUND"
                    entry["google_maps_search_query"] = query
                    entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                    stats["not_found"] += 1
                else:
                    # Parse place data
                    enrichment = parse_place_data(place)
                    entry["google_maps_enrichment"] = enrichment_to_dict(enrichment)
                    entry["google_maps_status"] = "SUCCESS"
                    entry["google_maps_search_query"] = query

                    logger.info(f" -> Found: {enrichment.name} ({enrichment.rating or 'no rating'}★)")
                    stats["newly_enriched"] += 1

                # Save if not dry run
                if not dry_run:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

                # Rate limiting (applies in dry-run too, since the API call
                # above was still made).
                time.sleep(REQUEST_DELAY)

            except Exception as e:
                logger.error(f"Error processing {yaml_file.name}: {e}")
                stats["errors"] += 1

    return stats
def main():
    """Main entry point.

    Parses CLI options, runs the enrichment pass, logs a summary, and (when
    not a dry run) writes a timestamped JSON stats file next to the entries
    directory. Returns a process exit code (0 on success, 1 on bad input).
    """
    parser = argparse.ArgumentParser(
        description="Enrich KB library entries with Google Maps data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done",
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process",
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory",
    )
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")

    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1

    # Process entries
    stats = process_kb_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
    )

    # Print summary
    banner = "=" * 60
    logger.info("\n" + banner)
    logger.info("GOOGLE MAPS ENRICHMENT COMPLETE")
    logger.info(banner)
    for label, key in (
        ("Total KB library files", "total_files"),
        ("Already enriched", "already_enriched"),
        ("Newly enriched", "newly_enriched"),
        ("Not found", "not_found"),
        ("Errors", "errors"),
    ):
        logger.info(f"{label}: {stats[key]}")

    # Save stats alongside the entries directory (skipped in dry-run).
    if not args.dry_run:
        run_tag = datetime.now().strftime('%Y%m%d_%H%M%S')
        stats_file = args.entries_dir.parent / f"kb_google_maps_enrichment_stats_{run_tag}.json"
        summary = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "dry_run": args.dry_run,
            "limit": args.limit,
            **stats,
        }
        with open(stats_file, 'w') as f:
            json.dump(summary, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())