glam/scripts/export_nde_map_json.py
kempersc 5cdce584b2 Add complete schema for heritage custodian observation reconstruction
- Introduced a comprehensive class diagram for the heritage custodian observation reconstruction schema.
- Defined multiple classes including AllocationAgency, ArchiveOrganizationType, AuxiliaryDigitalPlatform, and others, with relevant attributes and relationships.
- Established inheritance and associations among classes to represent complex relationships within the schema.
- Generated on 2025-11-28, version 0.9.0, excluding the Container class.
2025-11-28 13:13:23 +01:00

302 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Export NDE Enriched Institutions to JSON for Frontend Map

Reads the enriched YAML files and produces a lightweight JSON file
suitable for the React/Leaflet map component.
Now includes Google Maps enrichment data (ratings, photos, reviews, opening hours).
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

# Make project-local packages importable when this script is run directly.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# PyYAML is a third-party dependency; fail early with an actionable message.
try:
    import yaml
except ImportError:
    print("Error: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)
# Institution type mappings
TYPE_COLORS = {
'G': '#00bcd4', # Gallery - cyan
'L': '#2ecc71', # Library - green
'A': '#3498db', # Archive - blue
'M': '#e74c3c', # Museum - red
'O': '#f39c12', # Official - orange
'R': '#1abc9c', # Research - teal
'C': '#795548', # Corporation - brown
'U': '#9e9e9e', # Unknown - gray
'B': '#4caf50', # Botanical - green
'E': '#ff9800', # Education - amber
'S': '#9b59b6', # Society - purple
'F': '#95a5a6', # Features - gray
'I': '#673ab7', # Intangible - deep purple
'X': '#607d8b', # Mixed - blue gray
'P': '#ff5722', # Personal - deep orange
'H': '#607d8b', # Holy sites - blue gray
'D': '#34495e', # Digital - dark gray
'N': '#e91e63', # NGO - pink
'T': '#ff5722', # Taste/smell - deep orange
}
TYPE_NAMES = {
'G': 'Gallery',
'L': 'Library',
'A': 'Archive',
'M': 'Museum',
'O': 'Official',
'R': 'Research',
'C': 'Corporation',
'U': 'Unknown',
'B': 'Botanical',
'E': 'Education',
'S': 'Society',
'F': 'Features',
'I': 'Intangible',
'X': 'Mixed',
'P': 'Personal',
'H': 'Holy sites',
'D': 'Digital',
'N': 'NGO',
'T': 'Taste/smell',
}
def extract_institution_data(entry_data: dict) -> dict | None:
"""Extract the relevant data for the map from an enriched entry."""
# Get original entry data
original = entry_data.get('original_entry', {})
enrichment = entry_data.get('wikidata_enrichment', {})
google_maps = entry_data.get('google_maps_enrichment', {})
exa_data = entry_data.get('exa_enrichment', {})
# Get coordinates - prefer Google Maps (more precise), fall back to Wikidata
lat, lon = None, None
# Try Google Maps coordinates first
google_coords = google_maps.get('coordinates', {})
if google_coords.get('latitude') and google_coords.get('longitude'):
lat = google_coords['latitude']
lon = google_coords['longitude']
else:
# Fall back to Wikidata coordinates
wd_coords = enrichment.get('wikidata_coordinates', {})
if wd_coords.get('latitude') and wd_coords.get('longitude'):
lat = wd_coords['latitude']
lon = wd_coords['longitude']
# Skip if no coordinates from any source
if not lat or not lon:
return None
# Get institution type (first one if list)
types = original.get('type', [])
inst_type = types[0] if types else 'U'
# Get name - prefer Dutch label, fall back to original name
name = (
enrichment.get('wikidata_label_nl') or
original.get('organisatie') or
'Unknown Institution'
)
# Get city - prefer Google Maps short address
city = original.get('plaatsnaam_bezoekadres', '')
# Get description - prefer Dutch, fall back to English, then Exa, then Google editorial
# Handle various types safely
description = ''
if enrichment.get('wikidata_description_nl'):
description = enrichment['wikidata_description_nl']
elif enrichment.get('wikidata_description_en'):
description = enrichment['wikidata_description_en']
elif exa_data.get('description'):
description = exa_data['description']
else:
editorial = google_maps.get('editorial_summary')
if editorial and isinstance(editorial, dict):
description = editorial.get('text', '')
elif isinstance(editorial, str):
description = editorial
# Ensure description is a string
if not isinstance(description, str):
description = ''
# Get website - prefer Google Maps (more current), fall back to Wikidata
website = (
google_maps.get('website') or
enrichment.get('wikidata_official_website') or
original.get('webadres_organisatie') or
''
)
# Get Wikidata ID
wikidata_id = enrichment.get('wikidata_entity_id', '')
# Build result with base data
result = {
'lat': lat,
'lon': lon,
'name': name,
'city': city,
'type': inst_type,
'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
'website': website,
'wikidata_id': wikidata_id,
'description': description[:200] + '...' if len(description) > 200 else description,
}
# Add Google Maps enrichment data if available
if google_maps:
# Rating and reviews count
if google_maps.get('rating'):
result['rating'] = google_maps['rating']
result['total_ratings'] = google_maps.get('total_ratings', 0)
# Phone number
if google_maps.get('phone_international'):
result['phone'] = google_maps['phone_international']
elif google_maps.get('phone_local'):
result['phone'] = google_maps['phone_local']
# Formatted address (more complete than city)
if google_maps.get('formatted_address'):
result['address'] = google_maps['formatted_address']
# Opening hours (weekday text is human readable)
opening_hours = google_maps.get('opening_hours', {})
if opening_hours.get('weekday_text'):
result['opening_hours'] = opening_hours['weekday_text']
result['open_now'] = opening_hours.get('open_now', None)
# Reviews (first 3 for popups)
reviews = google_maps.get('reviews', [])
if reviews:
result['reviews'] = [
{
'author': r.get('author_name', 'Anonymous'),
'rating': r.get('rating', 0),
'text': r.get('text', '')[:300] + '...' if len(r.get('text', '')) > 300 else r.get('text', ''),
'time': r.get('relative_time_description', '')
}
for r in reviews[:3] # Only first 3 for popup
]
# Photos (first 5) - check both possible keys
photos = google_maps.get('photos', [])
photo_urls = google_maps.get('photo_urls', [])
if photo_urls:
# Direct URL format
result['photos'] = [{'url': url, 'attribution': ''} for url in photo_urls[:5]]
elif photos:
# Object format with attribution
result['photos'] = [
{
'url': p.get('url', ''),
'attribution': p.get('attributions', [''])[0] if p.get('attributions') else ''
}
for p in photos[:5]
]
# Street View URL
if google_maps.get('street_view_url'):
result['street_view_url'] = google_maps['street_view_url']
# Business status
if google_maps.get('business_status'):
result['business_status'] = google_maps['business_status']
# Google Place ID for linking
if google_maps.get('place_id'):
result['google_place_id'] = google_maps['place_id']
return result
def main():
    """Export all enriched YAML entries to a single JSON file for the map.

    Reads data/nde/enriched/entries/*.yaml, writes
    frontend/public/data/nde_institutions.json, and prints summary stats.
    """
    enriched_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries'
    output_dir = project_root / 'frontend' / 'public' / 'data'
    output_file = output_dir / 'nde_institutions.json'
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Reading enriched entries from: {enriched_dir}")
    institutions = []
    files_processed = 0
    files_with_coords = 0

    # Process all YAML files; a single bad file must not abort the export.
    for yaml_file in sorted(enriched_dir.glob('*.yaml')):
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry_data = yaml.safe_load(f)
            files_processed += 1
            # An empty YAML file parses to None; skip it explicitly rather
            # than letting extract_institution_data raise AttributeError.
            if not entry_data:
                continue
            inst_data = extract_institution_data(entry_data)
            if inst_data:
                institutions.append(inst_data)
                files_with_coords += 1
        except Exception as e:
            print(f"Warning: Error processing {yaml_file.name}: {e}")
            continue

    # Sort by name for stable, human-friendly output.
    institutions.sort(key=lambda x: x['name'].lower())

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Export complete!")
    print(f"  Files processed: {files_processed}")
    print(f"  Institutions with coordinates: {files_with_coords}")
    print(f"  Output file: {output_file}")

    # Distribution by institution type, most common first.
    type_counts = {}
    for inst in institutions:
        t = inst['type']
        type_counts[t] = type_counts.get(t, 0) + 1
    print(f"\n📊 Distribution by type:")
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"  {TYPE_NAMES.get(t, t)}: {count}")

    # Guard against ZeroDivisionError in the percentage stats when no
    # institution had coordinates.
    if not institutions:
        print("\n🗺️ No institutions exported; skipping Google Maps coverage stats.")
        return

    total = len(institutions)
    print(f"\n🗺️ Google Maps enrichment coverage:")
    for label, key in (
        ('ratings', 'rating'),
        ('photos', 'photos'),
        ('reviews', 'reviews'),
        ('opening hours', 'opening_hours'),
        ('Street View', 'street_view_url'),
    ):
        count = sum(1 for i in institutions if i.get(key))
        print(f"  With {label}: {count} ({count * 100 / total:.1f}%)")


if __name__ == '__main__':
    main()