glam/scripts/export_nde_stats_json.py

#!/usr/bin/env python3
"""
Export NDE Statistics to JSON for Frontend Visualizations

Reads the enriched YAML files and produces a comprehensive statistics JSON
suitable for D3.js visualizations in the React frontend.
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from collections import Counter, defaultdict
import sys

# Add project root to path for imports
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    import yaml
except ImportError:
    print("Error: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# Netherlands Province bounding boxes (approximate) for coordinate lookup
# Format: (min_lat, max_lat, min_lon, max_lon)
PROVINCE_BOUNDS = {
    'Groningen': (53.05, 53.55, 6.15, 7.25),
    'Friesland': (52.85, 53.50, 5.05, 6.35),
    'Drenthe': (52.65, 53.15, 6.15, 7.10),
    'Overijssel': (52.15, 52.85, 5.75, 7.10),
    'Flevoland': (52.25, 52.75, 5.15, 6.00),
    'Gelderland': (51.75, 52.55, 5.05, 6.85),
    'Utrecht': (51.95, 52.35, 4.75, 5.65),
    'Noord-Holland': (52.25, 53.00, 4.50, 5.35),
    'Zuid-Holland': (51.65, 52.35, 3.85, 5.00),
    'Zeeland': (51.20, 51.75, 3.35, 4.30),
    'Noord-Brabant': (51.25, 51.85, 4.35, 6.05),
    'Limburg': (50.75, 51.80, 5.55, 6.25),
}

# Province colors for visualization
PROVINCE_COLORS = {
    'Groningen': '#1f77b4',
    'Friesland': '#ff7f0e',
    'Drenthe': '#2ca02c',
    'Overijssel': '#d62728',
    'Flevoland': '#9467bd',
    'Gelderland': '#8c564b',
    'Utrecht': '#e377c2',
    'Noord-Holland': '#7f7f7f',
    'Zuid-Holland': '#bcbd22',
    'Zeeland': '#17becf',
    'Noord-Brabant': '#aec7e8',
    'Limburg': '#ffbb78',
}


def get_province_from_coords(lat: float, lon: float) -> str | None:
    """Determine the Dutch province from coordinates using bounding box lookup."""
    if not lat or not lon:
        return None

    for province, (min_lat, max_lat, min_lon, max_lon) in PROVINCE_BOUNDS.items():
        if min_lat <= lat <= max_lat and min_lon <= lon <= max_lon:
            return province

    # Fallback for edge cases - check if coordinates are in Netherlands at all
    if 50.7 <= lat <= 53.6 and 3.3 <= lon <= 7.3:
        # In Netherlands but didn't match bounds - find closest province
        best_province = None
        best_distance = float('inf')
        for province, (min_lat, max_lat, min_lon, max_lon) in PROVINCE_BOUNDS.items():
            center_lat = (min_lat + max_lat) / 2
            center_lon = (min_lon + max_lon) / 2
            distance = ((lat - center_lat) ** 2 + (lon - center_lon) ** 2) ** 0.5
            if distance < best_distance:
                best_distance = distance
                best_province = province
        return best_province

    return None


# Institution type mappings
TYPE_INFO = {
    'G': {'name': 'Gallery', 'color': '#00bcd4'},
    'L': {'name': 'Library', 'color': '#2ecc71'},
    'A': {'name': 'Archive', 'color': '#3498db'},
    'M': {'name': 'Museum', 'color': '#e74c3c'},
    'O': {'name': 'Official', 'color': '#f39c12'},
    'R': {'name': 'Research', 'color': '#1abc9c'},
    'C': {'name': 'Corporation', 'color': '#795548'},
    'U': {'name': 'Unknown', 'color': '#9e9e9e'},
    'B': {'name': 'Botanical', 'color': '#4caf50'},
    'E': {'name': 'Education', 'color': '#ff9800'},
    'S': {'name': 'Society', 'color': '#9b59b6'},
    'F': {'name': 'Features', 'color': '#95a5a6'},
    'I': {'name': 'Intangible', 'color': '#673ab7'},
    'X': {'name': 'Mixed', 'color': '#607d8b'},
    'P': {'name': 'Personal', 'color': '#ff5722'},
    'H': {'name': 'Holy sites', 'color': '#607d8b'},
    'D': {'name': 'Digital', 'color': '#34495e'},
    'N': {'name': 'NGO', 'color': '#e91e63'},
    'T': {'name': 'Taste/smell', 'color': '#ff5722'},
}


def process_enriched_files(enriched_dir: Path) -> dict:
    """Process all enriched YAML files and collect statistics."""

    stats = {
        'total_entries': 0,
        'enrichment_status': Counter(),
        'enrichment_sources': Counter(),  # Track which sources enriched each entry
        'institution_types': Counter(),
        'cities': Counter(),
        'provinces': Counter(),
        'provinces_by_type': defaultdict(lambda: Counter()),  # province -> {type: count}
        'collection_systems': Counter(),
        'wikidata_types': Counter(),
        'identifiers': {
            'has_coordinates': 0,
            'has_isil': 0,
            'has_wikipedia_nl': 0,
            'has_image': 0,
            'has_website': 0,
        },
        'google_maps': {
            'has_rating': 0,
            'has_photos': 0,
            'has_reviews': 0,
            'has_opening_hours': 0,
            'has_street_view': 0,
            'status_success': 0,
            'status_not_found': 0,
        },
        # New enrichment source tracking
        'new_sources': {
            'has_nan_isil': 0,
            'has_museum_register': 0,
            'has_ghcid': 0,
            'has_web_claims': 0,
            'has_social_media': 0,
            'has_verified_name': 0,
        },
        'founding_decades': Counter(),
        'enriched_count': 0,
        'not_enriched_count': 0,
        # New: Rating distribution for histogram
        'rating_distribution': [],  # List of (rating, review_count, type) tuples
        # New: Type aggregates for bubble chart
        'type_rating_stats': defaultdict(lambda: {'ratings': [], 'review_counts': [], 'count': 0}),
        # New: Museum register by province
        'museum_register_provinces': Counter(),
        # New: Social media platforms
        'social_media_platforms': Counter(),
        # New: Enrichment certainty tracking
        'certainty': {
            'google_maps_invalid': [],  # Entries where Google Maps found wrong entity
            'low_name_confidence': [],  # custodian_name.confidence < 0.5
            'medium_name_confidence': [],  # 0.5 <= custodian_name.confidence < 0.8
            'high_name_confidence': [],  # custodian_name.confidence >= 0.8
            # NA ISIL (Nationaal Archief) - for archives
            'low_na_isil_confidence': [],  # nan_isil_enrichment.match_confidence < 0.8
            'high_na_isil_confidence': [],  # nan_isil_enrichment.match_confidence >= 0.8
            # KB ISIL (Koninklijke Bibliotheek) - for libraries
            'has_kb_isil': [],  # Entries with KB ISIL (authoritative, no confidence needed)
            'no_name_confidence': [],  # No custodian_name verification
        },
    }

    yaml_files = sorted(enriched_dir.glob('*.yaml'))

    for yaml_file in yaml_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            stats['total_entries'] += 1

            # Track enrichment sources for this entry
            entry_sources = []

            # Enrichment status (legacy)
            status = entry.get('enrichment_status', 'unknown')
            stats['enrichment_status'][status] += 1

            # Original entry data
            original = entry.get('original_entry', {})

            # Institution type
            types = original.get('type', [])
            if types:
                stats['institution_types'][types[0]] += 1

            # City
            city = original.get('plaatsnaam_bezoekadres', '')
            if city:
                stats['cities'][city] += 1

            # Collection system
            system = original.get('collectiebeheersysteem', '')
            if system and system.strip():
                stats['collection_systems'][system] += 1

            # Check for ISIL in original data
            if original.get('isil-code_na') or original.get('isil_code'):
                entry_sources.append('ISIL')
                stats['identifiers']['has_isil'] += 1

            # Check for website in original data
            if original.get('webadres_organisatie'):
                entry_sources.append('Website')

            # Wikidata enrichment
            enrichment = entry.get('wikidata_enrichment', {})
            has_wikidata = bool(enrichment and enrichment.get('wikidata_entity_id'))

            if has_wikidata:
                entry_sources.append('Wikidata')

            # Coordinates and Province (from Wikidata)
            coords = enrichment.get('wikidata_coordinates', {})
            inst_type = types[0] if types else 'U'
            province = None  # Initialize province for later use
            if coords and coords.get('latitude'):
                stats['identifiers']['has_coordinates'] += 1
                # Determine province from coordinates
                lat = coords.get('latitude')
                lon = coords.get('longitude')
                province = get_province_from_coords(lat, lon)
                if province:
                    stats['provinces'][province] += 1
                    stats['provinces_by_type'][province][inst_type] += 1

            # Website from Wikidata
            if enrichment.get('wikidata_official_website'):
                stats['identifiers']['has_website'] += 1

            # Image
            if enrichment.get('wikidata_image'):
                stats['identifiers']['has_image'] += 1

            # Wikipedia NL
            sitelinks = enrichment.get('wikidata_sitelinks', {})
            if 'nlwiki' in sitelinks:
                stats['identifiers']['has_wikipedia_nl'] += 1

            # ISIL from Wikidata
            wd_identifiers = enrichment.get('wikidata_identifiers', {})
            if 'isil' in wd_identifiers:
                if 'ISIL' not in entry_sources:
                    entry_sources.append('ISIL')
                    stats['identifiers']['has_isil'] += 1

            # Wikidata instance types
            instance_of = enrichment.get('wikidata_instance_of', [])
            for inst in instance_of:
                label = inst.get('label_en', inst.get('label_nl', 'Unknown'))
                stats['wikidata_types'][label] += 1

            # Google Maps enrichment
            google = entry.get('google_maps_enrichment', {})
            has_google_maps = False
            if google:
                # Check API status or presence of place_id
                api_status = google.get('api_status', '')
                if api_status == 'OK' or google.get('place_id'):
                    stats['google_maps']['status_success'] += 1
                    has_google_maps = True
                    entry_sources.append('Google Maps')
                elif api_status == 'NOT_FOUND':
                    stats['google_maps']['status_not_found'] += 1

                if google.get('rating'):
                    stats['google_maps']['has_rating'] += 1
                    # Collect rating data for histogram and bubble chart
                    rating = google.get('rating')
                    review_count = google.get('total_ratings', 0) or 0
                    # Get city and province for scatter plot filtering
                    city = original.get('plaats', original.get('city', ''))
                    prov = province if province else ''
                    # Get GHCID for linking to map
                    ghcid_uuid = ''
                    ghcid_current = ''
                    ghcid_entry = entry.get('ghcid', {})
                    if ghcid_entry:
                        ghcid_uuid = ghcid_entry.get('ghcid_uuid', '')
                        ghcid_current = ghcid_entry.get('ghcid_current', '')

                    stats['rating_distribution'].append({
                        'rating': rating,
                        'reviews': review_count,
                        'type': inst_type,
                        'name': original.get('naam_organisatie', yaml_file.stem),
                        'city': city,
                        'province': prov,
                        'ghcid_uuid': ghcid_uuid,
                        'ghcid_current': ghcid_current,
                    })
                    # Aggregate by type for bubble chart
                    stats['type_rating_stats'][inst_type]['ratings'].append(rating)
                    stats['type_rating_stats'][inst_type]['review_counts'].append(review_count)
                    stats['type_rating_stats'][inst_type]['count'] += 1
                # Check both 'photos' and 'photo_urls' fields
                if google.get('photos') or google.get('photo_urls'):
                    stats['google_maps']['has_photos'] += 1
                if google.get('reviews'):
                    stats['google_maps']['has_reviews'] += 1
                if google.get('opening_hours'):
                    stats['google_maps']['has_opening_hours'] += 1
                # Check for coordinates (lat/lon or coordinates dict)
                if google.get('coordinates') or google.get('latitude'):
                    stats['google_maps']['has_street_view'] += 1
                    # Also count as having coordinates if we don't have Wikidata coords
                    if not (coords and coords.get('latitude')):
                        gm_lat = google.get('latitude') or (google.get('coordinates', {}) or {}).get('latitude')
                        gm_lon = google.get('longitude') or (google.get('coordinates', {}) or {}).get('longitude')
                        if gm_lat and gm_lon:
                            stats['identifiers']['has_coordinates'] += 1
                            province = get_province_from_coords(gm_lat, gm_lon)
                            if province:
                                stats['provinces'][province] += 1
                                stats['provinces_by_type'][province][inst_type] += 1

            # Track new enrichment sources
            nan_isil = entry.get('nan_isil_enrichment', {})
            if isinstance(nan_isil, dict) and nan_isil.get('isil_code'):
                stats['new_sources']['has_nan_isil'] += 1
                if 'ISIL (NA)' not in entry_sources:
                    entry_sources.append('ISIL (NA)')

            museum_register = entry.get('museum_register_enrichment', {})
            if isinstance(museum_register, dict) and museum_register.get('museum_name'):
                stats['new_sources']['has_museum_register'] += 1
                if 'Museum Register' not in entry_sources:
                    entry_sources.append('Museum Register')
                mr_province = museum_register.get('province', '')
                if mr_province:
                    stats['museum_register_provinces'][mr_province] += 1

            ghcid_data = entry.get('ghcid', {})
            if isinstance(ghcid_data, dict) and ghcid_data.get('ghcid_current'):
                stats['new_sources']['has_ghcid'] += 1

            web_claims = entry.get('web_claims', {})
            if isinstance(web_claims, dict) and web_claims.get('claims'):
                stats['new_sources']['has_web_claims'] += 1
                entry_sources.append('Web Claims')
                # Track social media platforms
                for claim in web_claims.get('claims', []):
                    claim_type = claim.get('claim_type', '')
                    if claim_type.startswith('social_'):
                        platform = claim_type.replace('social_', '').capitalize()
                        stats['social_media_platforms'][platform] += 1
                        if not stats['new_sources'].get('has_social_media'):
                            stats['new_sources']['has_social_media'] = 0
                        # Only count once per institution
                if any(c.get('claim_type', '').startswith('social_') for c in web_claims.get('claims', [])):
                    stats['new_sources']['has_social_media'] += 1

            custodian_name = entry.get('custodian_name', {})
            if isinstance(custodian_name, dict) and custodian_name.get('claim_value'):
                stats['new_sources']['has_verified_name'] += 1

            # ============================================
            # Enrichment Certainty Tracking
            # ============================================
            # Build entry info for linking to map
            entry_info = {
                'name': original.get('organisatie', original.get('naam_organisatie', yaml_file.stem)),
                'ghcid_uuid': ghcid_data.get('ghcid_uuid', '') if ghcid_data else '',
                'ghcid_current': ghcid_data.get('ghcid_current', '') if ghcid_data else '',
                'type': inst_type,
                'city': original.get('plaatsnaam_bezoekadres', original.get('plaats', '')),
                'file': yaml_file.stem,
            }

            # Track Google Maps invalid matches
            if entry.get('google_maps_match_invalid'):
                entry_info['reason'] = entry.get('google_maps_match_invalid_reason', 'Google Maps found wrong entity')
                entry_info['google_maps_name'] = google.get('name', '') if google else ''
                stats['certainty']['google_maps_invalid'].append(entry_info.copy())

            # Track name confidence levels
            name_conf = custodian_name.get('confidence') if isinstance(custodian_name, dict) else None
            if name_conf is not None:
                entry_info['confidence'] = name_conf
                if name_conf < 0.5:
                    stats['certainty']['low_name_confidence'].append(entry_info.copy())
                elif name_conf < 0.8:
                    stats['certainty']['medium_name_confidence'].append(entry_info.copy())
                else:
                    stats['certainty']['high_name_confidence'].append(entry_info.copy())
            else:
                stats['certainty']['no_name_confidence'].append(entry_info.copy())

            # Track NA ISIL match confidence (Nationaal Archief - archives)
            na_isil_conf = nan_isil.get('match_confidence') if isinstance(nan_isil, dict) else None
            if na_isil_conf is not None:
                entry_info['isil_confidence'] = na_isil_conf
                entry_info['isil_code'] = nan_isil.get('isil_code', '')
                entry_info['isil_source'] = 'NA'
                if na_isil_conf < 0.8:
                    stats['certainty']['low_na_isil_confidence'].append(entry_info.copy())
                else:
                    stats['certainty']['high_na_isil_confidence'].append(entry_info.copy())

            # Track KB ISIL (Koninklijke Bibliotheek - libraries)
            kb_enrichment = entry.get('kb_enrichment', {})
            if isinstance(kb_enrichment, dict) and kb_enrichment.get('isil_code'):
                entry_info['isil_code'] = kb_enrichment.get('isil_code', '')
                entry_info['isil_source'] = 'KB'
                entry_info['registry'] = kb_enrichment.get('registry', 'KB Netherlands Library Network')
                stats['certainty']['has_kb_isil'].append(entry_info.copy())
                # Also track in new_sources
                if not stats['new_sources'].get('has_kb_isil'):
                    stats['new_sources']['has_kb_isil'] = 0
                stats['new_sources']['has_kb_isil'] += 1
                if 'ISIL (KB)' not in entry_sources:
                    entry_sources.append('ISIL (KB)')

            # Count enrichment sources
            for source in entry_sources:
                stats['enrichment_sources'][source] += 1

            # Determine if entry is enriched (has any external data source)
            is_enriched = len(entry_sources) > 0
            if is_enriched:
                stats['enriched_count'] += 1
            else:
                stats['not_enriched_count'] += 1

            # Founding date / inception
            inception = enrichment.get('wikidata_inception', {})
            if inception and inception.get('time'):
                time_str = inception['time']
                try:
                    # Extract year from time string like "+1815-00-00T00:00:00Z"
                    year = int(time_str[1:5])
                    decade = (year // 10) * 10
                    stats['founding_decades'][decade] += 1
                except (ValueError, IndexError):
                    pass

        except Exception as e:
            print(f"Warning: Error processing {yaml_file.name}: {e}")
            continue

    return stats


def format_for_d3(stats: dict) -> dict:
    """Format statistics for D3.js visualizations."""

    total = stats['total_entries']

    # Institution types for pie/donut chart
    type_data = []
    for code, count in sorted(stats['institution_types'].items(), key=lambda x: -x[1]):
        info = TYPE_INFO.get(code, {'name': code, 'color': '#9e9e9e'})
        type_data.append({
            'code': code,
            'name': info['name'],
            'count': count,
            'percentage': round(count / total * 100, 1),
            'color': info['color'],
        })

    # Top cities for bar chart
    top_cities = []
    for city, count in stats['cities'].most_common(20):
        top_cities.append({
            'city': city,
            'count': count,
        })

    # Collection systems for horizontal bar chart
    collection_systems = []
    for system, count in stats['collection_systems'].most_common(15):
        collection_systems.append({
            'system': system,
            'count': count,
        })

    # Wikidata types for treemap
    wikidata_types = []
    for type_name, count in stats['wikidata_types'].most_common(20):
        wikidata_types.append({
            'type': type_name,
            'count': count,
        })

    # Enrichment status - simplified to Enriched vs Not Enriched
    enriched_count = stats.get('enriched_count', 0)
    not_enriched_count = stats.get('not_enriched_count', 0)
    enrichment_status = [
        {
            'status': 'Enriched',
            'count': enriched_count,
            'percentage': round(enriched_count / total * 100, 1) if total > 0 else 0,
            'color': '#2ecc71',
        },
        {
            'status': 'Not Enriched',
            'count': not_enriched_count,
            'percentage': round(not_enriched_count / total * 100, 1) if total > 0 else 0,
            'color': '#e74c3c',
        },
    ]

    # Enrichment sources for pie chart
    enrichment_sources = []
    source_colors = {
        'Wikidata': '#3498db',
        'Google Maps': '#e74c3c',
        'ISIL': '#2ecc71',
        'ISIL (NA)': '#27ae60',
        'Website': '#9b59b6',
        'Museum Register': '#f39c12',
        'Web Claims': '#1abc9c',
    }
    for source, count in stats['enrichment_sources'].most_common():
        enrichment_sources.append({
            'source': source,
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
            'color': source_colors.get(source, '#9e9e9e'),
        })

    # Identifier coverage for bar chart
    identifier_coverage = []
    id_labels = {
        'has_coordinates': 'Coordinates',
        'has_wikipedia_nl': 'Wikipedia NL',
        'has_image': 'Image',
        'has_website': 'Website',
        'has_isil': 'ISIL Code',
    }
    for key, count in stats['identifiers'].items():
        identifier_coverage.append({
            'identifier': id_labels.get(key, key),
            'count': count,
            'percentage': round(count / total * 100, 1),
        })
    identifier_coverage.sort(key=lambda x: -x['count'])

    # Founding decades for line/area chart
    founding_timeline = []
    if stats['founding_decades']:
        min_decade = min(stats['founding_decades'].keys())
        max_decade = max(stats['founding_decades'].keys())
        for decade in range(min_decade, max_decade + 10, 10):
            founding_timeline.append({
                'decade': decade,
                'count': stats['founding_decades'].get(decade, 0),
            })

    # Province distribution for choropleth/cartogram
    province_data = []
    for province, count in sorted(stats['provinces'].items(), key=lambda x: -x[1]):
        # Get breakdown by institution type for this province
        type_breakdown = {}
        for inst_type, type_count in stats['provinces_by_type'][province].items():
            type_info = TYPE_INFO.get(inst_type, {'name': inst_type, 'color': '#9e9e9e'})
            type_breakdown[inst_type] = {
                'code': inst_type,
                'name': type_info['name'],
                'count': type_count,
                'color': type_info['color'],
            }

        province_data.append({
            'province': province,
            'count': count,
            'color': PROVINCE_COLORS.get(province, '#9e9e9e'),
            'types': type_breakdown,
        })

    # Google Maps coverage for bar chart
    google_maps_coverage = []
    gm_labels = {
        'has_rating': 'Rating',
        'has_photos': 'Photos',
        'has_reviews': 'Reviews',
        'has_opening_hours': 'Opening Hours',
        'has_street_view': 'Street View',
    }
    for key in ['has_rating', 'has_photos', 'has_reviews', 'has_opening_hours', 'has_street_view']:
        count = stats['google_maps'].get(key, 0)
        google_maps_coverage.append({
            'feature': gm_labels.get(key, key),
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
        })
    google_maps_coverage.sort(key=lambda x: -x['count'])

    # Google Maps status for summary
    gm_success = stats['google_maps'].get('status_success', 0)
    gm_not_found = stats['google_maps'].get('status_not_found', 0)

    # Rating distribution for histogram (binned by 0.5 increments)
    rating_bins = defaultdict(int)
    for item in stats['rating_distribution']:
        # Bin ratings to nearest 0.5
        binned = round(item['rating'] * 2) / 2
        rating_bins[binned] += 1

    rating_histogram = []
    for rating in [i / 2 for i in range(1, 11)]:  # 0.5 to 5.0
        rating_histogram.append({
            'rating': rating,
            'count': rating_bins.get(rating, 0),
        })

    # Bubble chart data: aggregate by institution type
    bubble_chart_data = []
    for type_code, type_stats in stats['type_rating_stats'].items():
        if type_stats['count'] > 0:
            avg_rating = sum(type_stats['ratings']) / len(type_stats['ratings'])
            avg_reviews = sum(type_stats['review_counts']) / len(type_stats['review_counts'])
            total_reviews = sum(type_stats['review_counts'])
            info = TYPE_INFO.get(type_code, {'name': type_code, 'color': '#9e9e9e'})
            bubble_chart_data.append({
                'type': type_code,
                'name': info['name'],
                'avg_rating': round(avg_rating, 2),
                'avg_reviews': round(avg_reviews, 1),
                'total_reviews': total_reviews,
                'count': type_stats['count'],
                'color': info['color'],
            })
    bubble_chart_data.sort(key=lambda x: -x['count'])

    # Sunburst data: Province -> Institution Type hierarchy
    sunburst_data = {
        'name': 'Netherlands',
        'children': []
    }
    for province, count in sorted(stats['provinces'].items(), key=lambda x: -x[1]):
        province_node = {
            'name': province,
            'color': PROVINCE_COLORS.get(province, '#9e9e9e'),
            'children': []
        }
        for inst_type, type_count in stats['provinces_by_type'][province].items():
            info = TYPE_INFO.get(inst_type, {'name': inst_type, 'color': '#9e9e9e'})
            province_node['children'].append({
                'name': info['name'],
                'code': inst_type,
                'value': type_count,
                'color': info['color'],
            })
        # Sort children by count
        province_node['children'].sort(key=lambda x: -x['value'])
        sunburst_data['children'].append(province_node)

    # Individual rating points for scatter plot (sample if too many)
    rating_scatter = stats['rating_distribution'][:500]  # Limit to 500 points

    # New enrichment sources coverage for bar chart
    new_sources_coverage = []
    new_source_labels = {
        'has_nan_isil': 'ISIL (NA - Archives)',
        'has_kb_isil': 'ISIL (KB - Libraries)',
        'has_museum_register': 'Museum Register',
        'has_ghcid': 'GHCID',
        'has_web_claims': 'Web Claims',
        'has_social_media': 'Social Media',
        'has_verified_name': 'Verified Name',
    }
    new_source_colors = {
        'has_nan_isil': '#27ae60',
        'has_kb_isil': '#16a085',
        'has_museum_register': '#f39c12',
        'has_ghcid': '#8e44ad',
        'has_web_claims': '#1abc9c',
        'has_social_media': '#3498db',
        'has_verified_name': '#2ecc71',
    }
    for key, count in stats['new_sources'].items():
        new_sources_coverage.append({
            'source': new_source_labels.get(key, key),
            'key': key,
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
            'color': new_source_colors.get(key, '#9e9e9e'),
        })
    new_sources_coverage.sort(key=lambda x: -x['count'])

    # Social media platforms breakdown
    social_media_data = []
    social_colors = {
        'Facebook': '#1877f2',
        'Twitter': '#1da1f2',
        'Instagram': '#e4405f',
        'Linkedin': '#0077b5',
        'Youtube': '#ff0000',
        'Tiktok': '#000000',
    }
    for platform, count in stats['social_media_platforms'].most_common():
        social_media_data.append({
            'platform': platform,
            'count': count,
            'color': social_colors.get(platform, '#9e9e9e'),
        })

    # Museum Register by province
    museum_register_by_province = []
    for province, count in stats['museum_register_provinces'].most_common():
        museum_register_by_province.append({
            'province': province,
            'count': count,
            'color': PROVINCE_COLORS.get(province, '#9e9e9e'),
        })

    # ============================================
    # Enrichment Certainty Chart Data
    # ============================================
    certainty_stats = stats.get('certainty', {})

    # Calculate NA ISIL and KB ISIL counts
    na_isil_high = len(certainty_stats.get('high_na_isil_confidence', []))
    na_isil_low = len(certainty_stats.get('low_na_isil_confidence', []))
    kb_isil_count = len(certainty_stats.get('has_kb_isil', []))

    # Summary counts for the stacked bar chart
    certainty_summary = [
        {
            'category': 'Name Verification',
            'high': len(certainty_stats.get('high_name_confidence', [])),
            'medium': len(certainty_stats.get('medium_name_confidence', [])),
            'low': len(certainty_stats.get('low_name_confidence', [])),
            'none': len(certainty_stats.get('no_name_confidence', [])),
        },
        {
            'category': 'ISIL (NA - Archives)',
            'high': na_isil_high,
            'low': na_isil_low,
            'none': total - na_isil_high - na_isil_low,
        },
        {
            'category': 'ISIL (KB - Libraries)',
            'authoritative': kb_isil_count,  # KB ISIL is authoritative (from source registry)
            'none': total - kb_isil_count,
        },
        {
            'category': 'Google Maps',
            'valid': gm_success - len(certainty_stats.get('google_maps_invalid', [])),
            'invalid': len(certainty_stats.get('google_maps_invalid', [])),
            'none': total - gm_success,
        },
    ]

    # Detailed lists for drill-down (limited to 100 items each for performance)
    certainty_details = {
        'google_maps_invalid': certainty_stats.get('google_maps_invalid', [])[:100],
        'low_name_confidence': certainty_stats.get('low_name_confidence', [])[:100],
        'medium_name_confidence': certainty_stats.get('medium_name_confidence', [])[:100],
        'low_na_isil_confidence': certainty_stats.get('low_na_isil_confidence', [])[:100],
        'has_kb_isil': certainty_stats.get('has_kb_isil', [])[:100],
    }

    # Color scheme for certainty levels
    certainty_colors = {
        'high': '#2ecc71',         # Green - high confidence
        'valid': '#2ecc71',        # Green - valid match
        'authoritative': '#27ae60', # Dark green - authoritative source
        'medium': '#f39c12',       # Orange - needs review
        'low': '#e74c3c',          # Red - doubtful
        'invalid': '#e74c3c',      # Red - wrong entity
        'none': '#9e9e9e',         # Gray - not available
    }

    return {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'total_entries': total,
        'summary': {
            'total_institutions': total,
            'enriched': enriched_count,
            'not_enriched': not_enriched_count,
            'with_coordinates': stats['identifiers']['has_coordinates'],
            'with_wikidata': stats['enrichment_sources'].get('Wikidata', 0),
            'with_google_maps': gm_success,
            'google_maps_not_found': gm_not_found,
            'unique_cities': len(stats['cities']),
            'unique_provinces': len(stats['provinces']),
            'institution_types': len(stats['institution_types']),
            # New enrichment source counts
            'with_nan_isil': stats['new_sources']['has_nan_isil'],
            'with_kb_isil': stats['new_sources'].get('has_kb_isil', 0),
            'with_museum_register': stats['new_sources']['has_museum_register'],
            'with_ghcid': stats['new_sources']['has_ghcid'],
            'with_web_claims': stats['new_sources']['has_web_claims'],
            'with_social_media': stats['new_sources']['has_social_media'],
            'with_verified_name': stats['new_sources']['has_verified_name'],
            # Certainty counts
            'google_maps_invalid': len(certainty_stats.get('google_maps_invalid', [])),
            'low_name_confidence': len(certainty_stats.get('low_name_confidence', [])),
            'medium_name_confidence': len(certainty_stats.get('medium_name_confidence', [])),
            'high_name_confidence': len(certainty_stats.get('high_name_confidence', [])),
            'low_na_isil_confidence': na_isil_low,
            'high_na_isil_confidence': na_isil_high,
            'has_kb_isil': kb_isil_count,
        },
        'charts': {
            'institution_types': type_data,
            'top_cities': top_cities,
            'collection_systems': collection_systems,
            'wikidata_types': wikidata_types,
            'enrichment_status': enrichment_status,
            'enrichment_sources': enrichment_sources,
            'identifier_coverage': identifier_coverage,
            'google_maps_coverage': google_maps_coverage,
            'founding_timeline': founding_timeline,
            'provinces': province_data,
            'rating_histogram': rating_histogram,
            'bubble_chart': bubble_chart_data,
            'sunburst': sunburst_data,
            'rating_scatter': rating_scatter,
            # New enrichment source charts
            'new_sources_coverage': new_sources_coverage,
            'social_media_platforms': social_media_data,
            'museum_register_by_province': museum_register_by_province,
            # Enrichment certainty charts
            'enrichment_certainty': {
                'summary': certainty_summary,
                'details': certainty_details,
                'colors': certainty_colors,
            },
        }
    }


def main():
    """Main export function."""

    # Paths
    enriched_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries'
    output_dir = project_root / 'frontend' / 'public' / 'data'
    output_file = output_dir / 'nde_statistics.json'

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing enriched entries from: {enriched_dir}")

    # Collect statistics
    stats = process_enriched_files(enriched_dir)

    # Format for D3.js
    d3_data = format_for_d3(stats)

    # Write JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(d3_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Statistics export complete!")
    print(f"   Total entries: {stats['total_entries']}")
    print(f"   Output file: {output_file}")

    # Print summary
    print(f"\n📊 Summary:")
    print(f"   Institution types: {len(stats['institution_types'])}")
    print(f"   Unique cities: {len(stats['cities'])}")
    print(f"   Provinces with data: {len(stats['provinces'])}")
    print(f"   Collection systems: {len(stats['collection_systems'])}")
    print(f"   Wikidata types: {len(stats['wikidata_types'])}")

    # Print new enrichment sources
    total = stats['total_entries']
    print(f"\n🆕 New Enrichment Sources:")
    print(f"   ISIL (NA - Archives): {stats['new_sources']['has_nan_isil']} ({stats['new_sources']['has_nan_isil']/total*100:.1f}%)")
    print(f"   ISIL (KB - Libraries): {stats['new_sources'].get('has_kb_isil', 0)} ({stats['new_sources'].get('has_kb_isil', 0)/total*100:.1f}%)")
    print(f"   Museum Register: {stats['new_sources']['has_museum_register']} ({stats['new_sources']['has_museum_register']/total*100:.1f}%)")
    print(f"   GHCID: {stats['new_sources']['has_ghcid']} ({stats['new_sources']['has_ghcid']/total*100:.1f}%)")
    print(f"   Web Claims: {stats['new_sources']['has_web_claims']} ({stats['new_sources']['has_web_claims']/total*100:.1f}%)")
    print(f"   Social Media: {stats['new_sources']['has_social_media']} ({stats['new_sources']['has_social_media']/total*100:.1f}%)")
    print(f"   Verified Name: {stats['new_sources']['has_verified_name']} ({stats['new_sources']['has_verified_name']/total*100:.1f}%)")

    # Print social media breakdown
    if stats['social_media_platforms']:
        print(f"\n📱 Social Media Platforms:")
        for platform, count in stats['social_media_platforms'].most_common():
            print(f"   {platform}: {count}")

    # Print province breakdown
    if stats['provinces']:
        print(f"\n🗺️  Province distribution:")
        for province, count in stats['provinces'].most_common():
            print(f"   {province}: {count}")


if __name__ == '__main__':
    main()