#!/usr/bin/env python3
"""
Export NDE Statistics to JSON for Frontend Visualizations

Reads the enriched YAML files and produces a comprehensive statistics JSON
suitable for D3.js visualizations in the React frontend.
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from collections import Counter, defaultdict
import sys

# Add project root to path for imports
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    import yaml
except ImportError:
    print("Error: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# Netherlands Province bounding boxes (approximate) for coordinate lookup
# Format: (min_lat, max_lat, min_lon, max_lon)
PROVINCE_BOUNDS = {
    'Groningen': (53.05, 53.55, 6.15, 7.25),
    'Friesland': (52.85, 53.50, 5.05, 6.35),
    'Drenthe': (52.65, 53.15, 6.15, 7.10),
    'Overijssel': (52.15, 52.85, 5.75, 7.10),
    'Flevoland': (52.25, 52.75, 5.15, 6.00),
    'Gelderland': (51.75, 52.55, 5.05, 6.85),
    'Utrecht': (51.95, 52.35, 4.75, 5.65),
    'Noord-Holland': (52.25, 53.00, 4.50, 5.35),
    'Zuid-Holland': (51.65, 52.35, 3.85, 5.00),
    'Zeeland': (51.20, 51.75, 3.35, 4.30),
    'Noord-Brabant': (51.25, 51.85, 4.35, 6.05),
    'Limburg': (50.75, 51.80, 5.55, 6.25),
}

# Province colors for visualization
PROVINCE_COLORS = {
    'Groningen': '#1f77b4',
    'Friesland': '#ff7f0e',
    'Drenthe': '#2ca02c',
    'Overijssel': '#d62728',
    'Flevoland': '#9467bd',
    'Gelderland': '#8c564b',
    'Utrecht': '#e377c2',
    'Noord-Holland': '#7f7f7f',
    'Zuid-Holland': '#bcbd22',
    'Zeeland': '#17becf',
    'Noord-Brabant': '#aec7e8',
    'Limburg': '#ffbb78',
}


def get_province_from_coords(lat: float | None, lon: float | None) -> str | None:
    """Determine the Dutch province from coordinates using bounding box lookup.

    Args:
        lat: Latitude in decimal degrees, or None when unknown.
        lon: Longitude in decimal degrees, or None when unknown.

    Returns:
        The province name, or None when the coordinates are missing or lie
        outside the Netherlands.
    """
    # BUG FIX: the original `if not lat or not lon` also rejected a
    # legitimate 0.0 coordinate; only missing values should be rejected.
    if lat is None or lon is None:
        return None
    for province, (min_lat, max_lat, min_lon, max_lon) in PROVINCE_BOUNDS.items():
        if min_lat <= lat <= max_lat and min_lon <= lon <= max_lon:
            return province
    # Fallback for edge cases - check if coordinates are in Netherlands at all
    if 50.7 <= lat <= 53.6 and 3.3 <= lon <= 7.3:
        # In Netherlands but didn't match bounds - find closest province
        # (Euclidean distance to the bounding-box center; fine at this scale)
        best_province = None
        best_distance = float('inf')
        for province, (min_lat, max_lat, min_lon, max_lon) in PROVINCE_BOUNDS.items():
            center_lat = (min_lat + max_lat) / 2
            center_lon = (min_lon + max_lon) / 2
            distance = ((lat - center_lat) ** 2 + (lon - center_lon) ** 2) ** 0.5
            if distance < best_distance:
                best_distance = distance
                best_province = province
        return best_province
    return None


# Institution type mappings (single-letter code -> display name and chart color)
TYPE_INFO = {
    'G': {'name': 'Gallery', 'color': '#00bcd4'},
    'L': {'name': 'Library', 'color': '#2ecc71'},
    'A': {'name': 'Archive', 'color': '#3498db'},
    'M': {'name': 'Museum', 'color': '#e74c3c'},
    'O': {'name': 'Official', 'color': '#f39c12'},
    'R': {'name': 'Research', 'color': '#1abc9c'},
    'C': {'name': 'Corporation', 'color': '#795548'},
    'U': {'name': 'Unknown', 'color': '#9e9e9e'},
    'B': {'name': 'Botanical', 'color': '#4caf50'},
    'E': {'name': 'Education', 'color': '#ff9800'},
    'S': {'name': 'Society', 'color': '#9b59b6'},
    'F': {'name': 'Features', 'color': '#95a5a6'},
    'I': {'name': 'Intangible', 'color': '#673ab7'},
    'X': {'name': 'Mixed', 'color': '#607d8b'},
    'P': {'name': 'Personal', 'color': '#ff5722'},
    'H': {'name': 'Holy sites', 'color': '#607d8b'},
    'D': {'name': 'Digital', 'color': '#34495e'},
    'N': {'name': 'NGO', 'color': '#e91e63'},
    'T': {'name': 'Taste/smell', 'color': '#ff5722'},
}


def process_enriched_files(enriched_dir: Path) -> dict:
    """Process all enriched YAML files and collect statistics.

    Args:
        enriched_dir: Directory containing one ``*.yaml`` file per institution.

    Returns:
        A stats dict of Counters/lists keyed by metric; consumed by
        :func:`format_for_d3`. Entries that fail to parse are skipped with a
        warning rather than aborting the run.
    """
    stats = {
        'total_entries': 0,
        'enrichment_status': Counter(),
        'enrichment_sources': Counter(),  # Track which sources enriched each entry
        'institution_types': Counter(),
        'cities': Counter(),
        'provinces': Counter(),
        'provinces_by_type': defaultdict(lambda: Counter()),  # province -> {type: count}
        'collection_systems': Counter(),
        'wikidata_types': Counter(),
        'identifiers': {
            'has_coordinates': 0,
            'has_isil': 0,
            'has_wikipedia_nl': 0,
            'has_image': 0,
            'has_website': 0,
        },
        'google_maps': {
            'has_rating': 0,
            'has_photos': 0,
            'has_reviews': 0,
            'has_opening_hours': 0,
            'has_street_view': 0,
            'status_success': 0,
            'status_not_found': 0,
        },
        # New enrichment source tracking
        'new_sources': {
            'has_nan_isil': 0,
            'has_kb_isil': 0,  # pre-initialized for consistency (was lazily created)
            'has_museum_register': 0,
            'has_ghcid': 0,
            'has_web_claims': 0,
            'has_social_media': 0,
            'has_verified_name': 0,
        },
        'founding_decades': Counter(),
        'enriched_count': 0,
        'not_enriched_count': 0,
        # Rating distribution for histogram; list of per-institution dicts
        'rating_distribution': [],
        # Type aggregates for bubble chart
        'type_rating_stats': defaultdict(lambda: {'ratings': [], 'review_counts': [], 'count': 0}),
        # Museum register by province
        'museum_register_provinces': Counter(),
        # Social media platforms
        'social_media_platforms': Counter(),
        # Enrichment certainty tracking
        'certainty': {
            'google_maps_invalid': [],      # Entries where Google Maps found wrong entity
            'low_name_confidence': [],      # custodian_name.confidence < 0.5
            'medium_name_confidence': [],   # 0.5 <= custodian_name.confidence < 0.8
            'high_name_confidence': [],     # custodian_name.confidence >= 0.8
            # NA ISIL (Nationaal Archief) - for archives
            'low_na_isil_confidence': [],   # nan_isil_enrichment.match_confidence < 0.8
            'high_na_isil_confidence': [],  # nan_isil_enrichment.match_confidence >= 0.8
            # KB ISIL (Koninklijke Bibliotheek) - for libraries
            'has_kb_isil': [],              # KB ISIL is authoritative, no confidence needed
            'no_name_confidence': [],       # No custodian_name verification
        },
    }

    yaml_files = sorted(enriched_dir.glob('*.yaml'))

    for yaml_file in yaml_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            # Empty or scalar YAML documents cannot be processed; skip them
            # explicitly instead of crashing on the first attribute access.
            if not isinstance(entry, dict):
                print(f"Warning: Error processing {yaml_file.name}: not a mapping")
                continue

            stats['total_entries'] += 1

            # Track enrichment sources for this entry
            entry_sources = []

            # Enrichment status (legacy)
            status = entry.get('enrichment_status', 'unknown')
            stats['enrichment_status'][status] += 1

            # Original entry data.  `or {}` everywhere below guards against
            # explicit YAML nulls, which `.get(key, {})` does NOT catch.
            original = entry.get('original_entry') or {}

            # Institution type (assumes a list of one-letter codes; first wins
            # — TODO confirm `type` is never a plain string in the source data)
            types = original.get('type', [])
            if types:
                stats['institution_types'][types[0]] += 1

            # City
            city = original.get('plaatsnaam_bezoekadres', '')
            if city:
                stats['cities'][city] += 1

            # Collection system
            system = original.get('collectiebeheersysteem', '')
            if system and system.strip():
                stats['collection_systems'][system] += 1

            # Check for ISIL in original data
            if original.get('isil-code_na') or original.get('isil_code'):
                entry_sources.append('ISIL')
                stats['identifiers']['has_isil'] += 1

            # Check for website in original data
            if original.get('webadres_organisatie'):
                entry_sources.append('Website')

            # Wikidata enrichment
            enrichment = entry.get('wikidata_enrichment') or {}
            if enrichment.get('wikidata_entity_id'):
                entry_sources.append('Wikidata')

            # Coordinates and Province (from Wikidata)
            coords = enrichment.get('wikidata_coordinates') or {}
            inst_type = types[0] if types else 'U'
            province = None  # Initialize province for later use
            if coords and coords.get('latitude'):
                stats['identifiers']['has_coordinates'] += 1
                # Determine province from coordinates
                lat = coords.get('latitude')
                lon = coords.get('longitude')
                province = get_province_from_coords(lat, lon)
                if province:
                    stats['provinces'][province] += 1
                    stats['provinces_by_type'][province][inst_type] += 1

            # Website from Wikidata
            if enrichment.get('wikidata_official_website'):
                stats['identifiers']['has_website'] += 1

            # Image
            if enrichment.get('wikidata_image'):
                stats['identifiers']['has_image'] += 1

            # Wikipedia NL
            sitelinks = enrichment.get('wikidata_sitelinks') or {}
            if 'nlwiki' in sitelinks:
                stats['identifiers']['has_wikipedia_nl'] += 1

            # ISIL from Wikidata (deduplicated against original-entry ISIL)
            wd_identifiers = enrichment.get('wikidata_identifiers') or {}
            if 'isil' in wd_identifiers:
                if 'ISIL' not in entry_sources:
                    entry_sources.append('ISIL')
                    stats['identifiers']['has_isil'] += 1

            # Wikidata instance types
            instance_of = enrichment.get('wikidata_instance_of') or []
            for inst in instance_of:
                label = inst.get('label_en', inst.get('label_nl', 'Unknown'))
                stats['wikidata_types'][label] += 1

            # Google Maps enrichment
            google = entry.get('google_maps_enrichment') or {}
            if google:
                # Check API status or presence of place_id
                api_status = google.get('api_status', '')
                if api_status == 'OK' or google.get('place_id'):
                    stats['google_maps']['status_success'] += 1
                    entry_sources.append('Google Maps')
                elif api_status == 'NOT_FOUND':
                    stats['google_maps']['status_not_found'] += 1

                if google.get('rating'):
                    stats['google_maps']['has_rating'] += 1
                    # Collect rating data for histogram and bubble chart
                    rating = google.get('rating')
                    review_count = google.get('total_ratings', 0) or 0
                    # Get city and province for scatter plot filtering
                    city = original.get('plaats', original.get('city', ''))
                    prov = province if province else ''
                    # Get GHCID for linking to map
                    ghcid_uuid = ''
                    ghcid_current = ''
                    ghcid_entry = entry.get('ghcid', {})
                    if isinstance(ghcid_entry, dict) and ghcid_entry:
                        ghcid_uuid = ghcid_entry.get('ghcid_uuid', '')
                        ghcid_current = ghcid_entry.get('ghcid_current', '')
                    stats['rating_distribution'].append({
                        'rating': rating,
                        'reviews': review_count,
                        'type': inst_type,
                        'name': original.get('naam_organisatie', yaml_file.stem),
                        'city': city,
                        'province': prov,
                        'ghcid_uuid': ghcid_uuid,
                        'ghcid_current': ghcid_current,
                    })
                    # Aggregate by type for bubble chart
                    stats['type_rating_stats'][inst_type]['ratings'].append(rating)
                    stats['type_rating_stats'][inst_type]['review_counts'].append(review_count)
                    stats['type_rating_stats'][inst_type]['count'] += 1

                # Check both 'photos' and 'photo_urls' fields
                if google.get('photos') or google.get('photo_urls'):
                    stats['google_maps']['has_photos'] += 1
                if google.get('reviews'):
                    stats['google_maps']['has_reviews'] += 1
                if google.get('opening_hours'):
                    stats['google_maps']['has_opening_hours'] += 1
                # Check for coordinates (lat/lon or coordinates dict)
                if google.get('coordinates') or google.get('latitude'):
                    stats['google_maps']['has_street_view'] += 1
                    # Also count as having coordinates if we don't have Wikidata coords
                    if not (coords and coords.get('latitude')):
                        gm_lat = google.get('latitude') or (google.get('coordinates', {}) or {}).get('latitude')
                        gm_lon = google.get('longitude') or (google.get('coordinates', {}) or {}).get('longitude')
                        if gm_lat and gm_lon:
                            stats['identifiers']['has_coordinates'] += 1
                            province = get_province_from_coords(gm_lat, gm_lon)
                            if province:
                                stats['provinces'][province] += 1
                                stats['provinces_by_type'][province][inst_type] += 1

            # Track new enrichment sources
            nan_isil = entry.get('nan_isil_enrichment', {})
            if isinstance(nan_isil, dict) and nan_isil.get('isil_code'):
                stats['new_sources']['has_nan_isil'] += 1
                if 'ISIL (NA)' not in entry_sources:
                    entry_sources.append('ISIL (NA)')

            museum_register = entry.get('museum_register_enrichment', {})
            if isinstance(museum_register, dict) and museum_register.get('museum_name'):
                stats['new_sources']['has_museum_register'] += 1
                if 'Museum Register' not in entry_sources:
                    entry_sources.append('Museum Register')
                mr_province = museum_register.get('province', '')
                if mr_province:
                    stats['museum_register_provinces'][mr_province] += 1

            ghcid_data = entry.get('ghcid', {})
            if isinstance(ghcid_data, dict) and ghcid_data.get('ghcid_current'):
                stats['new_sources']['has_ghcid'] += 1

            web_claims = entry.get('web_claims', {})
            if isinstance(web_claims, dict) and web_claims.get('claims'):
                stats['new_sources']['has_web_claims'] += 1
                entry_sources.append('Web Claims')
                # Track social media platforms (one count per claim)
                for claim in web_claims.get('claims', []):
                    claim_type = claim.get('claim_type', '')
                    if claim_type.startswith('social_'):
                        platform = claim_type.replace('social_', '').capitalize()
                        stats['social_media_platforms'][platform] += 1
                # Only count once per institution (counter pre-initialized above)
                if any(c.get('claim_type', '').startswith('social_') for c in web_claims.get('claims', [])):
                    stats['new_sources']['has_social_media'] += 1

            custodian_name = entry.get('custodian_name', {})
            if isinstance(custodian_name, dict) and custodian_name.get('claim_value'):
                stats['new_sources']['has_verified_name'] += 1

            # ============================================
            # Enrichment Certainty Tracking
            # ============================================

            # Build entry info for linking to map
            entry_info = {
                'name': original.get('organisatie', original.get('naam_organisatie', yaml_file.stem)),
                'ghcid_uuid': ghcid_data.get('ghcid_uuid', '') if isinstance(ghcid_data, dict) else '',
                'ghcid_current': ghcid_data.get('ghcid_current', '') if isinstance(ghcid_data, dict) else '',
                'type': inst_type,
                'city': original.get('plaatsnaam_bezoekadres', original.get('plaats', '')),
                'file': yaml_file.stem,
            }

            # Track Google Maps invalid matches
            if entry.get('google_maps_match_invalid'):
                entry_info['reason'] = entry.get('google_maps_match_invalid_reason', 'Google Maps found wrong entity')
                entry_info['google_maps_name'] = google.get('name', '') if google else ''
                stats['certainty']['google_maps_invalid'].append(entry_info.copy())

            # Track name confidence levels
            name_conf = custodian_name.get('confidence') if isinstance(custodian_name, dict) else None
            if name_conf is not None:
                entry_info['confidence'] = name_conf
                if name_conf < 0.5:
                    stats['certainty']['low_name_confidence'].append(entry_info.copy())
                elif name_conf < 0.8:
                    stats['certainty']['medium_name_confidence'].append(entry_info.copy())
                else:
                    stats['certainty']['high_name_confidence'].append(entry_info.copy())
            else:
                stats['certainty']['no_name_confidence'].append(entry_info.copy())

            # Track NA ISIL match confidence (Nationaal Archief - archives)
            na_isil_conf = nan_isil.get('match_confidence') if isinstance(nan_isil, dict) else None
            if na_isil_conf is not None:
                entry_info['isil_confidence'] = na_isil_conf
                entry_info['isil_code'] = nan_isil.get('isil_code', '')
                entry_info['isil_source'] = 'NA'
                if na_isil_conf < 0.8:
                    stats['certainty']['low_na_isil_confidence'].append(entry_info.copy())
                else:
                    stats['certainty']['high_na_isil_confidence'].append(entry_info.copy())

            # Track KB ISIL (Koninklijke Bibliotheek - libraries)
            kb_enrichment = entry.get('kb_enrichment', {})
            if isinstance(kb_enrichment, dict) and kb_enrichment.get('isil_code'):
                entry_info['isil_code'] = kb_enrichment.get('isil_code', '')
                entry_info['isil_source'] = 'KB'
                entry_info['registry'] = kb_enrichment.get('registry', 'KB Netherlands Library Network')
                stats['certainty']['has_kb_isil'].append(entry_info.copy())
                # Also track in new_sources (counter pre-initialized above)
                stats['new_sources']['has_kb_isil'] += 1
                if 'ISIL (KB)' not in entry_sources:
                    entry_sources.append('ISIL (KB)')

            # Count enrichment sources
            for source in entry_sources:
                stats['enrichment_sources'][source] += 1

            # Determine if entry is enriched (has any external data source)
            if entry_sources:
                stats['enriched_count'] += 1
            else:
                stats['not_enriched_count'] += 1

            # Founding date / inception
            inception = enrichment.get('wikidata_inception') or {}
            if inception and inception.get('time'):
                time_str = inception['time']
                try:
                    # Extract year from time string like "+1815-00-00T00:00:00Z"
                    year = int(time_str[1:5])
                    decade = (year // 10) * 10
                    stats['founding_decades'][decade] += 1
                except (ValueError, IndexError):
                    pass

        except Exception as e:
            # Best-effort batch processing: report and keep going.
            print(f"Warning: Error processing {yaml_file.name}: {e}")
            continue

    return stats


def format_for_d3(stats: dict) -> dict:
    """Format statistics for D3.js visualizations.

    Args:
        stats: The stats dict produced by :func:`process_enriched_files`.

    Returns:
        A JSON-serializable dict with ``generated_at``, ``total_entries``,
        ``summary`` and ``charts`` keys, consumed by the React frontend.
    """
    total = stats['total_entries']

    # Institution types for pie/donut chart
    type_data = []
    for code, count in sorted(stats['institution_types'].items(), key=lambda x: -x[1]):
        info = TYPE_INFO.get(code, {'name': code, 'color': '#9e9e9e'})
        type_data.append({
            'code': code,
            'name': info['name'],
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
            'color': info['color'],
        })

    # Top cities for bar chart
    top_cities = []
    for city, count in stats['cities'].most_common(20):
        top_cities.append({
            'city': city,
            'count': count,
        })

    # Collection systems for horizontal bar chart
    collection_systems = []
    for system, count in stats['collection_systems'].most_common(15):
        collection_systems.append({
            'system': system,
            'count': count,
        })

    # Wikidata types for treemap
    wikidata_types = []
    for type_name, count in stats['wikidata_types'].most_common(20):
        wikidata_types.append({
            'type': type_name,
            'count': count,
        })

    # Enrichment status - simplified to Enriched vs Not Enriched
    enriched_count = stats.get('enriched_count', 0)
    not_enriched_count = stats.get('not_enriched_count', 0)
    enrichment_status = [
        {
            'status': 'Enriched',
            'count': enriched_count,
            'percentage': round(enriched_count / total * 100, 1) if total > 0 else 0,
            'color': '#2ecc71',
        },
        {
            'status': 'Not Enriched',
            'count': not_enriched_count,
            'percentage': round(not_enriched_count / total * 100, 1) if total > 0 else 0,
            'color': '#e74c3c',
        },
    ]

    # Enrichment sources for pie chart
    enrichment_sources = []
    source_colors = {
        'Wikidata': '#3498db',
        'Google Maps': '#e74c3c',
        'ISIL': '#2ecc71',
        'ISIL (NA)': '#27ae60',
        'Website': '#9b59b6',
        'Museum Register': '#f39c12',
        'Web Claims': '#1abc9c',
    }
    for source, count in stats['enrichment_sources'].most_common():
        enrichment_sources.append({
            'source': source,
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
            'color': source_colors.get(source, '#9e9e9e'),
        })

    # Identifier coverage for bar chart
    # BUG FIX: this loop always runs (fixed identifiers dict), so the
    # percentage needs the same total > 0 guard the other charts already have.
    identifier_coverage = []
    id_labels = {
        'has_coordinates': 'Coordinates',
        'has_wikipedia_nl': 'Wikipedia NL',
        'has_image': 'Image',
        'has_website': 'Website',
        'has_isil': 'ISIL Code',
    }
    for key, count in stats['identifiers'].items():
        identifier_coverage.append({
            'identifier': id_labels.get(key, key),
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
        })
    identifier_coverage.sort(key=lambda x: -x['count'])

    # Founding decades for line/area chart (fill gaps with 0 so the axis is continuous)
    founding_timeline = []
    if stats['founding_decades']:
        min_decade = min(stats['founding_decades'].keys())
        max_decade = max(stats['founding_decades'].keys())
        for decade in range(min_decade, max_decade + 10, 10):
            founding_timeline.append({
                'decade': decade,
                'count': stats['founding_decades'].get(decade, 0),
            })

    # Province distribution for choropleth/cartogram
    province_data = []
    for province, count in sorted(stats['provinces'].items(), key=lambda x: -x[1]):
        # Get breakdown by institution type for this province
        type_breakdown = {}
        for inst_type, type_count in stats['provinces_by_type'][province].items():
            type_info = TYPE_INFO.get(inst_type, {'name': inst_type, 'color': '#9e9e9e'})
            type_breakdown[inst_type] = {
                'code': inst_type,
                'name': type_info['name'],
                'count': type_count,
                'color': type_info['color'],
            }
        province_data.append({
            'province': province,
            'count': count,
            'color': PROVINCE_COLORS.get(province, '#9e9e9e'),
            'types': type_breakdown,
        })

    # Google Maps coverage for bar chart
    google_maps_coverage = []
    gm_labels = {
        'has_rating': 'Rating',
        'has_photos': 'Photos',
        'has_reviews': 'Reviews',
        'has_opening_hours': 'Opening Hours',
        'has_street_view': 'Street View',
    }
    for key in ['has_rating', 'has_photos', 'has_reviews', 'has_opening_hours', 'has_street_view']:
        count = stats['google_maps'].get(key, 0)
        google_maps_coverage.append({
            'feature': gm_labels.get(key, key),
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
        })
    google_maps_coverage.sort(key=lambda x: -x['count'])

    # Google Maps status for summary
    gm_success = stats['google_maps'].get('status_success', 0)
    gm_not_found = stats['google_maps'].get('status_not_found', 0)

    # Rating distribution for histogram (binned by 0.5 increments)
    rating_bins = defaultdict(int)
    for item in stats['rating_distribution']:
        # Bin ratings to nearest 0.5
        binned = round(item['rating'] * 2) / 2
        rating_bins[binned] += 1
    rating_histogram = []
    for rating in [i / 2 for i in range(1, 11)]:  # 0.5 to 5.0
        rating_histogram.append({
            'rating': rating,
            'count': rating_bins.get(rating, 0),
        })

    # Bubble chart data: aggregate by institution type
    bubble_chart_data = []
    for type_code, type_stats in stats['type_rating_stats'].items():
        if type_stats['count'] > 0:
            avg_rating = sum(type_stats['ratings']) / len(type_stats['ratings'])
            avg_reviews = sum(type_stats['review_counts']) / len(type_stats['review_counts'])
            total_reviews = sum(type_stats['review_counts'])
            info = TYPE_INFO.get(type_code, {'name': type_code, 'color': '#9e9e9e'})
            bubble_chart_data.append({
                'type': type_code,
                'name': info['name'],
                'avg_rating': round(avg_rating, 2),
                'avg_reviews': round(avg_reviews, 1),
                'total_reviews': total_reviews,
                'count': type_stats['count'],
                'color': info['color'],
            })
    bubble_chart_data.sort(key=lambda x: -x['count'])

    # Sunburst data: Province -> Institution Type hierarchy
    sunburst_data = {
        'name': 'Netherlands',
        'children': []
    }
    for province, count in sorted(stats['provinces'].items(), key=lambda x: -x[1]):
        province_node = {
            'name': province,
            'color': PROVINCE_COLORS.get(province, '#9e9e9e'),
            'children': []
        }
        for inst_type, type_count in stats['provinces_by_type'][province].items():
            info = TYPE_INFO.get(inst_type, {'name': inst_type, 'color': '#9e9e9e'})
            province_node['children'].append({
                'name': info['name'],
                'code': inst_type,
                'value': type_count,
                'color': info['color'],
            })
        # Sort children by count
        province_node['children'].sort(key=lambda x: -x['value'])
        sunburst_data['children'].append(province_node)

    # Individual rating points for scatter plot (sample if too many)
    rating_scatter = stats['rating_distribution'][:500]  # Limit to 500 points

    # New enrichment sources coverage for bar chart
    new_sources_coverage = []
    new_source_labels = {
        'has_nan_isil': 'ISIL (NA - Archives)',
        'has_kb_isil': 'ISIL (KB - Libraries)',
        'has_museum_register': 'Museum Register',
        'has_ghcid': 'GHCID',
        'has_web_claims': 'Web Claims',
        'has_social_media': 'Social Media',
        'has_verified_name': 'Verified Name',
    }
    new_source_colors = {
        'has_nan_isil': '#27ae60',
        'has_kb_isil': '#16a085',
        'has_museum_register': '#f39c12',
        'has_ghcid': '#8e44ad',
        'has_web_claims': '#1abc9c',
        'has_social_media': '#3498db',
        'has_verified_name': '#2ecc71',
    }
    for key, count in stats['new_sources'].items():
        new_sources_coverage.append({
            'source': new_source_labels.get(key, key),
            'key': key,
            'count': count,
            'percentage': round(count / total * 100, 1) if total > 0 else 0,
            'color': new_source_colors.get(key, '#9e9e9e'),
        })
    new_sources_coverage.sort(key=lambda x: -x['count'])

    # Social media platforms breakdown
    social_media_data = []
    social_colors = {
        'Facebook': '#1877f2',
        'Twitter': '#1da1f2',
        'Instagram': '#e4405f',
        'Linkedin': '#0077b5',
        'Youtube': '#ff0000',
        'Tiktok': '#000000',
    }
    for platform, count in stats['social_media_platforms'].most_common():
        social_media_data.append({
            'platform': platform,
            'count': count,
            'color': social_colors.get(platform, '#9e9e9e'),
        })

    # Museum Register by province
    museum_register_by_province = []
    for province, count in stats['museum_register_provinces'].most_common():
        museum_register_by_province.append({
            'province': province,
            'count': count,
            'color': PROVINCE_COLORS.get(province, '#9e9e9e'),
        })

    # ============================================
    # Enrichment Certainty Chart Data
    # ============================================
    certainty_stats = stats.get('certainty', {})

    # Calculate NA ISIL and KB ISIL counts
    na_isil_high = len(certainty_stats.get('high_na_isil_confidence', []))
    na_isil_low = len(certainty_stats.get('low_na_isil_confidence', []))
    kb_isil_count = len(certainty_stats.get('has_kb_isil', []))

    # Summary counts for the stacked bar chart
    certainty_summary = [
        {
            'category': 'Name Verification',
            'high': len(certainty_stats.get('high_name_confidence', [])),
            'medium': len(certainty_stats.get('medium_name_confidence', [])),
            'low': len(certainty_stats.get('low_name_confidence', [])),
            'none': len(certainty_stats.get('no_name_confidence', [])),
        },
        {
            'category': 'ISIL (NA - Archives)',
            'high': na_isil_high,
            'low': na_isil_low,
            'none': total - na_isil_high - na_isil_low,
        },
        {
            'category': 'ISIL (KB - Libraries)',
            'authoritative': kb_isil_count,  # KB ISIL is authoritative (from source registry)
            'none': total - kb_isil_count,
        },
        {
            'category': 'Google Maps',
            'valid': gm_success - len(certainty_stats.get('google_maps_invalid', [])),
            'invalid': len(certainty_stats.get('google_maps_invalid', [])),
            'none': total - gm_success,
        },
    ]

    # Detailed lists for drill-down (limited to 100 items each for performance)
    certainty_details = {
        'google_maps_invalid': certainty_stats.get('google_maps_invalid', [])[:100],
        'low_name_confidence': certainty_stats.get('low_name_confidence', [])[:100],
        'medium_name_confidence': certainty_stats.get('medium_name_confidence', [])[:100],
        'low_na_isil_confidence': certainty_stats.get('low_na_isil_confidence', [])[:100],
        'has_kb_isil': certainty_stats.get('has_kb_isil', [])[:100],
    }

    # Color scheme for certainty levels
    certainty_colors = {
        'high': '#2ecc71',          # Green - high confidence
        'valid': '#2ecc71',         # Green - valid match
        'authoritative': '#27ae60', # Dark green - authoritative source
        'medium': '#f39c12',        # Orange - needs review
        'low': '#e74c3c',           # Red - doubtful
        'invalid': '#e74c3c',       # Red - wrong entity
        'none': '#9e9e9e',          # Gray - not available
    }

    return {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'total_entries': total,
        'summary': {
            'total_institutions': total,
            'enriched': enriched_count,
            'not_enriched': not_enriched_count,
            'with_coordinates': stats['identifiers']['has_coordinates'],
            'with_wikidata': stats['enrichment_sources'].get('Wikidata', 0),
            'with_google_maps': gm_success,
            'google_maps_not_found': gm_not_found,
            'unique_cities': len(stats['cities']),
            'unique_provinces': len(stats['provinces']),
            'institution_types': len(stats['institution_types']),
            # New enrichment source counts
            'with_nan_isil': stats['new_sources']['has_nan_isil'],
            'with_kb_isil': stats['new_sources'].get('has_kb_isil', 0),
            'with_museum_register': stats['new_sources']['has_museum_register'],
            'with_ghcid': stats['new_sources']['has_ghcid'],
            'with_web_claims': stats['new_sources']['has_web_claims'],
            'with_social_media': stats['new_sources']['has_social_media'],
            'with_verified_name': stats['new_sources']['has_verified_name'],
            # Certainty counts
            'google_maps_invalid': len(certainty_stats.get('google_maps_invalid', [])),
            'low_name_confidence': len(certainty_stats.get('low_name_confidence', [])),
            'medium_name_confidence': len(certainty_stats.get('medium_name_confidence', [])),
            'high_name_confidence': len(certainty_stats.get('high_name_confidence', [])),
            'low_na_isil_confidence': na_isil_low,
            'high_na_isil_confidence': na_isil_high,
            'has_kb_isil': kb_isil_count,
        },
        'charts': {
            'institution_types': type_data,
            'top_cities': top_cities,
            'collection_systems': collection_systems,
            'wikidata_types': wikidata_types,
            'enrichment_status': enrichment_status,
            'enrichment_sources': enrichment_sources,
            'identifier_coverage': identifier_coverage,
            'google_maps_coverage': google_maps_coverage,
            'founding_timeline': founding_timeline,
            'provinces': province_data,
            'rating_histogram': rating_histogram,
            'bubble_chart': bubble_chart_data,
            'sunburst': sunburst_data,
            'rating_scatter': rating_scatter,
            # New enrichment source charts
            'new_sources_coverage': new_sources_coverage,
            'social_media_platforms': social_media_data,
            'museum_register_by_province': museum_register_by_province,
            # Enrichment certainty charts
            'enrichment_certainty': {
                'summary': certainty_summary,
                'details': certainty_details,
                'colors': certainty_colors,
            },
        }
    }


def main():
    """Main export function: read enriched YAML, write the statistics JSON."""
    # Paths
    enriched_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries'
    output_dir = project_root / 'frontend' / 'public' / 'data'
    output_file = output_dir / 'nde_statistics.json'

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing enriched entries from: {enriched_dir}")

    # Collect statistics
    stats = process_enriched_files(enriched_dir)

    # Format for D3.js
    d3_data = format_for_d3(stats)

    # Write JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(d3_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Statistics export complete!")
    print(f" Total entries: {stats['total_entries']}")
    print(f" Output file: {output_file}")

    # Print summary
    print(f"\n📊 Summary:")
    print(f" Institution types: {len(stats['institution_types'])}")
    print(f" Unique cities: {len(stats['cities'])}")
    print(f" Provinces with data: {len(stats['provinces'])}")
    print(f" Collection systems: {len(stats['collection_systems'])}")
    print(f" Wikidata types: {len(stats['wikidata_types'])}")

    # Print new enrichment sources
    # BUG FIX: guard the percentage prints — an empty input directory
    # previously crashed here with ZeroDivisionError.
    total = stats['total_entries']
    if total:
        print(f"\n🆕 New Enrichment Sources:")
        print(f" ISIL (NA - Archives): {stats['new_sources']['has_nan_isil']} ({stats['new_sources']['has_nan_isil']/total*100:.1f}%)")
        print(f" ISIL (KB - Libraries): {stats['new_sources'].get('has_kb_isil', 0)} ({stats['new_sources'].get('has_kb_isil', 0)/total*100:.1f}%)")
        print(f" Museum Register: {stats['new_sources']['has_museum_register']} ({stats['new_sources']['has_museum_register']/total*100:.1f}%)")
        print(f" GHCID: {stats['new_sources']['has_ghcid']} ({stats['new_sources']['has_ghcid']/total*100:.1f}%)")
        print(f" Web Claims: {stats['new_sources']['has_web_claims']} ({stats['new_sources']['has_web_claims']/total*100:.1f}%)")
        print(f" Social Media: {stats['new_sources']['has_social_media']} ({stats['new_sources']['has_social_media']/total*100:.1f}%)")
        print(f" Verified Name: {stats['new_sources']['has_verified_name']} ({stats['new_sources']['has_verified_name']/total*100:.1f}%)")

    # Print social media breakdown
    if stats['social_media_platforms']:
        print(f"\n📱 Social Media Platforms:")
        for platform, count in stats['social_media_platforms'].most_common():
            print(f" {platform}: {count}")

    # Print province breakdown
    if stats['provinces']:
        print(f"\n🗺️ Province distribution:")
        for province, count in stats['provinces'].most_common():
            print(f" {province}: {count}")


if __name__ == '__main__':
    main()