- Introduced a comprehensive class diagram for the heritage custodian observation reconstruction schema. - Defined multiple classes including AllocationAgency, ArchiveOrganizationType, AuxiliaryDigitalPlatform, and others, with relevant attributes and relationships. - Established inheritance and associations among classes to represent complex relationships within the schema. - Generated on 2025-11-28, version 0.9.0, excluding the Container class.
446 lines
16 KiB
Python
446 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Export NDE Statistics to JSON for Frontend Visualizations
|
|
|
|
Reads the enriched YAML files and produces a comprehensive statistics JSON
|
|
suitable for D3.js visualizations in the React frontend.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from collections import Counter, defaultdict
|
|
import sys
|
|
|
|
# Add project root to path for imports
|
|
project_root = Path(__file__).parent.parent
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
print("Error: PyYAML not installed. Run: pip install pyyaml")
|
|
sys.exit(1)
|
|
|
|
# Netherlands Province bounding boxes (approximate) for coordinate lookup
# Format: (min_lat, max_lat, min_lon, max_lon)
# Consumed by get_province_from_coords(). The boxes are coarse and overlap
# at the edges, so dict insertion order decides ties between neighbours.
PROVINCE_BOUNDS = {
    'Groningen': (53.05, 53.55, 6.15, 7.25),
    'Friesland': (52.85, 53.50, 5.05, 6.35),
    'Drenthe': (52.65, 53.15, 6.15, 7.10),
    'Overijssel': (52.15, 52.85, 5.75, 7.10),
    'Flevoland': (52.25, 52.75, 5.15, 6.00),
    'Gelderland': (51.75, 52.55, 5.05, 6.85),
    'Utrecht': (51.95, 52.35, 4.75, 5.65),
    'Noord-Holland': (52.25, 53.00, 4.50, 5.35),
    'Zuid-Holland': (51.65, 52.35, 3.85, 5.00),
    'Zeeland': (51.20, 51.75, 3.35, 4.30),
    'Noord-Brabant': (51.25, 51.85, 4.35, 6.05),
    'Limburg': (50.75, 51.80, 5.55, 6.25),
}
|
|
|
|
# Province colors for visualization
# Categorical palette: one fixed hex color per province. Keys must match
# PROVINCE_BOUNDS so chart data can join counts to colors by province name.
PROVINCE_COLORS = {
    'Groningen': '#1f77b4',
    'Friesland': '#ff7f0e',
    'Drenthe': '#2ca02c',
    'Overijssel': '#d62728',
    'Flevoland': '#9467bd',
    'Gelderland': '#8c564b',
    'Utrecht': '#e377c2',
    'Noord-Holland': '#7f7f7f',
    'Zuid-Holland': '#bcbd22',
    'Zeeland': '#17becf',
    'Noord-Brabant': '#aec7e8',
    'Limburg': '#ffbb78',
}
|
|
|
|
|
|
def get_province_from_coords(lat: float, lon: float) -> str | None:
|
|
"""Determine the Dutch province from coordinates using bounding box lookup."""
|
|
if not lat or not lon:
|
|
return None
|
|
|
|
for province, (min_lat, max_lat, min_lon, max_lon) in PROVINCE_BOUNDS.items():
|
|
if min_lat <= lat <= max_lat and min_lon <= lon <= max_lon:
|
|
return province
|
|
|
|
# Fallback for edge cases - check if coordinates are in Netherlands at all
|
|
if 50.7 <= lat <= 53.6 and 3.3 <= lon <= 7.3:
|
|
# In Netherlands but didn't match bounds - find closest province
|
|
best_province = None
|
|
best_distance = float('inf')
|
|
for province, (min_lat, max_lat, min_lon, max_lon) in PROVINCE_BOUNDS.items():
|
|
center_lat = (min_lat + max_lat) / 2
|
|
center_lon = (min_lon + max_lon) / 2
|
|
distance = ((lat - center_lat) ** 2 + (lon - center_lon) ** 2) ** 0.5
|
|
if distance < best_distance:
|
|
best_distance = distance
|
|
best_province = province
|
|
return best_province
|
|
|
|
return None
|
|
|
|
|
|
# Institution type mappings
# Single-letter institution type code -> display name and chart color.
# Lookup sites fall back to {'name': code, 'color': '#9e9e9e'} for codes
# not listed here.
# NOTE(review): 'H'/'X' share '#607d8b' and 'P'/'T' share '#ff5722' —
# confirm the duplicate colors are intentional.
TYPE_INFO = {
    'G': {'name': 'Gallery', 'color': '#00bcd4'},
    'L': {'name': 'Library', 'color': '#2ecc71'},
    'A': {'name': 'Archive', 'color': '#3498db'},
    'M': {'name': 'Museum', 'color': '#e74c3c'},
    'O': {'name': 'Official', 'color': '#f39c12'},
    'R': {'name': 'Research', 'color': '#1abc9c'},
    'C': {'name': 'Corporation', 'color': '#795548'},
    'U': {'name': 'Unknown', 'color': '#9e9e9e'},
    'B': {'name': 'Botanical', 'color': '#4caf50'},
    'E': {'name': 'Education', 'color': '#ff9800'},
    'S': {'name': 'Society', 'color': '#9b59b6'},
    'F': {'name': 'Features', 'color': '#95a5a6'},
    'I': {'name': 'Intangible', 'color': '#673ab7'},
    'X': {'name': 'Mixed', 'color': '#607d8b'},
    'P': {'name': 'Personal', 'color': '#ff5722'},
    'H': {'name': 'Holy sites', 'color': '#607d8b'},
    'D': {'name': 'Digital', 'color': '#34495e'},
    'N': {'name': 'NGO', 'color': '#e91e63'},
    'T': {'name': 'Taste/smell', 'color': '#ff5722'},
}
|
|
|
|
|
|
def process_enriched_files(enriched_dir: Path) -> dict:
    """Process all enriched YAML files and collect statistics.

    Args:
        enriched_dir: Directory containing one enriched ``*.yaml`` file per
            institution.

    Returns:
        A dict of counters and nested tallies in the shape consumed by
        ``format_for_d3``. Files that fail to parse or do not contain a
        mapping are reported and skipped without affecting the totals.
    """
    stats = {
        'total_entries': 0,
        'enrichment_status': Counter(),
        'institution_types': Counter(),
        'cities': Counter(),
        'provinces': Counter(),
        'provinces_by_type': defaultdict(Counter),  # province -> {type: count}
        'collection_systems': Counter(),
        'wikidata_types': Counter(),
        'identifiers': {
            'has_coordinates': 0,
            'has_isil': 0,
            'has_wikipedia_nl': 0,
            'has_image': 0,
            'has_website': 0,
        },
        'google_maps': {
            'has_rating': 0,
            'has_photos': 0,
            'has_reviews': 0,
            'has_opening_hours': 0,
            'has_street_view': 0,
            'status_success': 0,
            'status_not_found': 0,
        },
        'founding_decades': Counter(),
    }

    yaml_files = sorted(enriched_dir.glob('*.yaml'))

    for yaml_file in yaml_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            # Validate BEFORE counting: an empty or malformed file parses to
            # None (or a scalar) and must not inflate total_entries.
            if not isinstance(entry, dict):
                print(f"Warning: Skipping {yaml_file.name}: not a mapping")
                continue

            stats['total_entries'] += 1

            # Enrichment status
            status = entry.get('enrichment_status', 'unknown')
            stats['enrichment_status'][status] += 1

            # Original entry data. `or {}` also guards against an explicit
            # YAML null value, which .get(..., {}) would pass through.
            original = entry.get('original_entry') or {}

            # Institution type (first code only)
            types = original.get('type') or []
            if types:
                stats['institution_types'][types[0]] += 1

            # City
            city = original.get('plaatsnaam_bezoekadres', '')
            if city:
                stats['cities'][city] += 1

            # Collection system (skip blank/whitespace-only values)
            system = original.get('collectiebeheersysteem', '')
            if system and system.strip():
                stats['collection_systems'][system] += 1

            # Wikidata enrichment
            enrichment = entry.get('wikidata_enrichment') or {}

            # Coordinates and Province
            coords = enrichment.get('wikidata_coordinates') or {}
            inst_type = types[0] if types else 'U'
            if coords and coords.get('latitude'):
                stats['identifiers']['has_coordinates'] += 1
                # Determine province from coordinates
                lat = coords.get('latitude')
                lon = coords.get('longitude')
                province = get_province_from_coords(lat, lon)
                if province:
                    stats['provinces'][province] += 1
                    stats['provinces_by_type'][province][inst_type] += 1

            # Website
            if enrichment.get('wikidata_official_website'):
                stats['identifiers']['has_website'] += 1

            # Image
            if enrichment.get('wikidata_image'):
                stats['identifiers']['has_image'] += 1

            # Wikipedia NL ("x in None" would raise, hence the `or {}`)
            sitelinks = enrichment.get('wikidata_sitelinks') or {}
            if 'nlwiki' in sitelinks:
                stats['identifiers']['has_wikipedia_nl'] += 1

            # ISIL: either from Wikidata or from the original registry entry
            wd_identifiers = enrichment.get('wikidata_identifiers') or {}
            if 'isil' in wd_identifiers or original.get('isil_code'):
                stats['identifiers']['has_isil'] += 1

            # Wikidata instance types (prefer English label, then Dutch)
            instance_of = enrichment.get('wikidata_instance_of') or []
            for inst in instance_of:
                label = inst.get('label_en', inst.get('label_nl', 'Unknown'))
                stats['wikidata_types'][label] += 1

            # Google Maps enrichment
            google = entry.get('google_maps_enrichment') or {}
            if google:
                # Check API status or presence of place_id
                api_status = google.get('api_status', '')
                if api_status == 'OK' or google.get('place_id'):
                    stats['google_maps']['status_success'] += 1
                elif api_status == 'NOT_FOUND':
                    stats['google_maps']['status_not_found'] += 1

                if google.get('rating'):
                    stats['google_maps']['has_rating'] += 1
                # Check both 'photos' and 'photo_urls' fields
                if google.get('photos') or google.get('photo_urls'):
                    stats['google_maps']['has_photos'] += 1
                if google.get('reviews'):
                    stats['google_maps']['has_reviews'] += 1
                if google.get('opening_hours'):
                    stats['google_maps']['has_opening_hours'] += 1
                # Approximate: if we have coordinates, Street View is available
                if google.get('coordinates') or google.get('latitude'):
                    stats['google_maps']['has_street_view'] += 1

            # Founding date / inception
            inception = enrichment.get('wikidata_inception') or {}
            if inception and inception.get('time'):
                time_str = inception['time']
                try:
                    # Extract year from time string like "+1815-00-00T00:00:00Z"
                    year = int(time_str[1:5])
                    decade = (year // 10) * 10
                    stats['founding_decades'][decade] += 1
                except (ValueError, IndexError):
                    pass

        except Exception as e:
            # Best-effort: report and move on so one bad file cannot abort
            # the whole export.
            print(f"Warning: Error processing {yaml_file.name}: {e}")
            continue

    return stats
|
|
|
|
|
|
def format_for_d3(stats: dict) -> dict:
    """Format statistics for D3.js visualizations.

    Args:
        stats: Aggregated counters as produced by ``process_enriched_files``.

    Returns:
        A JSON-serializable dict with a ``summary`` section and per-chart
        datasets under ``charts``.
    """
    total = stats['total_entries']

    def _pct(count: int) -> float:
        # Shared percentage helper: an empty dataset (total == 0) yields 0
        # instead of raising ZeroDivisionError (identifier_coverage always
        # iterates its five fixed keys, so the guard is required).
        return round(count / total * 100, 1) if total else 0

    # Institution types for pie/donut chart (descending by count)
    type_data = []
    for code, count in sorted(stats['institution_types'].items(), key=lambda x: -x[1]):
        info = TYPE_INFO.get(code, {'name': code, 'color': '#9e9e9e'})
        type_data.append({
            'code': code,
            'name': info['name'],
            'count': count,
            'percentage': _pct(count),
            'color': info['color'],
        })

    # Top cities for bar chart
    top_cities = [
        {'city': city, 'count': count}
        for city, count in stats['cities'].most_common(20)
    ]

    # Collection systems for horizontal bar chart
    collection_systems = [
        {'system': system, 'count': count}
        for system, count in stats['collection_systems'].most_common(15)
    ]

    # Wikidata types for treemap
    wikidata_types = [
        {'type': type_name, 'count': count}
        for type_name, count in stats['wikidata_types'].most_common(20)
    ]

    # Enrichment status for donut
    enrichment_status = []
    status_colors = {'success': '#2ecc71', 'skipped': '#f39c12', 'error': '#e74c3c', 'unknown': '#9e9e9e'}
    for status, count in stats['enrichment_status'].items():
        enrichment_status.append({
            'status': status.capitalize(),
            'count': count,
            'percentage': _pct(count),
            'color': status_colors.get(status, '#9e9e9e'),
        })

    # Identifier coverage for bar chart
    id_labels = {
        'has_coordinates': 'Coordinates',
        'has_wikipedia_nl': 'Wikipedia NL',
        'has_image': 'Image',
        'has_website': 'Website',
        'has_isil': 'ISIL Code',
    }
    identifier_coverage = []
    for key, count in stats['identifiers'].items():
        identifier_coverage.append({
            'identifier': id_labels.get(key, key),
            'count': count,
            'percentage': _pct(count),
        })
    identifier_coverage.sort(key=lambda x: -x['count'])

    # Founding decades for line/area chart, zero-filled between min and max
    founding_timeline = []
    if stats['founding_decades']:
        min_decade = min(stats['founding_decades'].keys())
        max_decade = max(stats['founding_decades'].keys())
        for decade in range(min_decade, max_decade + 10, 10):
            founding_timeline.append({
                'decade': decade,
                'count': stats['founding_decades'].get(decade, 0),
            })

    # Province distribution for choropleth/cartogram
    province_data = []
    for province, count in sorted(stats['provinces'].items(), key=lambda x: -x[1]):
        # Get breakdown by institution type for this province
        type_breakdown = {}
        for inst_type, type_count in stats['provinces_by_type'][province].items():
            type_info = TYPE_INFO.get(inst_type, {'name': inst_type, 'color': '#9e9e9e'})
            type_breakdown[inst_type] = {
                'code': inst_type,
                'name': type_info['name'],
                'count': type_count,
                'color': type_info['color'],
            }

        province_data.append({
            'province': province,
            'count': count,
            'color': PROVINCE_COLORS.get(province, '#9e9e9e'),
            'types': type_breakdown,
        })

    # Google Maps coverage for bar chart (fixed key order, sorted after)
    gm_labels = {
        'has_rating': 'Rating',
        'has_photos': 'Photos',
        'has_reviews': 'Reviews',
        'has_opening_hours': 'Opening Hours',
        'has_street_view': 'Street View',
    }
    google_maps_coverage = []
    for key in ['has_rating', 'has_photos', 'has_reviews', 'has_opening_hours', 'has_street_view']:
        count = stats['google_maps'].get(key, 0)
        google_maps_coverage.append({
            'feature': gm_labels.get(key, key),
            'count': count,
            'percentage': _pct(count),
        })
    google_maps_coverage.sort(key=lambda x: -x['count'])

    # Google Maps status for summary
    gm_success = stats['google_maps'].get('status_success', 0)
    gm_not_found = stats['google_maps'].get('status_not_found', 0)

    return {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'total_entries': total,
        'summary': {
            'total_institutions': total,
            'with_coordinates': stats['identifiers']['has_coordinates'],
            'with_wikidata': stats['enrichment_status'].get('success', 0),
            'with_google_maps': gm_success,
            'google_maps_not_found': gm_not_found,
            'unique_cities': len(stats['cities']),
            'unique_provinces': len(stats['provinces']),
            'institution_types': len(stats['institution_types']),
        },
        'charts': {
            'institution_types': type_data,
            'top_cities': top_cities,
            'collection_systems': collection_systems,
            'wikidata_types': wikidata_types,
            'enrichment_status': enrichment_status,
            'identifier_coverage': identifier_coverage,
            'google_maps_coverage': google_maps_coverage,
            'founding_timeline': founding_timeline,
            'provinces': province_data,
        }
    }
|
|
|
|
|
|
def main():
    """Export NDE statistics: read enriched entries, write the frontend JSON.

    Aggregates all enriched YAML entries, shapes the result for D3.js, and
    writes it to frontend/public/data/nde_statistics.json, then prints a
    console summary.
    """
    source_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries'
    target_dir = project_root / 'frontend' / 'public' / 'data'
    target_file = target_dir / 'nde_statistics.json'

    # Make sure the destination directory exists before writing.
    target_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing enriched entries from: {source_dir}")

    # Aggregate raw counters, then shape them for the charts.
    collected = process_enriched_files(source_dir)
    payload = format_for_d3(collected)

    # Serialize as human-readable UTF-8 JSON.
    target_file.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    print(f"\n✅ Statistics export complete!")
    print(f" Total entries: {collected['total_entries']}")
    print(f" Output file: {target_file}")

    # High-level summary for the console.
    print(f"\n📊 Summary:")
    print(f" Institution types: {len(collected['institution_types'])}")
    print(f" Unique cities: {len(collected['cities'])}")
    print(f" Provinces with data: {len(collected['provinces'])}")
    print(f" Collection systems: {len(collected['collection_systems'])}")
    print(f" Wikidata types: {len(collected['wikidata_types'])}")

    # Per-province counts, most populous first.
    if collected['provinces']:
        print(f"\n🗺️ Province distribution:")
        for name, hits in collected['provinces'].most_common():
            print(f" {name}: {hits}")


if __name__ == '__main__':
    main()
|