glam/scripts/export_nde_map_json.py
kempersc 5cdce584b2 Add complete schema for heritage custodian observation reconstruction
- Introduced a comprehensive class diagram for the heritage custodian observation reconstruction schema.
- Defined multiple classes including AllocationAgency, ArchiveOrganizationType, AuxiliaryDigitalPlatform, and others, with relevant attributes and relationships.
- Established inheritance and associations among classes to represent complex relationships within the schema.
- Generated on 2025-11-28, version 0.9.0, excluding the Container class.
2025-11-28 13:13:23 +01:00

302 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Export NDE Enriched Institutions to JSON for Frontend Map

Reads the enriched YAML files and produces a lightweight JSON file
suitable for the React/Leaflet map component.
Now includes Google Maps enrichment data (ratings, photos, reviews, opening hours).
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

# Make project-local packages importable when this script is run directly.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# PyYAML is a third-party dependency; fail early with an actionable message.
try:
    import yaml
except ImportError:
    print("Error: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)
# Institution type mappings
TYPE_COLORS = {
'G': '#00bcd4', # Gallery - cyan
'L': '#2ecc71', # Library - green
'A': '#3498db', # Archive - blue
'M': '#e74c3c', # Museum - red
'O': '#f39c12', # Official - orange
'R': '#1abc9c', # Research - teal
'C': '#795548', # Corporation - brown
'U': '#9e9e9e', # Unknown - gray
'B': '#4caf50', # Botanical - green
'E': '#ff9800', # Education - amber
'S': '#9b59b6', # Society - purple
'F': '#95a5a6', # Features - gray
'I': '#673ab7', # Intangible - deep purple
'X': '#607d8b', # Mixed - blue gray
'P': '#ff5722', # Personal - deep orange
'H': '#607d8b', # Holy sites - blue gray
'D': '#34495e', # Digital - dark gray
'N': '#e91e63', # NGO - pink
'T': '#ff5722', # Taste/smell - deep orange
}
TYPE_NAMES = {
'G': 'Gallery',
'L': 'Library',
'A': 'Archive',
'M': 'Museum',
'O': 'Official',
'R': 'Research',
'C': 'Corporation',
'U': 'Unknown',
'B': 'Botanical',
'E': 'Education',
'S': 'Society',
'F': 'Features',
'I': 'Intangible',
'X': 'Mixed',
'P': 'Personal',
'H': 'Holy sites',
'D': 'Digital',
'N': 'NGO',
'T': 'Taste/smell',
}
def extract_institution_data(entry_data: dict) -> dict | None:
"""Extract the relevant data for the map from an enriched entry."""
# Get original entry data
original = entry_data.get('original_entry', {})
enrichment = entry_data.get('wikidata_enrichment', {})
google_maps = entry_data.get('google_maps_enrichment', {})
exa_data = entry_data.get('exa_enrichment', {})
# Get coordinates - prefer Google Maps (more precise), fall back to Wikidata
lat, lon = None, None
# Try Google Maps coordinates first
google_coords = google_maps.get('coordinates', {})
if google_coords.get('latitude') and google_coords.get('longitude'):
lat = google_coords['latitude']
lon = google_coords['longitude']
else:
# Fall back to Wikidata coordinates
wd_coords = enrichment.get('wikidata_coordinates', {})
if wd_coords.get('latitude') and wd_coords.get('longitude'):
lat = wd_coords['latitude']
lon = wd_coords['longitude']
# Skip if no coordinates from any source
if not lat or not lon:
return None
# Get institution type (first one if list)
types = original.get('type', [])
inst_type = types[0] if types else 'U'
# Get name - prefer Dutch label, fall back to original name
name = (
enrichment.get('wikidata_label_nl') or
original.get('organisatie') or
'Unknown Institution'
)
# Get city - prefer Google Maps short address
city = original.get('plaatsnaam_bezoekadres', '')
# Get description - prefer Dutch, fall back to English, then Exa, then Google editorial
# Handle various types safely
description = ''
if enrichment.get('wikidata_description_nl'):
description = enrichment['wikidata_description_nl']
elif enrichment.get('wikidata_description_en'):
description = enrichment['wikidata_description_en']
elif exa_data.get('description'):
description = exa_data['description']
else:
editorial = google_maps.get('editorial_summary')
if editorial and isinstance(editorial, dict):
description = editorial.get('text', '')
elif isinstance(editorial, str):
description = editorial
# Ensure description is a string
if not isinstance(description, str):
description = ''
# Get website - prefer Google Maps (more current), fall back to Wikidata
website = (
google_maps.get('website') or
enrichment.get('wikidata_official_website') or
original.get('webadres_organisatie') or
''
)
# Get Wikidata ID
wikidata_id = enrichment.get('wikidata_entity_id', '')
# Build result with base data
result = {
'lat': lat,
'lon': lon,
'name': name,
'city': city,
'type': inst_type,
'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
'website': website,
'wikidata_id': wikidata_id,
'description': description[:200] + '...' if len(description) > 200 else description,
}
# Add Google Maps enrichment data if available
if google_maps:
# Rating and reviews count
if google_maps.get('rating'):
result['rating'] = google_maps['rating']
result['total_ratings'] = google_maps.get('total_ratings', 0)
# Phone number
if google_maps.get('phone_international'):
result['phone'] = google_maps['phone_international']
elif google_maps.get('phone_local'):
result['phone'] = google_maps['phone_local']
# Formatted address (more complete than city)
if google_maps.get('formatted_address'):
result['address'] = google_maps['formatted_address']
# Opening hours (weekday text is human readable)
opening_hours = google_maps.get('opening_hours', {})
if opening_hours.get('weekday_text'):
result['opening_hours'] = opening_hours['weekday_text']
result['open_now'] = opening_hours.get('open_now', None)
# Reviews (first 3 for popups)
reviews = google_maps.get('reviews', [])
if reviews:
result['reviews'] = [
{
'author': r.get('author_name', 'Anonymous'),
'rating': r.get('rating', 0),
'text': r.get('text', '')[:300] + '...' if len(r.get('text', '')) > 300 else r.get('text', ''),
'time': r.get('relative_time_description', '')
}
for r in reviews[:3] # Only first 3 for popup
]
# Photos (first 5) - check both possible keys
photos = google_maps.get('photos', [])
photo_urls = google_maps.get('photo_urls', [])
if photo_urls:
# Direct URL format
result['photos'] = [{'url': url, 'attribution': ''} for url in photo_urls[:5]]
elif photos:
# Object format with attribution
result['photos'] = [
{
'url': p.get('url', ''),
'attribution': p.get('attributions', [''])[0] if p.get('attributions') else ''
}
for p in photos[:5]
]
# Street View URL
if google_maps.get('street_view_url'):
result['street_view_url'] = google_maps['street_view_url']
# Business status
if google_maps.get('business_status'):
result['business_status'] = google_maps['business_status']
# Google Place ID for linking
if google_maps.get('place_id'):
result['google_place_id'] = google_maps['place_id']
return result
def main():
    """Export all enriched YAML entries to a single JSON file for the map.

    Reads data/nde/enriched/entries/*.yaml, writes
    frontend/public/data/nde_institutions.json, and prints summary stats.
    """
    enriched_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries'
    output_dir = project_root / 'frontend' / 'public' / 'data'
    output_file = output_dir / 'nde_institutions.json'
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Reading enriched entries from: {enriched_dir}")
    institutions = []
    files_processed = 0
    files_with_coords = 0

    # Process all YAML files; a single bad file must not abort the export.
    for yaml_file in sorted(enriched_dir.glob('*.yaml')):
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry_data = yaml.safe_load(f)
            files_processed += 1
            # An empty YAML file parses to None; skip it explicitly rather
            # than letting extract_institution_data raise AttributeError.
            if not entry_data:
                continue
            inst_data = extract_institution_data(entry_data)
            if inst_data:
                institutions.append(inst_data)
                files_with_coords += 1
        except Exception as e:
            print(f"Warning: Error processing {yaml_file.name}: {e}")
            continue

    # Sort by name for stable, human-friendly output.
    institutions.sort(key=lambda x: x['name'].lower())

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Export complete!")
    print(f"  Files processed: {files_processed}")
    print(f"  Institutions with coordinates: {files_with_coords}")
    print(f"  Output file: {output_file}")

    # Distribution by institution type, most common first.
    type_counts = {}
    for inst in institutions:
        t = inst['type']
        type_counts[t] = type_counts.get(t, 0) + 1
    print(f"\n📊 Distribution by type:")
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"  {TYPE_NAMES.get(t, t)}: {count}")

    # Guard against ZeroDivisionError in the percentage stats when no
    # institution had coordinates.
    if not institutions:
        print("\n🗺️ No institutions exported; skipping Google Maps coverage stats.")
        return

    total = len(institutions)
    print(f"\n🗺️ Google Maps enrichment coverage:")
    for label, key in (
        ('ratings', 'rating'),
        ('photos', 'photos'),
        ('reviews', 'reviews'),
        ('opening hours', 'opening_hours'),
        ('Street View', 'street_view_url'),
    ):
        count = sum(1 for i in institutions if i.get(key))
        print(f"  With {label}: {count} ({count * 100 / total:.1f}%)")


if __name__ == '__main__':
    main()