#!/usr/bin/env python3
"""
Export NDE Enriched Institutions to JSON for Frontend Map

Reads the enriched YAML files and produces a lightweight JSON file
suitable for the React/Leaflet map component.

Now includes Google Maps enrichment data (ratings, photos, reviews, opening hours).
"""

import json
import sys
from datetime import datetime, timezone
from pathlib import Path

# Add project root to path for imports
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    import yaml
except ImportError:
    print("Error: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)
# Marker colour (hex) per single-letter NDE institution type code.
# Keys are sorted alphabetically; values are unchanged.
TYPE_COLORS = {
    'A': '#3498db',  # Archive - blue
    'B': '#4caf50',  # Botanical - green
    'C': '#795548',  # Corporation - brown
    'D': '#34495e',  # Digital - dark gray
    'E': '#ff9800',  # Education - amber
    'F': '#95a5a6',  # Features - gray
    'G': '#00bcd4',  # Gallery - cyan
    'H': '#607d8b',  # Holy sites - blue gray
    'I': '#673ab7',  # Intangible - deep purple
    'L': '#2ecc71',  # Library - green
    'M': '#e74c3c',  # Museum - red
    'N': '#e91e63',  # NGO - pink
    'O': '#f39c12',  # Official - orange
    'P': '#ff5722',  # Personal - deep orange
    'R': '#1abc9c',  # Research - teal
    'S': '#9b59b6',  # Society - purple
    'T': '#ff5722',  # Taste/smell - deep orange
    'U': '#9e9e9e',  # Unknown - gray
    'X': '#607d8b',  # Mixed - blue gray
}
# Human-readable label per single-letter NDE institution type code.
# Keys are sorted alphabetically; values are unchanged.
TYPE_NAMES = {
    'A': 'Archive',
    'B': 'Botanical',
    'C': 'Corporation',
    'D': 'Digital',
    'E': 'Education',
    'F': 'Features',
    'G': 'Gallery',
    'H': 'Holy sites',
    'I': 'Intangible',
    'L': 'Library',
    'M': 'Museum',
    'N': 'NGO',
    'O': 'Official',
    'P': 'Personal',
    'R': 'Research',
    'S': 'Society',
    'T': 'Taste/smell',
    'U': 'Unknown',
    'X': 'Mixed',
}
# Web-claim types that map to social-media keys on the exported record.
_SOCIAL_CLAIM_KEYS = {
    'social_facebook': 'facebook',
    'social_instagram': 'instagram',
    'social_twitter': 'twitter',
    'social_linkedin': 'linkedin',
    'social_youtube': 'youtube',
}


def _pick_coordinates(google_maps: dict, enrichment: dict) -> tuple:
    """Return (lat, lon), preferring Google Maps (more precise) over Wikidata.

    Returns (None, None) when neither source has a complete coordinate pair.
    """
    candidates = (
        google_maps.get('coordinates') or {},
        enrichment.get('wikidata_coordinates') or {},
    )
    for coords in candidates:
        lat = coords.get('latitude')
        lon = coords.get('longitude')
        # Explicit None checks: a coordinate of exactly 0.0 is valid and must
        # not be discarded by a truthiness test.
        if lat is not None and lon is not None:
            return lat, lon
    return None, None


def _pick_province(google_maps: dict, enrichment: dict) -> "str | None":
    """Province from Google address components, else parsed from Wikidata.

    Google's `administrative_area_level_1` component is the Dutch province.
    The Wikidata fallback parses descriptions like
    "gemeente in Drenthe, Nederland".
    """
    province = None
    for component in google_maps.get('address_components') or []:
        if 'administrative_area_level_1' in (component.get('types') or []):
            province = component.get('long_name')
            break
    if province:
        return province

    located_in = enrichment.get('wikidata_located_in') or {}
    desc = located_in.get('description_nl') or ''
    if 'gemeente in ' in desc:
        parts = desc.split('gemeente in ')
        if len(parts) > 1:
            candidate = parts[1].split(',')[0].strip()
            if candidate and candidate != 'Nederland':
                return candidate
    return province


def _pick_description(enrichment: dict, exa_data: dict, google_maps: dict) -> str:
    """Best available description: Wikidata NL > EN > Exa > Google editorial."""
    description = (
        enrichment.get('wikidata_description_nl')
        or enrichment.get('wikidata_description_en')
        or exa_data.get('description')
    )
    if not description:
        # Google's editorial_summary has appeared both as {'text': ...} and as
        # a plain string across API versions; accept either shape.
        editorial = google_maps.get('editorial_summary')
        if isinstance(editorial, dict):
            description = editorial.get('text', '')
        elif isinstance(editorial, str):
            description = editorial
    # Guard against non-string YAML values (lists, numbers, null).
    return description if isinstance(description, str) else ''


def _add_google_maps_fields(result: dict, google_maps: dict) -> None:
    """Merge optional Google Maps fields (rating, phone, hours, media) into result."""
    if google_maps.get('rating'):
        result['rating'] = google_maps['rating']
        result['total_ratings'] = google_maps.get('total_ratings', 0)

    # Prefer the international phone format; fall back to the local one.
    phone = google_maps.get('phone_international') or google_maps.get('phone_local')
    if phone:
        result['phone'] = phone

    # Formatted address is more complete than the bare city name.
    if google_maps.get('formatted_address'):
        result['address'] = google_maps['formatted_address']

    # weekday_text is the human-readable opening-hours form.
    opening_hours = google_maps.get('opening_hours') or {}
    if opening_hours.get('weekday_text'):
        result['opening_hours'] = opening_hours['weekday_text']
        result['open_now'] = opening_hours.get('open_now', None)

    # Reviews - keep all reviews with full text.
    reviews = google_maps.get('reviews') or []
    if reviews:
        result['reviews'] = [
            {
                'author': r.get('author_name', 'Anonymous'),
                'rating': r.get('rating', 0),
                'text': r.get('text', ''),
                'time': r.get('relative_time_description', ''),
            }
            for r in reviews
        ]

    # Photos: direct URL list takes precedence over the object form.
    photos = google_maps.get('photos') or []
    photo_urls = google_maps.get('photo_urls') or []
    if photo_urls:
        result['photos'] = [{'url': url, 'attribution': ''} for url in photo_urls]
    elif photos:
        result['photos'] = [
            {
                'url': p.get('url', ''),
                'attribution': p.get('attributions', [''])[0] if p.get('attributions') else '',
            }
            for p in photos
        ]

    # Simple pass-through fields, copied only when present.
    for src_key, dst_key in (
        ('street_view_url', 'street_view_url'),
        ('business_status', 'business_status'),
        ('place_id', 'google_place_id'),
    ):
        if google_maps.get(src_key):
            result[dst_key] = google_maps[src_key]


def _add_web_claims(result: dict, web_claims_data: dict) -> None:
    """Attach social-media links and a fallback description from web claims."""
    social_links = {}
    web_description = None
    for claim in web_claims_data.get('claims') or []:
        claim_type = claim.get('claim_type', '')
        claim_value = claim.get('claim_value', '')
        if claim_type in _SOCIAL_CLAIM_KEYS:
            social_links[_SOCIAL_CLAIM_KEYS[claim_type]] = claim_value
        elif claim_type == 'description_short' and not web_description:
            web_description = claim_value

    if social_links:
        result['social_media'] = social_links
    # Only use the web description when nothing better was found earlier.
    if web_description and not result.get('description'):
        result['description'] = web_description


def extract_institution_data(entry_data: dict) -> "dict | None":
    """Extract the relevant data for the map from an enriched entry.

    Combines the original NDE entry with Wikidata, Google Maps, Exa, ISIL,
    Museum Register, GHCID, identifier, and web-claim enrichment sections.

    Returns:
        A flat dict for the frontend map, or None when no coordinates are
        available from any source.
    """
    entry_data = entry_data or {}
    # `or {}` / `or []` guards: YAML serializes empty sections as null, which
    # `.get(key, {})` does not protect against.
    original = entry_data.get('original_entry') or {}
    enrichment = entry_data.get('wikidata_enrichment') or {}
    google_maps = entry_data.get('google_maps_enrichment') or {}
    exa_data = entry_data.get('exa_enrichment') or {}
    nan_isil = entry_data.get('nan_isil_enrichment') or {}
    museum_register = entry_data.get('museum_register_enrichment') or {}
    web_claims_data = entry_data.get('web_claims') or {}
    ghcid_data = entry_data.get('ghcid') or {}
    identifiers = entry_data.get('identifiers') or []
    custodian_name = entry_data.get('custodian_name') or {}

    lat, lon = _pick_coordinates(google_maps, enrichment)
    if lat is None or lon is None:
        return None  # Not mappable without coordinates.

    # Institution type: first code when a list is given, 'U' (unknown) otherwise.
    types = original.get('type') or []
    inst_type = types[0] if types else 'U'

    # Name: prefer the Dutch Wikidata label, fall back to the original name.
    name = (
        enrichment.get('wikidata_label_nl')
        or original.get('organisatie')
        or 'Unknown Institution'
    )

    # Fine-grained Wikidata P31 labels, e.g. "museum", "regional archive".
    wikidata_types = []
    for wd_type in enrichment.get('wikidata_instance_of') or []:
        label = wd_type.get('label_en') or wd_type.get('label_nl')
        if label:
            wikidata_types.append(label)

    result = {
        'lat': lat,
        'lon': lon,
        'name': name,
        'city': original.get('plaatsnaam_bezoekadres', ''),
        'province': _pick_province(google_maps, enrichment),
        'type': inst_type,
        'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
        'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
        # Website: prefer Google Maps (more current), then Wikidata, then original.
        'website': (
            google_maps.get('website')
            or enrichment.get('wikidata_official_website')
            or original.get('webadres_organisatie')
            or ''
        ),
        'wikidata_id': enrichment.get('wikidata_entity_id', ''),
        'wikidata_types': wikidata_types,
        'description': _pick_description(enrichment, exa_data, google_maps),
    }

    if google_maps:
        _add_google_maps_fields(result, google_maps)

    # ISIL data from the Nationaal Archief enrichment.
    if nan_isil:
        result['isil'] = {
            'code': nan_isil.get('isil_code', ''),
            'name': nan_isil.get('nan_name', ''),
            'city': nan_isil.get('nan_city', ''),
            'assigned_date': nan_isil.get('nan_toegekend_op', ''),
            'source': 'Nationaal Archief ISIL Registry',
        }

    if museum_register:
        result['museum_register'] = {
            'name': museum_register.get('museum_name', ''),
            'province': museum_register.get('province', ''),
            'registered_since': museum_register.get('registered_since', ''),
            'website': museum_register.get('website_url', ''),
        }

    # GHCID (Global Heritage Custodian Identifier).
    if ghcid_data:
        result['ghcid'] = {
            'current': ghcid_data.get('ghcid_current', ''),
            'uuid': ghcid_data.get('ghcid_uuid', ''),
        }

    # Standardized identifiers, limited to well-known schemes.
    if identifiers:
        result['identifiers'] = [
            {
                'scheme': id_entry.get('identifier_scheme', ''),
                'value': id_entry.get('identifier_value', ''),
                'url': id_entry.get('identifier_url', ''),
            }
            for id_entry in identifiers
            if id_entry.get('identifier_scheme') in ('ISIL', 'GHCID', 'Wikidata', 'VIAF')
        ]

    if web_claims_data and web_claims_data.get('claims'):
        _add_web_claims(result, web_claims_data)

    # Verified custodian name, when a claim exists.
    if custodian_name and custodian_name.get('claim_value'):
        result['verified_name'] = custodian_name.get('claim_value')
        result['name_source'] = custodian_name.get('extraction_method', 'unknown')

    return result
def main() -> None:
    """Read enriched YAML entries, write the map JSON, and print coverage stats."""

    # Paths
    enriched_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries'
    output_dir = project_root / 'frontend' / 'public' / 'data'
    output_file = output_dir / 'nde_institutions.json'

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Reading enriched entries from: {enriched_dir}")

    institutions = []
    files_processed = 0
    files_with_coords = 0

    # Process all YAML files (sorted for deterministic order).
    for yaml_file in sorted(enriched_dir.glob('*.yaml')):
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry_data = yaml.safe_load(f)

            files_processed += 1

            inst_data = extract_institution_data(entry_data)
            if inst_data:
                institutions.append(inst_data)
                files_with_coords += 1
        except Exception as e:
            # Best-effort export: report and keep going on malformed entries.
            print(f"Warning: Error processing {yaml_file.name}: {e}")
            continue

    # Sort by name for a stable, browsable output file.
    institutions.sort(key=lambda x: x['name'].lower())

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    print("\n✅ Export complete!")
    print(f" Files processed: {files_processed}")
    print(f" Institutions with coordinates: {files_with_coords}")
    print(f" Output file: {output_file}")

    total = len(institutions)

    def pct(count: int) -> str:
        # Guard against ZeroDivisionError when no institutions were exported.
        return f"{count * 100 / total:.1f}" if total else "0.0"

    def count_with(key: str) -> int:
        # Number of institutions carrying a truthy value for `key`.
        return sum(1 for inst in institutions if inst.get(key))

    # Type distribution, most common first.
    type_counts = {}
    for inst in institutions:
        t = inst['type']
        type_counts[t] = type_counts.get(t, 0) + 1

    print("\n📊 Distribution by type:")
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {TYPE_NAMES.get(t, t)}: {count}")

    print("\n🗺️ Google Maps enrichment coverage:")
    for label, key in (
        ('With ratings', 'rating'),
        ('With photos', 'photos'),
        ('With reviews', 'reviews'),
        ('With opening hours', 'opening_hours'),
        ('With Street View', 'street_view_url'),
    ):
        n = count_with(key)
        print(f" {label}: {n} ({pct(n)}%)")

    print("\n📋 New enrichment coverage:")
    for label, key in (
        ('With ISIL code', 'isil'),
        ('With Museum Register', 'museum_register'),
        ('With GHCID', 'ghcid'),
        ('With social media', 'social_media'),
        ('With verified name', 'verified_name'),
    ):
        n = count_with(key)
        print(f" {label}: {n} ({pct(n)}%)")


if __name__ == '__main__':
    main()