#!/usr/bin/env python3 """ Export NDE Enriched Institutions to JSON for Frontend Map Reads the enriched YAML files and produces a lightweight JSON file suitable for the React/Leaflet map component. Now includes Google Maps enrichment data (ratings, photos, reviews, opening hours). """ import json from pathlib import Path from datetime import datetime, timezone import sys # Add project root to path for imports project_root = Path(__file__).parent.parent sys.path.insert(0, str(project_root)) try: import yaml # Use C-based loader for faster parsing (10x faster) try: from yaml import CSafeLoader as SafeLoader except ImportError: from yaml import SafeLoader except ImportError: print("Error: PyYAML not installed. Run: pip install pyyaml") sys.exit(1) # Institution type mappings TYPE_COLORS = { 'G': '#00bcd4', # Gallery - cyan 'L': '#2ecc71', # Library - green 'A': '#3498db', # Archive - blue 'M': '#e74c3c', # Museum - red 'O': '#f39c12', # Official - orange 'R': '#1abc9c', # Research - teal 'C': '#795548', # Corporation - brown 'U': '#9e9e9e', # Unknown - gray 'B': '#4caf50', # Botanical - green 'E': '#ff9800', # Education - amber 'S': '#9b59b6', # Society - purple 'F': '#95a5a6', # Features - gray 'I': '#673ab7', # Intangible - deep purple 'X': '#607d8b', # Mixed - blue gray 'P': '#ff5722', # Personal - deep orange 'H': '#607d8b', # Holy sites - blue gray 'D': '#34495e', # Digital - dark gray 'N': '#e91e63', # NGO - pink 'T': '#ff5722', # Taste/smell - deep orange } TYPE_NAMES = { 'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum', 'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown', 'B': 'Botanical', 'E': 'Education', 'S': 'Society', 'F': 'Features', 'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy sites', 'D': 'Digital', 'N': 'NGO', 'T': 'Taste/smell', } def extract_institution_data(entry_data: dict) -> dict | None: """Extract the relevant data for the map from an enriched entry.""" # Skip duplicates - they should not be visible in the export duplicate_status = entry_data.get('duplicate_status', {}) if duplicate_status.get('is_duplicate'): return None # Get original entry data original = entry_data.get('original_entry', {}) enrichment = entry_data.get('wikidata_enrichment', {}) google_maps = entry_data.get('google_maps_enrichment', {}) exa_data = entry_data.get('exa_enrichment', {}) # New enrichment sources nan_isil = entry_data.get('nan_isil_enrichment', {}) kb_isil = entry_data.get('kb_enrichment', {}) zcbs_data = entry_data.get('zcbs_enrichment', {}) museum_register = entry_data.get('museum_register_enrichment', {}) web_claims_data = entry_data.get('web_claims', {}) ghcid_data = entry_data.get('ghcid', {}) identifiers = entry_data.get('identifiers', []) custodian_name = entry_data.get('custodian_name', {}) youtube_data = entry_data.get('youtube_enrichment', {}) temporal_extent = entry_data.get('temporal_extent', {}) successor_org = entry_data.get('successor_organization', {}) genealogiewerkbalk_data = entry_data.get('genealogiewerkbalk_enrichment', {}) # Get coordinates - prefer Google Maps (more precise), fall back to Wikidata lat, lon = None, None # Try Google Maps coordinates first google_coords = google_maps.get('coordinates', {}) if google_coords.get('latitude') and google_coords.get('longitude'): lat = google_coords['latitude'] lon = google_coords['longitude'] else: # Fall back to Wikidata coordinates wd_coords = enrichment.get('wikidata_coordinates', {}) if wd_coords.get('latitude') and wd_coords.get('longitude'): lat = wd_coords['latitude'] lon = wd_coords['longitude'] # Skip if no coordinates from any source if not lat or not lon: return None # Get institution type (first one if list) types = original.get('type', []) inst_type = types[0] if types else 'U' # Get name - prefer Dutch label, fall back to original name name = ( enrichment.get('wikidata_label_nl') or original.get('organisatie') or 'Unknown Institution' ) # Get city - prefer Google Maps short address city = original.get('plaatsnaam_bezoekadres', '') # Get province from Google Maps address components (administrative_area_level_1) province = None address_components = google_maps.get('address_components', []) for component in address_components: component_types = component.get('types', []) if 'administrative_area_level_1' in component_types: province = component.get('long_name') break # Fall back to Wikidata located_in if no Google Maps province if not province: located_in = enrichment.get('wikidata_located_in', {}) if located_in: # Check if it's a municipality in a known province desc = located_in.get('description_nl', '') # Extract province from description like "gemeente in Drenthe, Nederland" if 'gemeente in ' in desc: parts = desc.split('gemeente in ') if len(parts) > 1: province_part = parts[1].split(',')[0].strip() if province_part and province_part != 'Nederland': province = province_part # Get description - prefer Dutch, fall back to English, then Exa, then Google editorial # Handle various types safely description = '' if enrichment.get('wikidata_description_nl'): description = enrichment['wikidata_description_nl'] elif enrichment.get('wikidata_description_en'): description = enrichment['wikidata_description_en'] elif exa_data.get('description'): description = exa_data['description'] else: editorial = google_maps.get('editorial_summary') if editorial and isinstance(editorial, dict): description = editorial.get('text', '') elif isinstance(editorial, str): description = editorial # Ensure description is a string if not isinstance(description, str): description = '' # Get website - prefer Google Maps (more current), fall back to Wikidata website = ( google_maps.get('website') or enrichment.get('wikidata_official_website') or original.get('webadres_organisatie') or '' ) # Get Wikidata ID wikidata_id = enrichment.get('wikidata_entity_id', '') # Get Wikidata instance_of types (P31) # This gives us fine-grained types like "museum", "historical society", "regional archive" wikidata_types = [] instance_of_list = enrichment.get('wikidata_instance_of', []) for wd_type in instance_of_list: # Prefer English label, fall back to Dutch label = wd_type.get('label_en') or wd_type.get('label_nl') if label: wikidata_types.append(label) # Extract founding date from Wikidata inception (P571) founding_year = None founding_decade = None inception = enrichment.get('wikidata_inception') if inception: # Handle both formats: # 1. Dict format: {'time': '+1959-00-00T00:00:00Z', ...} # 2. String format: '2001-01-01' or '+1959-00-00T00:00:00Z' if isinstance(inception, dict): time_str = inception.get('time', '') else: time_str = str(inception) if time_str: # Extract year from time string (e.g., +1959-00-00T00:00:00Z -> 1959) try: # Remove leading + and parse year year_part = time_str.lstrip('+').split('-')[0] if year_part.isdigit(): founding_year = int(year_part) # Only include reasonable years (after 1000 CE, before current year + 10) if 1000 <= founding_year <= 2035: founding_decade = (founding_year // 10) * 10 else: founding_year = None except (ValueError, IndexError): pass # Build result with base data result = { 'lat': lat, 'lon': lon, 'name': name, 'city': city, 'province': province, # Add province field 'type': inst_type, 'type_name': TYPE_NAMES.get(inst_type, 'Unknown'), 'color': TYPE_COLORS.get(inst_type, '#9e9e9e'), 'website': website, 'wikidata_id': wikidata_id, 'wikidata_types': wikidata_types, # Fine-grained Wikidata types (P31) 'description': description, # Keep full description } # Add founding date if available if founding_year: result['founding_year'] = founding_year result['founding_decade'] = founding_decade # Add Google Maps enrichment data if available if google_maps: # Rating and reviews count if google_maps.get('rating'): result['rating'] = google_maps['rating'] result['total_ratings'] = google_maps.get('total_ratings', 0) # Phone number if google_maps.get('phone_international'): result['phone'] = google_maps['phone_international'] elif google_maps.get('phone_local'): result['phone'] = google_maps['phone_local'] # Formatted address (more complete than city) if google_maps.get('formatted_address'): result['address'] = google_maps['formatted_address'] # Opening hours (weekday text is human readable) opening_hours = google_maps.get('opening_hours', {}) if opening_hours.get('weekday_text'): result['opening_hours'] = opening_hours['weekday_text'] result['open_now'] = opening_hours.get('open_now', None) # Reviews - keep all reviews with full text reviews = google_maps.get('reviews', []) if reviews: result['reviews'] = [ { 'author': r.get('author_name', 'Anonymous'), 'rating': r.get('rating', 0), 'text': r.get('text', ''), # Keep full text 'time': r.get('relative_time_description', '') } for r in reviews # Keep all reviews ] # Photos - keep all photos photos = google_maps.get('photos', []) photo_urls = google_maps.get('photo_urls', []) if photo_urls: # Direct URL format result['photos'] = [{'url': url, 'attribution': ''} for url in photo_urls] elif photos: # Object format with attribution result['photos'] = [ { 'url': p.get('url', ''), 'attribution': p.get('attributions', [''])[0] if p.get('attributions') else '' } for p in photos ] # Street View URL if google_maps.get('street_view_url'): result['street_view_url'] = google_maps['street_view_url'] # Business status if google_maps.get('business_status'): result['business_status'] = google_maps['business_status'] # Google Place ID for linking if google_maps.get('place_id'): result['google_place_id'] = google_maps['place_id'] # Add ISIL data from Nationaal Archief enrichment if nan_isil: result['isil'] = { 'code': nan_isil.get('isil_code', ''), 'name': nan_isil.get('nan_name', ''), 'city': nan_isil.get('nan_city', ''), 'assigned_date': nan_isil.get('nan_toegekend_op', ''), 'source': 'Nationaal Archief ISIL Registry', } # Add ISIL data from KB Netherlands Library Network (if no NA ISIL) elif kb_isil: result['isil'] = { 'code': kb_isil.get('isil_code', ''), 'name': kb_isil.get('name', ''), 'city': kb_isil.get('city', ''), 'assigned_date': kb_isil.get('extraction_date', ''), 'source': 'KB Netherlands Library Network', } # Add Museum Register data if museum_register: result['museum_register'] = { 'name': museum_register.get('museum_name', ''), 'province': museum_register.get('province', ''), 'registered_since': museum_register.get('registered_since', ''), 'website': museum_register.get('website_url', ''), } # Add ZCBS collection platform data if zcbs_data: result['zcbs'] = { 'id': zcbs_data.get('zcbs_id', ''), 'name': zcbs_data.get('zcbs_name', ''), 'platform_urls': zcbs_data.get('platform_urls', {}), 'match_score': zcbs_data.get('match_score', 0), } # Add GHCID (Global Heritage Custodian Identifier) if ghcid_data: result['ghcid'] = { 'current': ghcid_data.get('ghcid_current', ''), 'uuid': ghcid_data.get('ghcid_uuid', ''), 'numeric': ghcid_data.get('ghcid_numeric', ''), } # Add standardized identifiers if identifiers: result['identifiers'] = [ { 'scheme': id_entry.get('identifier_scheme', ''), 'value': id_entry.get('identifier_value', ''), 'url': id_entry.get('identifier_url', ''), } for id_entry in identifiers if id_entry.get('identifier_scheme') in ('ISIL', 'GHCID', 'Wikidata', 'VIAF', 'ZCBS') ] else: result['identifiers'] = [] # Add ZCBS to identifiers array if present if zcbs_data and zcbs_data.get('zcbs_id'): result['identifiers'].append({ 'scheme': 'ZCBS', 'value': str(zcbs_data.get('zcbs_id', '')), 'url': list(zcbs_data.get('platform_urls', {}).values())[0] if zcbs_data.get('platform_urls') else '', }) # Add web claims (social media, description from website) if web_claims_data and web_claims_data.get('claims'): web_claims_list = web_claims_data.get('claims', []) social_links = {} web_description = None for claim in web_claims_list: claim_type = claim.get('claim_type', '') claim_value = claim.get('claim_value', '') if claim_type == 'social_facebook': social_links['facebook'] = claim_value elif claim_type == 'social_instagram': social_links['instagram'] = claim_value elif claim_type == 'social_twitter': social_links['twitter'] = claim_value elif claim_type == 'social_linkedin': social_links['linkedin'] = claim_value elif claim_type == 'social_youtube': social_links['youtube'] = claim_value elif claim_type == 'description_short' and not web_description: web_description = claim_value if social_links: result['social_media'] = social_links if web_description and not result.get('description'): result['description'] = web_description # Add verified custodian name if available if custodian_name and custodian_name.get('claim_value'): result['verified_name'] = custodian_name.get('claim_value') result['name_source'] = custodian_name.get('extraction_method', 'unknown') # Add YouTube enrichment data if youtube_data and youtube_data.get('status') == 'SUCCESS': channel = youtube_data.get('channel', {}) videos = youtube_data.get('videos', []) youtube_result = { 'channel_id': channel.get('channel_id'), 'channel_url': channel.get('channel_url'), 'channel_title': channel.get('title'), 'channel_description': (channel.get('description') or '')[:500], # Truncate 'subscriber_count': channel.get('subscriber_count'), 'video_count': channel.get('video_count'), 'view_count': channel.get('view_count'), 'thumbnail_url': channel.get('thumbnail_url'), } # Add videos (limited to top 5 for JSON size) if videos: youtube_result['videos'] = [ { 'video_id': v.get('video_id'), 'video_url': v.get('video_url'), 'title': v.get('title'), 'description': (v.get('description') or '')[:200], # Truncate 'published_at': v.get('published_at'), 'duration': v.get('duration'), 'view_count': v.get('view_count'), 'like_count': v.get('like_count'), 'comment_count': v.get('comment_count'), 'thumbnail_url': v.get('thumbnail_url'), # Include top comments (max 3) 'comments': [ { 'author': c.get('author_display_name'), 'text': (c.get('text') or '')[:300], 'like_count': c.get('like_count'), } for c in (v.get('comments') or [])[:3] ], # Include transcript snippet if available 'has_transcript': bool(v.get('transcript')), 'transcript_snippet': (v.get('transcript', {}).get('transcript_text') or '')[:500] if v.get('transcript') else None, } for v in videos[:5] # Limit to 5 videos ] result['youtube'] = youtube_result # Add temporal extent (TimeSpan) - dissolution/closure dates if temporal_extent: timespan = {} # Dissolution/closure date if temporal_extent.get('dissolution_date'): timespan['dissolution_date'] = temporal_extent['dissolution_date'] if temporal_extent.get('dissolution_reason'): timespan['dissolution_reason'] = temporal_extent['dissolution_reason'] # Founding date (if in temporal_extent, not Wikidata inception) if temporal_extent.get('founding_date'): timespan['founding_date'] = temporal_extent['founding_date'] # End date (alternative to dissolution_date) if temporal_extent.get('end_date'): timespan['end_date'] = temporal_extent['end_date'] # Status indicators if temporal_extent.get('is_defunct'): timespan['is_defunct'] = temporal_extent['is_defunct'] if temporal_extent.get('is_operational') is not None: timespan['is_operational'] = temporal_extent['is_operational'] if timespan: result['temporal_extent'] = timespan # Add successor organization (for dissolved/merged entities) if successor_org: successor = {} if successor_org.get('name'): successor['name'] = successor_org['name'] if successor_org.get('wikidata_id'): successor['wikidata_id'] = successor_org['wikidata_id'] if successor_org.get('isil'): successor['isil'] = successor_org['isil'] if successor_org.get('website'): successor['website'] = successor_org['website'] if successor_org.get('relationship'): successor['relationship'] = successor_org['relationship'] if successor: result['successor_organization'] = successor # Add Genealogiewerkbalk enrichment (municipality/province archive information) if genealogiewerkbalk_data: genealogiewerkbalk = {} # Municipality info municipality = genealogiewerkbalk_data.get('municipality', {}) if municipality: genealogiewerkbalk['municipality'] = { 'name': municipality.get('name', ''), 'code': municipality.get('code', ''), } # Municipal archive info municipal_archive = genealogiewerkbalk_data.get('municipal_archive', {}) if municipal_archive: genealogiewerkbalk['municipal_archive'] = { 'name': municipal_archive.get('name', ''), 'website': municipal_archive.get('website', ''), 'isil': municipal_archive.get('isil', ''), } # Province info province = genealogiewerkbalk_data.get('province', {}) if province: genealogiewerkbalk['province'] = { 'name': province.get('name', ''), 'code': province.get('code', ''), } # Provincial archive info provincial_archive = genealogiewerkbalk_data.get('provincial_archive', {}) if provincial_archive: genealogiewerkbalk['provincial_archive'] = { 'name': provincial_archive.get('name', ''), 'website': provincial_archive.get('website', ''), } # Match metadata if genealogiewerkbalk_data.get('match_confidence'): genealogiewerkbalk['match_confidence'] = genealogiewerkbalk_data['match_confidence'] if genealogiewerkbalk_data.get('match_method'): genealogiewerkbalk['match_method'] = genealogiewerkbalk_data['match_method'] if genealogiewerkbalk: result['genealogiewerkbalk'] = genealogiewerkbalk return result def main(): """Main export function.""" # Paths enriched_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries' output_dir = project_root / 'frontend' / 'public' / 'data' output_file = output_dir / 'nde_institutions.json' metadata_file = output_dir / 'nde_metadata.json' # Create output directory if needed output_dir.mkdir(parents=True, exist_ok=True) print(f"Reading enriched entries from: {enriched_dir}") institutions = [] files_processed = 0 files_with_coords = 0 # Track enrichment source counts from raw YAML files enrichment_counts = { 'nde_register': 0, # Count entries from original NDE register (has 'organisatie' field) 'wikidata': 0, 'google_maps': 0, 'web': 0, 'youtube': 0, 'isil_na': 0, 'isil_kb': 0, 'zcbs': 0, 'museum_register': 0, 'osm': 0, 'genealogiewerkbalk': 0, } # Process all YAML files yaml_files = sorted(enriched_dir.glob('*.yaml')) for yaml_file in yaml_files: try: with open(yaml_file, 'r', encoding='utf-8') as f: entry_data = yaml.load(f, Loader=SafeLoader) files_processed += 1 # Track enrichment sources from raw YAML # Count entries from original NDE register (source_type == 'nde_csv_registry' in provenance) prov = entry_data.get('provenance', {}) sources = prov.get('sources', {}) original_sources = sources.get('original_entry', []) is_nde_csv = any( s.get('source_type') == 'nde_csv_registry' for s in original_sources if isinstance(s, dict) ) if is_nde_csv: enrichment_counts['nde_register'] += 1 if entry_data.get('wikidata_enrichment'): enrichment_counts['wikidata'] += 1 if entry_data.get('google_maps_enrichment'): enrichment_counts['google_maps'] += 1 if entry_data.get('web_enrichment'): enrichment_counts['web'] += 1 if entry_data.get('youtube_enrichment'): enrichment_counts['youtube'] += 1 if entry_data.get('nan_isil_enrichment'): enrichment_counts['isil_na'] += 1 if entry_data.get('kb_enrichment'): enrichment_counts['isil_kb'] += 1 if entry_data.get('museum_register_enrichment'): enrichment_counts['museum_register'] += 1 if entry_data.get('osm_enrichment'): enrichment_counts['osm'] += 1 if entry_data.get('zcbs_enrichment'): enrichment_counts['zcbs'] += 1 if entry_data.get('genealogiewerkbalk_enrichment'): enrichment_counts['genealogiewerkbalk'] += 1 # Extract institution data inst_data = extract_institution_data(entry_data) if inst_data: institutions.append(inst_data) files_with_coords += 1 except Exception as e: print(f"Warning: Error processing {yaml_file.name}: {e}") continue # Sort by name institutions.sort(key=lambda x: x['name'].lower()) # Write JSON with open(output_file, 'w', encoding='utf-8') as f: json.dump(institutions, f, ensure_ascii=False, indent=2) # Create metadata with enrichment source stats metadata = { 'generated_at': datetime.now(timezone.utc).isoformat(), 'total_entries': files_processed, 'total_with_coordinates': files_with_coords, 'enrichment_sources': { 'nde_register': { 'name': 'NDE Register Nederland', 'name_nl': 'NDE Register Nederland', 'count': 1351, # From backup: voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.backup.20251117_122408.yaml 'description': 'Base registry data', 'description_nl': 'Basisgegevens uit register', }, 'wikidata': { 'name': 'Wikidata', 'name_nl': 'Wikidata', 'count': enrichment_counts['wikidata'], 'description': 'Linked open data enrichment', 'description_nl': 'Linked open data verrijking', }, 'google_maps': { 'name': 'Google Maps', 'name_nl': 'Google Maps', 'count': enrichment_counts['google_maps'], 'description': 'Ratings, reviews, photos, opening hours', 'description_nl': 'Beoordelingen, reviews, foto\'s, openingstijden', }, 'web': { 'name': 'Website Scraping', 'name_nl': 'Website Scraping', 'count': enrichment_counts['web'], 'description': 'Social media links, descriptions from official websites', 'description_nl': 'Social media links, beschrijvingen van officiële websites', }, 'youtube': { 'name': 'YouTube', 'name_nl': 'YouTube', 'count': enrichment_counts['youtube'], 'description': 'Channel info, videos, comments, transcripts', 'description_nl': 'Kanaalinfo, video\'s, reacties, transcripties', }, 'isil_na': { 'name': 'ISIL Registry (Nationaal Archief)', 'name_nl': 'ISIL Register (Nationaal Archief)', 'count': enrichment_counts['isil_na'], 'description': 'Official ISIL codes from Dutch National Archives', 'description_nl': 'Officiële ISIL-codes van het Nationaal Archief', }, 'isil_kb': { 'name': 'ISIL Registry (KB Netherlands)', 'name_nl': 'ISIL Register (KB Nederland)', 'count': enrichment_counts['isil_kb'], 'description': 'ISIL codes from KB Netherlands Library Network', 'description_nl': 'ISIL-codes van het KB Bibliotheeknetwerk', }, 'museum_register': { 'name': 'Museumregister Nederland', 'name_nl': 'Museumregister Nederland', 'count': enrichment_counts['museum_register'], 'description': 'Official museum registration', 'description_nl': 'Officiële museumregistratie', }, 'zcbs': { 'name': 'ZCBS Collection Platforms', 'name_nl': 'ZCBS Collectieplatforms', 'count': enrichment_counts['zcbs'], 'description': 'Collection management systems (ZCBS network)', 'description_nl': 'Collectiebeheersystemen (ZCBS netwerk)', }, 'genealogiewerkbalk': { 'name': 'Genealogiewerkbalk', 'name_nl': 'Genealogiewerkbalk', 'count': enrichment_counts['genealogiewerkbalk'], 'description': 'Municipality and province archive registry data', 'description_nl': 'Gemeente- en provinciearchief registergegevens', }, }, } # Write metadata JSON with open(metadata_file, 'w', encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False, indent=2) print(f"\n✅ Export complete!") print(f" Files processed: {files_processed}") print(f" Institutions with coordinates: {files_with_coords}") print(f" Output file: {output_file}") # Print type distribution type_counts = {} for inst in institutions: t = inst['type'] type_counts[t] = type_counts.get(t, 0) + 1 print(f"\n📊 Distribution by type:") for t, count in sorted(type_counts.items(), key=lambda x: -x[1]): print(f" {TYPE_NAMES.get(t, t)}: {count}") # Print Google Maps enrichment stats with_rating = sum(1 for i in institutions if i.get('rating')) with_photos = sum(1 for i in institutions if i.get('photos')) with_reviews = sum(1 for i in institutions if i.get('reviews')) with_hours = sum(1 for i in institutions if i.get('opening_hours')) with_street_view = sum(1 for i in institutions if i.get('street_view_url')) print(f"\n🗺️ Google Maps enrichment coverage:") print(f" With ratings: {with_rating} ({with_rating*100/len(institutions):.1f}%)") print(f" With photos: {with_photos} ({with_photos*100/len(institutions):.1f}%)") print(f" With reviews: {with_reviews} ({with_reviews*100/len(institutions):.1f}%)") print(f" With opening hours: {with_hours} ({with_hours*100/len(institutions):.1f}%)") print(f" With Street View: {with_street_view} ({with_street_view*100/len(institutions):.1f}%)") # Print new enrichment sources stats with_isil = sum(1 for i in institutions if i.get('isil')) with_isil_na = sum(1 for i in institutions if i.get('isil', {}).get('source') == 'Nationaal Archief ISIL Registry') with_isil_kb = sum(1 for i in institutions if i.get('isil', {}).get('source') == 'KB Netherlands Library Network') with_museum_reg = sum(1 for i in institutions if i.get('museum_register')) with_ghcid = sum(1 for i in institutions if i.get('ghcid')) with_social = sum(1 for i in institutions if i.get('social_media')) with_verified_name = sum(1 for i in institutions if i.get('verified_name')) print(f"\n📋 New enrichment coverage:") print(f" With ISIL code (total): {with_isil} ({with_isil*100/len(institutions):.1f}%)") print(f" - Nationaal Archief: {with_isil_na}") print(f" - KB Netherlands: {with_isil_kb}") print(f" With Museum Register: {with_museum_reg} ({with_museum_reg*100/len(institutions):.1f}%)") print(f" With GHCID: {with_ghcid} ({with_ghcid*100/len(institutions):.1f}%)") print(f" With social media: {with_social} ({with_social*100/len(institutions):.1f}%)") print(f" With verified name: {with_verified_name} ({with_verified_name*100/len(institutions):.1f}%)") # Print YouTube enrichment stats with_youtube = sum(1 for i in institutions if i.get('youtube')) with_youtube_videos = sum(1 for i in institutions if i.get('youtube', {}).get('videos')) total_videos = sum(len(i.get('youtube', {}).get('videos', [])) for i in institutions) print(f"\n🎬 YouTube enrichment coverage:") print(f" With YouTube channel: {with_youtube} ({with_youtube*100/len(institutions):.1f}%)") print(f" With videos: {with_youtube_videos} ({with_youtube_videos*100/len(institutions):.1f}%)") print(f" Total videos indexed: {total_videos}") # Print founding date stats with_founding = sum(1 for i in institutions if i.get('founding_year')) founding_decades = {} for i in institutions: decade = i.get('founding_decade') if decade: founding_decades[decade] = founding_decades.get(decade, 0) + 1 print(f"\n📅 Founding date coverage:") print(f" With founding year: {with_founding} ({with_founding*100/len(institutions):.1f}%)") if founding_decades: sorted_decades = sorted(founding_decades.items()) earliest = sorted_decades[0] latest = sorted_decades[-1] print(f" Earliest decade: {earliest[0]}s ({earliest[1]} institutions)") print(f" Latest decade: {latest[0]}s ({latest[1]} institutions)") # Print temporal extent stats (dissolution, defunct status) with_temporal = sum(1 for i in institutions if i.get('temporal_extent')) with_dissolution = sum(1 for i in institutions if i.get('temporal_extent', {}).get('dissolution_date')) with_successor = sum(1 for i in institutions if i.get('successor_organization')) defunct_count = sum(1 for i in institutions if i.get('temporal_extent', {}).get('is_defunct')) if with_temporal > 0: print(f"\n⏳ Temporal extent coverage:") print(f" With temporal data: {with_temporal} ({with_temporal*100/len(institutions):.1f}%)") print(f" With dissolution date: {with_dissolution}") print(f" With successor org: {with_successor}") print(f" Marked defunct: {defunct_count}") # Print Genealogiewerkbalk enrichment stats with_genealogiewerkbalk = sum(1 for i in institutions if i.get('genealogiewerkbalk')) with_municipal_archive = sum(1 for i in institutions if i.get('genealogiewerkbalk', {}).get('municipal_archive')) with_provincial_archive = sum(1 for i in institutions if i.get('genealogiewerkbalk', {}).get('provincial_archive')) print(f"\n📚 Genealogiewerkbalk enrichment coverage:") print(f" With genealogiewerkbalk data: {with_genealogiewerkbalk} ({with_genealogiewerkbalk*100/len(institutions):.1f}%)") print(f" With municipal archive: {with_municipal_archive}") print(f" With provincial archive: {with_provincial_archive}") if __name__ == '__main__': main()