glam/scripts/export_nde_map_json.py

#!/usr/bin/env python3
"""
Export NDE Enriched Institutions to JSON for Frontend Map

Reads the enriched YAML files and produces a lightweight JSON file
suitable for the React/Leaflet map component.

Now includes Google Maps enrichment data (ratings, photos, reviews, opening hours).
"""

import json
from pathlib import Path
from datetime import datetime, timezone
import sys

# Add project root to path for imports
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    import yaml
    # Use C-based loader for faster parsing (10x faster)
    try:
        from yaml import CSafeLoader as SafeLoader
    except ImportError:
        from yaml import SafeLoader
except ImportError:
    print("Error: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# Institution type mappings
TYPE_COLORS = {
    'G': '#00bcd4',   # Gallery - cyan
    'L': '#2ecc71',   # Library - green
    'A': '#3498db',   # Archive - blue
    'M': '#e74c3c',   # Museum - red
    'O': '#f39c12',   # Official - orange
    'R': '#1abc9c',   # Research - teal
    'C': '#795548',   # Corporation - brown
    'U': '#9e9e9e',   # Unknown - gray
    'B': '#4caf50',   # Botanical - green
    'E': '#ff9800',   # Education - amber
    'S': '#9b59b6',   # Society - purple
    'F': '#95a5a6',   # Features - gray
    'I': '#673ab7',   # Intangible - deep purple
    'X': '#607d8b',   # Mixed - blue gray
    'P': '#ff5722',   # Personal - deep orange
    'H': '#607d8b',   # Holy sites - blue gray
    'D': '#34495e',   # Digital - dark gray
    'N': '#e91e63',   # NGO - pink
    'T': '#ff5722',   # Taste/smell - deep orange
}

TYPE_NAMES = {
    'G': 'Gallery',
    'L': 'Library',
    'A': 'Archive',
    'M': 'Museum',
    'O': 'Official',
    'R': 'Research',
    'C': 'Corporation',
    'U': 'Unknown',
    'B': 'Botanical',
    'E': 'Education',
    'S': 'Society',
    'F': 'Features',
    'I': 'Intangible',
    'X': 'Mixed',
    'P': 'Personal',
    'H': 'Holy sites',
    'D': 'Digital',
    'N': 'NGO',
    'T': 'Taste/smell',
}


def extract_institution_data(entry_data: dict) -> dict | None:
    """Extract the relevant data for the map from an enriched entry."""

    # Skip duplicates - they should not be visible in the export
    duplicate_status = entry_data.get('duplicate_status', {})
    if duplicate_status.get('is_duplicate'):
        return None

    # Get original entry data
    original = entry_data.get('original_entry', {})
    enrichment = entry_data.get('wikidata_enrichment', {})
    google_maps = entry_data.get('google_maps_enrichment', {})
    exa_data = entry_data.get('exa_enrichment', {})

    # New enrichment sources
    nan_isil = entry_data.get('nan_isil_enrichment', {})
    kb_isil = entry_data.get('kb_enrichment', {})
    zcbs_data = entry_data.get('zcbs_enrichment', {})
    museum_register = entry_data.get('museum_register_enrichment', {})
    web_claims_data = entry_data.get('web_claims', {})
    ghcid_data = entry_data.get('ghcid', {})
    identifiers = entry_data.get('identifiers', [])
    custodian_name = entry_data.get('custodian_name', {})
    youtube_data = entry_data.get('youtube_enrichment', {})
    temporal_extent = entry_data.get('temporal_extent', {})
    successor_org = entry_data.get('successor_organization', {})
    genealogiewerkbalk_data = entry_data.get('genealogiewerkbalk_enrichment', {})

    # Get coordinates - priority order:
    # 1. Google Maps (most precise)
    # 2. Wikidata coordinates
    # 3. locations array (for KIEN intangible heritage entries)
    # 4. location object (singular)
    lat, lon = None, None

    # Try Google Maps coordinates first
    google_coords = google_maps.get('coordinates', {})
    if google_coords.get('latitude') and google_coords.get('longitude'):
        lat = google_coords['latitude']
        lon = google_coords['longitude']

    # Fall back to Wikidata coordinates
    if not lat or not lon:
        wd_coords = enrichment.get('wikidata_coordinates', {})
        if wd_coords.get('latitude') and wd_coords.get('longitude'):
            lat = wd_coords['latitude']
            lon = wd_coords['longitude']

    # Fall back to locations array (KIEN intangible heritage entries)
    if not lat or not lon:
        locations_list = entry_data.get('locations', [])
        if locations_list and isinstance(locations_list, list) and len(locations_list) > 0:
            first_loc = locations_list[0]
            if first_loc.get('latitude') and first_loc.get('longitude'):
                lat = first_loc['latitude']
                lon = first_loc['longitude']

    # Fall back to singular location object
    if not lat or not lon:
        location_obj = entry_data.get('location', {})
        if location_obj.get('latitude') and location_obj.get('longitude'):
            lat = location_obj['latitude']
            lon = location_obj['longitude']

    # Skip if no coordinates from any source
    if not lat or not lon:
        return None

    # Get institution type (first one if list)
    types = original.get('type', [])
    inst_type = types[0] if types else 'U'

    # Get name - priority order:
    # 1. Verified custodian_name
    # 2. Wikidata Dutch label
    # 3. KIEN name
    # 4. Original CSV name
    custodian_name_data = entry_data.get('custodian_name', {})
    kien_data = entry_data.get('kien_enrichment', {})

    name = (
        custodian_name_data.get('claim_value') or
        enrichment.get('wikidata_label_nl') or
        kien_data.get('kien_name') or
        original.get('organisatie') or
        'Unknown Institution'
    )

    # Get city - priority: original CSV, then locations array, then location object
    city = original.get('plaatsnaam_bezoekadres', '')

    # Fall back to locations array (KIEN entries)
    if not city:
        locations_list = entry_data.get('locations', [])
        if locations_list and isinstance(locations_list, list) and len(locations_list) > 0:
            city = locations_list[0].get('city', '')

    # Fall back to singular location object
    if not city:
        location_obj = entry_data.get('location', {})
        city = location_obj.get('city', '')

    # Get province from Google Maps address components (administrative_area_level_1)
    province = None
    address_components = google_maps.get('address_components', [])
    for component in address_components:
        component_types = component.get('types', [])
        if 'administrative_area_level_1' in component_types:
            province = component.get('long_name')
            break

    # Fall back to Wikidata located_in if no Google Maps province
    if not province:
        located_in = enrichment.get('wikidata_located_in', {})
        if located_in:
            # Check if it's a municipality in a known province
            desc = located_in.get('description_nl', '')
            # Extract province from description like "gemeente in Drenthe, Nederland"
            if 'gemeente in ' in desc:
                parts = desc.split('gemeente in ')
                if len(parts) > 1:
                    province_part = parts[1].split(',')[0].strip()
                    if province_part and province_part != 'Nederland':
                        province = province_part

    # Get description - prefer Dutch, fall back to English, then Exa, then Google editorial
    # Handle various types safely
    description = ''
    if enrichment.get('wikidata_description_nl'):
        description = enrichment['wikidata_description_nl']
    elif enrichment.get('wikidata_description_en'):
        description = enrichment['wikidata_description_en']
    elif exa_data.get('description'):
        description = exa_data['description']
    else:
        editorial = google_maps.get('editorial_summary')
        if editorial and isinstance(editorial, dict):
            description = editorial.get('text', '')
        elif isinstance(editorial, str):
            description = editorial

    # Ensure description is a string
    if not isinstance(description, str):
        description = ''

    # Get website - prefer Google Maps (more current), fall back to Wikidata
    website = (
        google_maps.get('website') or
        enrichment.get('wikidata_official_website') or
        original.get('webadres_organisatie') or
        ''
    )

    # Get Wikidata ID
    wikidata_id = enrichment.get('wikidata_entity_id', '')

    # Get Wikidata instance_of types (P31)
    # This gives us fine-grained types like "museum", "historical society", "regional archive"
    wikidata_types = []
    instance_of_list = enrichment.get('wikidata_instance_of', [])
    for wd_type in instance_of_list:
        # Prefer English label, fall back to Dutch
        label = wd_type.get('label_en') or wd_type.get('label_nl')
        if label:
            wikidata_types.append(label)

    # Extract founding date from Wikidata inception (P571)
    founding_year = None
    founding_decade = None
    inception = enrichment.get('wikidata_inception')
    if inception:
        # Handle both formats:
        # 1. Dict format: {'time': '+1959-00-00T00:00:00Z', ...}
        # 2. String format: '2001-01-01' or '+1959-00-00T00:00:00Z'
        if isinstance(inception, dict):
            time_str = inception.get('time', '')
        else:
            time_str = str(inception)

        if time_str:
            # Extract year from time string (e.g., +1959-00-00T00:00:00Z -> 1959)
            try:
                # Remove leading + and parse year
                year_part = time_str.lstrip('+').split('-')[0]
                if year_part.isdigit():
                    founding_year = int(year_part)
                    # Only include reasonable years (after 1000 CE, before current year + 10)
                    if 1000 <= founding_year <= 2035:
                        founding_decade = (founding_year // 10) * 10
                    else:
                        founding_year = None
            except (ValueError, IndexError):
                pass

    # Build result with base data
    result = {
        'lat': lat,
        'lon': lon,
        'name': name,
        'city': city,
        'province': province,  # Add province field
        'type': inst_type,
        'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
        'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
        'website': website,
        'wikidata_id': wikidata_id,
        'wikidata_types': wikidata_types,  # Fine-grained Wikidata types (P31)
        'description': description,  # Keep full description
    }

    # Add founding date if available
    if founding_year:
        result['founding_year'] = founding_year
        result['founding_decade'] = founding_decade

    # Add Google Maps enrichment data if available
    if google_maps:
        # Rating and reviews count
        if google_maps.get('rating'):
            result['rating'] = google_maps['rating']
            result['total_ratings'] = google_maps.get('total_ratings', 0)

        # Phone number
        if google_maps.get('phone_international'):
            result['phone'] = google_maps['phone_international']
        elif google_maps.get('phone_local'):
            result['phone'] = google_maps['phone_local']

        # Formatted address (more complete than city)
        if google_maps.get('formatted_address'):
            result['address'] = google_maps['formatted_address']

        # Opening hours (weekday text is human readable)
        opening_hours = google_maps.get('opening_hours', {})
        if opening_hours.get('weekday_text'):
            result['opening_hours'] = opening_hours['weekday_text']
            result['open_now'] = opening_hours.get('open_now', None)

        # Reviews - keep all reviews with full text
        reviews = google_maps.get('reviews', [])
        if reviews:
            result['reviews'] = [
                {
                    'author': r.get('author_name', 'Anonymous'),
                    'rating': r.get('rating', 0),
                    'text': r.get('text', ''),  # Keep full text
                    'time': r.get('relative_time_description', '')
                }
                for r in reviews  # Keep all reviews
            ]

        # Photos - keep all photos
        photos = google_maps.get('photos', [])
        photo_urls = google_maps.get('photo_urls', [])

        if photo_urls:
            # Direct URL format
            result['photos'] = [{'url': url, 'attribution': ''} for url in photo_urls]
        elif photos:
            # Object format with attribution
            result['photos'] = [
                {
                    'url': p.get('url', ''),
                    'attribution': p.get('attributions', [''])[0] if p.get('attributions') else ''
                }
                for p in photos
            ]

        # Street View URL
        if google_maps.get('street_view_url'):
            result['street_view_url'] = google_maps['street_view_url']

        # Business status
        if google_maps.get('business_status'):
            result['business_status'] = google_maps['business_status']

        # Google Place ID for linking
        if google_maps.get('place_id'):
            result['google_place_id'] = google_maps['place_id']

    # Add ISIL data from Nationaal Archief enrichment
    if nan_isil:
        result['isil'] = {
            'code': nan_isil.get('isil_code', ''),
            'name': nan_isil.get('nan_name', ''),
            'city': nan_isil.get('nan_city', ''),
            'assigned_date': nan_isil.get('nan_toegekend_op', ''),
            'source': 'Nationaal Archief ISIL Registry',
        }
    # Add ISIL data from KB Netherlands Library Network (if no NA ISIL)
    elif kb_isil:
        result['isil'] = {
            'code': kb_isil.get('isil_code', ''),
            'name': kb_isil.get('name', ''),
            'city': kb_isil.get('city', ''),
            'assigned_date': kb_isil.get('extraction_date', ''),
            'source': 'KB Netherlands Library Network',
        }

    # Add Museum Register data
    if museum_register:
        result['museum_register'] = {
            'name': museum_register.get('museum_name', ''),
            'province': museum_register.get('province', ''),
            'registered_since': museum_register.get('registered_since', ''),
            'website': museum_register.get('website_url', ''),
        }

    # Add ZCBS collection platform data
    if zcbs_data:
        result['zcbs'] = {
            'id': zcbs_data.get('zcbs_id', ''),
            'name': zcbs_data.get('zcbs_name', ''),
            'platform_urls': zcbs_data.get('platform_urls', {}),
            'match_score': zcbs_data.get('match_score', 0),
        }

    # Add GHCID (Global Heritage Custodian Identifier)
    if ghcid_data:
        result['ghcid'] = {
            'current': ghcid_data.get('ghcid_current', ''),
            'uuid': ghcid_data.get('ghcid_uuid', ''),
            'numeric': ghcid_data.get('ghcid_numeric', ''),
        }

    # Add standardized identifiers
    if identifiers:
        result['identifiers'] = [
            {
                'scheme': id_entry.get('identifier_scheme', ''),
                'value': id_entry.get('identifier_value', ''),
                'url': id_entry.get('identifier_url', ''),
            }
            for id_entry in identifiers
            if id_entry.get('identifier_scheme') in ('ISIL', 'GHCID', 'Wikidata', 'VIAF', 'ZCBS')
        ]
    else:
        result['identifiers'] = []

    # Add ZCBS to identifiers array if present
    if zcbs_data and zcbs_data.get('zcbs_id'):
        result['identifiers'].append({
            'scheme': 'ZCBS',
            'value': str(zcbs_data.get('zcbs_id', '')),
            'url': list(zcbs_data.get('platform_urls', {}).values())[0] if zcbs_data.get('platform_urls') else '',
        })

    # Add web claims (social media, description from website)
    if web_claims_data and web_claims_data.get('claims'):
        web_claims_list = web_claims_data.get('claims', [])
        social_links = {}
        web_description = None

        for claim in web_claims_list:
            claim_type = claim.get('claim_type', '')
            claim_value = claim.get('claim_value', '')

            if claim_type == 'social_facebook':
                social_links['facebook'] = claim_value
            elif claim_type == 'social_instagram':
                social_links['instagram'] = claim_value
            elif claim_type == 'social_twitter':
                social_links['twitter'] = claim_value
            elif claim_type == 'social_linkedin':
                social_links['linkedin'] = claim_value
            elif claim_type == 'social_youtube':
                social_links['youtube'] = claim_value
            elif claim_type == 'description_short' and not web_description:
                web_description = claim_value

        if social_links:
            result['social_media'] = social_links

        if web_description and not result.get('description'):
            result['description'] = web_description

    # Add verified custodian name if available
    if custodian_name and custodian_name.get('claim_value'):
        result['verified_name'] = custodian_name.get('claim_value')
        result['name_source'] = custodian_name.get('extraction_method', 'unknown')

    # Add YouTube enrichment data
    if youtube_data and youtube_data.get('status') == 'SUCCESS':
        channel = youtube_data.get('channel', {})
        videos = youtube_data.get('videos', [])

        youtube_result = {
            'channel_id': channel.get('channel_id'),
            'channel_url': channel.get('channel_url'),
            'channel_title': channel.get('title'),
            'channel_description': (channel.get('description') or '')[:500],  # Truncate
            'subscriber_count': channel.get('subscriber_count'),
            'video_count': channel.get('video_count'),
            'view_count': channel.get('view_count'),
            'thumbnail_url': channel.get('thumbnail_url'),
        }

        # Add videos (limited to top 5 for JSON size)
        if videos:
            youtube_result['videos'] = [
                {
                    'video_id': v.get('video_id'),
                    'video_url': v.get('video_url'),
                    'title': v.get('title'),
                    'description': (v.get('description') or '')[:200],  # Truncate
                    'published_at': v.get('published_at'),
                    'duration': v.get('duration'),
                    'view_count': v.get('view_count'),
                    'like_count': v.get('like_count'),
                    'comment_count': v.get('comment_count'),
                    'thumbnail_url': v.get('thumbnail_url'),
                    # Include top comments (max 3)
                    'comments': [
                        {
                            'author': c.get('author_display_name'),
                            'text': (c.get('text') or '')[:300],
                            'like_count': c.get('like_count'),
                        }
                        for c in (v.get('comments') or [])[:3]
                    ],
                    # Include transcript snippet if available
                    'has_transcript': bool(v.get('transcript')),
                    'transcript_snippet': (v.get('transcript', {}).get('transcript_text') or '')[:500] if v.get('transcript') else None,
                }
                for v in videos[:5]  # Limit to 5 videos
            ]

        result['youtube'] = youtube_result

    # Add temporal extent (TimeSpan) - dissolution/closure dates
    if temporal_extent:
        timespan = {}

        # Dissolution/closure date
        if temporal_extent.get('dissolution_date'):
            timespan['dissolution_date'] = temporal_extent['dissolution_date']
        if temporal_extent.get('dissolution_reason'):
            timespan['dissolution_reason'] = temporal_extent['dissolution_reason']

        # Founding date (if in temporal_extent, not Wikidata inception)
        if temporal_extent.get('founding_date'):
            timespan['founding_date'] = temporal_extent['founding_date']

        # End date (alternative to dissolution_date)
        if temporal_extent.get('end_date'):
            timespan['end_date'] = temporal_extent['end_date']

        # Status indicators
        if temporal_extent.get('is_defunct'):
            timespan['is_defunct'] = temporal_extent['is_defunct']
        if temporal_extent.get('is_operational') is not None:
            timespan['is_operational'] = temporal_extent['is_operational']

        if timespan:
            result['temporal_extent'] = timespan

    # Add successor organization (for dissolved/merged entities)
    if successor_org:
        successor = {}

        if successor_org.get('name'):
            successor['name'] = successor_org['name']
        if successor_org.get('wikidata_id'):
            successor['wikidata_id'] = successor_org['wikidata_id']
        if successor_org.get('isil'):
            successor['isil'] = successor_org['isil']
        if successor_org.get('website'):
            successor['website'] = successor_org['website']
        if successor_org.get('relationship'):
            successor['relationship'] = successor_org['relationship']

        if successor:
            result['successor_organization'] = successor

    # Add Genealogiewerkbalk enrichment (municipality/province archive information)
    if genealogiewerkbalk_data:
        genealogiewerkbalk = {}

        # Municipality info
        municipality = genealogiewerkbalk_data.get('municipality', {})
        if municipality:
            genealogiewerkbalk['municipality'] = {
                'name': municipality.get('name', ''),
                'code': municipality.get('code', ''),
            }

        # Municipal archive info
        municipal_archive = genealogiewerkbalk_data.get('municipal_archive', {})
        if municipal_archive:
            genealogiewerkbalk['municipal_archive'] = {
                'name': municipal_archive.get('name', ''),
                'website': municipal_archive.get('website', ''),
                'isil': municipal_archive.get('isil', ''),
            }

        # Province info
        province = genealogiewerkbalk_data.get('province', {})
        if province:
            genealogiewerkbalk['province'] = {
                'name': province.get('name', ''),
                'code': province.get('code', ''),
            }

        # Provincial archive info
        provincial_archive = genealogiewerkbalk_data.get('provincial_archive', {})
        if provincial_archive:
            genealogiewerkbalk['provincial_archive'] = {
                'name': provincial_archive.get('name', ''),
                'website': provincial_archive.get('website', ''),
            }

        # Match metadata
        if genealogiewerkbalk_data.get('match_confidence'):
            genealogiewerkbalk['match_confidence'] = genealogiewerkbalk_data['match_confidence']
        if genealogiewerkbalk_data.get('match_method'):
            genealogiewerkbalk['match_method'] = genealogiewerkbalk_data['match_method']

        if genealogiewerkbalk:
            result['genealogiewerkbalk'] = genealogiewerkbalk

    return result


def main():
    """Main export function."""

    # Paths
    enriched_dir = project_root / 'data' / 'nde' / 'enriched' / 'entries'
    output_dir = project_root / 'frontend' / 'public' / 'data'
    output_file = output_dir / 'nde_institutions.json'
    metadata_file = output_dir / 'nde_metadata.json'

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Reading enriched entries from: {enriched_dir}")

    institutions = []
    files_processed = 0
    files_with_coords = 0

    # Track enrichment source counts from raw YAML files
    enrichment_counts = {
        'nde_register': 0,  # Count entries from original NDE register (has 'organisatie' field)
        'wikidata': 0,
        'google_maps': 0,
        'web': 0,
        'youtube': 0,
        'isil_na': 0,
        'isil_kb': 0,
        'zcbs': 0,
        'museum_register': 0,
        'osm': 0,
        'genealogiewerkbalk': 0,
        'kien': 0,  # KIEN intangible heritage registry
    }

    # Process all YAML files
    yaml_files = sorted(enriched_dir.glob('*.yaml'))

    for yaml_file in yaml_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry_data = yaml.load(f, Loader=SafeLoader)

            files_processed += 1

            # Track enrichment sources from raw YAML
            # Count entries from original NDE register (source_type == 'nde_csv_registry' in provenance)
            prov = entry_data.get('provenance', {})
            sources = prov.get('sources', {})
            original_sources = sources.get('original_entry', [])
            is_nde_csv = any(
                s.get('source_type') == 'nde_csv_registry'
                for s in original_sources if isinstance(s, dict)
            )
            if is_nde_csv:
                enrichment_counts['nde_register'] += 1
            if entry_data.get('wikidata_enrichment'):
                enrichment_counts['wikidata'] += 1
            if entry_data.get('google_maps_enrichment'):
                enrichment_counts['google_maps'] += 1
            if entry_data.get('web_enrichment'):
                enrichment_counts['web'] += 1
            if entry_data.get('youtube_enrichment'):
                enrichment_counts['youtube'] += 1
            if entry_data.get('nan_isil_enrichment'):
                enrichment_counts['isil_na'] += 1
            if entry_data.get('kb_enrichment'):
                enrichment_counts['isil_kb'] += 1
            if entry_data.get('museum_register_enrichment'):
                enrichment_counts['museum_register'] += 1
            if entry_data.get('osm_enrichment'):
                enrichment_counts['osm'] += 1
            if entry_data.get('zcbs_enrichment'):
                enrichment_counts['zcbs'] += 1
            if entry_data.get('genealogiewerkbalk_enrichment'):
                enrichment_counts['genealogiewerkbalk'] += 1
            if entry_data.get('kien_enrichment'):
                enrichment_counts['kien'] += 1

            # Extract institution data
            inst_data = extract_institution_data(entry_data)
            if inst_data:
                institutions.append(inst_data)
                files_with_coords += 1

        except Exception as e:
            print(f"Warning: Error processing {yaml_file.name}: {e}")
            continue

    # Sort by name
    institutions.sort(key=lambda x: x['name'].lower())

    # Write JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    # Create metadata with enrichment source stats
    metadata = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'total_entries': files_processed,
        'total_with_coordinates': files_with_coords,
        'enrichment_sources': {
            'nde_register': {
                'name': 'NDE Register Nederland',
                'name_nl': 'NDE Register Nederland',
                'count': 1351,  # From backup: voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.backup.20251117_122408.yaml
                'description': 'Base registry data',
                'description_nl': 'Basisgegevens uit register',
            },
            'wikidata': {
                'name': 'Wikidata',
                'name_nl': 'Wikidata',
                'count': enrichment_counts['wikidata'],
                'description': 'Linked open data enrichment',
                'description_nl': 'Linked open data verrijking',
            },
            'google_maps': {
                'name': 'Google Maps',
                'name_nl': 'Google Maps',
                'count': enrichment_counts['google_maps'],
                'description': 'Ratings, reviews, photos, opening hours',
                'description_nl': 'Beoordelingen, reviews, foto\'s, openingstijden',
            },
            'web': {
                'name': 'Website Scraping',
                'name_nl': 'Website Scraping',
                'count': enrichment_counts['web'],
                'description': 'Social media links, descriptions from official websites',
                'description_nl': 'Social media links, beschrijvingen van officiële websites',
            },
            'youtube': {
                'name': 'YouTube',
                'name_nl': 'YouTube',
                'count': enrichment_counts['youtube'],
                'description': 'Channel info, videos, comments, transcripts',
                'description_nl': 'Kanaalinfo, video\'s, reacties, transcripties',
            },
            'isil_na': {
                'name': 'ISIL Registry (Nationaal Archief)',
                'name_nl': 'ISIL Register (Nationaal Archief)',
                'count': enrichment_counts['isil_na'],
                'description': 'Official ISIL codes from Dutch National Archives',
                'description_nl': 'Officiële ISIL-codes van het Nationaal Archief',
            },
            'isil_kb': {
                'name': 'ISIL Registry (KB Netherlands)',
                'name_nl': 'ISIL Register (KB Nederland)',
                'count': enrichment_counts['isil_kb'],
                'description': 'ISIL codes from KB Netherlands Library Network',
                'description_nl': 'ISIL-codes van het KB Bibliotheeknetwerk',
            },
            'museum_register': {
                'name': 'Museumregister Nederland',
                'name_nl': 'Museumregister Nederland',
                'count': enrichment_counts['museum_register'],
                'description': 'Official museum registration',
                'description_nl': 'Officiële museumregistratie',
            },
            'zcbs': {
                'name': 'ZCBS Collection Platforms',
                'name_nl': 'ZCBS Collectieplatforms',
                'count': enrichment_counts['zcbs'],
                'description': 'Collection management systems (ZCBS network)',
                'description_nl': 'Collectiebeheersystemen (ZCBS netwerk)',
            },
            'genealogiewerkbalk': {
                'name': 'Genealogiewerkbalk',
                'name_nl': 'Genealogiewerkbalk',
                'count': enrichment_counts['genealogiewerkbalk'],
                'description': 'Municipality and province archive registry data',
                'description_nl': 'Gemeente- en provinciearchief registergegevens',
            },
            'kien': {
                'name': 'KIEN Intangible Heritage',
                'name_nl': 'KIEN Immaterieel Erfgoed',
                'count': enrichment_counts['kien'],
                'description': 'Intangible heritage custodians from immaterieelerfgoed.nl',
                'description_nl': 'Immaterieel erfgoed beheerders van immaterieelerfgoed.nl',
                'url': 'https://www.immaterieelerfgoed.nl/',
            },
        },
    }

    # Write metadata JSON
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Export complete!")
    print(f"   Files processed: {files_processed}")
    print(f"   Institutions with coordinates: {files_with_coords}")
    print(f"   Output file: {output_file}")

    # Print type distribution
    type_counts = {}
    for inst in institutions:
        t = inst['type']
        type_counts[t] = type_counts.get(t, 0) + 1

    print(f"\n📊 Distribution by type:")
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"   {TYPE_NAMES.get(t, t)}: {count}")

    # Print Google Maps enrichment stats
    with_rating = sum(1 for i in institutions if i.get('rating'))
    with_photos = sum(1 for i in institutions if i.get('photos'))
    with_reviews = sum(1 for i in institutions if i.get('reviews'))
    with_hours = sum(1 for i in institutions if i.get('opening_hours'))
    with_street_view = sum(1 for i in institutions if i.get('street_view_url'))

    print(f"\n🗺️  Google Maps enrichment coverage:")
    print(f"   With ratings: {with_rating} ({with_rating*100/len(institutions):.1f}%)")
    print(f"   With photos: {with_photos} ({with_photos*100/len(institutions):.1f}%)")
    print(f"   With reviews: {with_reviews} ({with_reviews*100/len(institutions):.1f}%)")
    print(f"   With opening hours: {with_hours} ({with_hours*100/len(institutions):.1f}%)")
    print(f"   With Street View: {with_street_view} ({with_street_view*100/len(institutions):.1f}%)")

    # Print new enrichment sources stats
    with_isil = sum(1 for i in institutions if i.get('isil'))
    with_isil_na = sum(1 for i in institutions if i.get('isil', {}).get('source') == 'Nationaal Archief ISIL Registry')
    with_isil_kb = sum(1 for i in institutions if i.get('isil', {}).get('source') == 'KB Netherlands Library Network')
    with_museum_reg = sum(1 for i in institutions if i.get('museum_register'))
    with_ghcid = sum(1 for i in institutions if i.get('ghcid'))
    with_social = sum(1 for i in institutions if i.get('social_media'))
    with_verified_name = sum(1 for i in institutions if i.get('verified_name'))

    print(f"\n📋 New enrichment coverage:")
    print(f"   With ISIL code (total): {with_isil} ({with_isil*100/len(institutions):.1f}%)")
    print(f"     - Nationaal Archief: {with_isil_na}")
    print(f"     - KB Netherlands: {with_isil_kb}")
    print(f"   With Museum Register: {with_museum_reg} ({with_museum_reg*100/len(institutions):.1f}%)")
    print(f"   With GHCID: {with_ghcid} ({with_ghcid*100/len(institutions):.1f}%)")
    print(f"   With social media: {with_social} ({with_social*100/len(institutions):.1f}%)")
    print(f"   With verified name: {with_verified_name} ({with_verified_name*100/len(institutions):.1f}%)")

    # Print YouTube enrichment stats
    with_youtube = sum(1 for i in institutions if i.get('youtube'))
    with_youtube_videos = sum(1 for i in institutions if i.get('youtube', {}).get('videos'))
    total_videos = sum(len(i.get('youtube', {}).get('videos', [])) for i in institutions)

    print(f"\n🎬 YouTube enrichment coverage:")
    print(f"   With YouTube channel: {with_youtube} ({with_youtube*100/len(institutions):.1f}%)")
    print(f"   With videos: {with_youtube_videos} ({with_youtube_videos*100/len(institutions):.1f}%)")
    print(f"   Total videos indexed: {total_videos}")

    # Print founding date stats
    with_founding = sum(1 for i in institutions if i.get('founding_year'))
    founding_decades = {}
    for i in institutions:
        decade = i.get('founding_decade')
        if decade:
            founding_decades[decade] = founding_decades.get(decade, 0) + 1

    print(f"\n📅 Founding date coverage:")
    print(f"   With founding year: {with_founding} ({with_founding*100/len(institutions):.1f}%)")
    if founding_decades:
        sorted_decades = sorted(founding_decades.items())
        earliest = sorted_decades[0]
        latest = sorted_decades[-1]
        print(f"   Earliest decade: {earliest[0]}s ({earliest[1]} institutions)")
        print(f"   Latest decade: {latest[0]}s ({latest[1]} institutions)")

    # Print temporal extent stats (dissolution, defunct status)
    with_temporal = sum(1 for i in institutions if i.get('temporal_extent'))
    with_dissolution = sum(1 for i in institutions if i.get('temporal_extent', {}).get('dissolution_date'))
    with_successor = sum(1 for i in institutions if i.get('successor_organization'))
    defunct_count = sum(1 for i in institutions if i.get('temporal_extent', {}).get('is_defunct'))

    if with_temporal > 0:
        print(f"\n⏳ Temporal extent coverage:")
        print(f"   With temporal data: {with_temporal} ({with_temporal*100/len(institutions):.1f}%)")
        print(f"   With dissolution date: {with_dissolution}")
        print(f"   With successor org: {with_successor}")
        print(f"   Marked defunct: {defunct_count}")

    # Print Genealogiewerkbalk enrichment stats
    with_genealogiewerkbalk = sum(1 for i in institutions if i.get('genealogiewerkbalk'))
    with_municipal_archive = sum(1 for i in institutions if i.get('genealogiewerkbalk', {}).get('municipal_archive'))
    with_provincial_archive = sum(1 for i in institutions if i.get('genealogiewerkbalk', {}).get('provincial_archive'))

    print(f"\n📚 Genealogiewerkbalk enrichment coverage:")
    print(f"   With genealogiewerkbalk data: {with_genealogiewerkbalk} ({with_genealogiewerkbalk*100/len(institutions):.1f}%)")
    print(f"   With municipal archive: {with_municipal_archive}")
    print(f"   With provincial archive: {with_provincial_archive}")

    # Print KIEN enrichment stats
    print(f"\n🎭 KIEN intangible heritage coverage:")
    print(f"   KIEN entries: {enrichment_counts['kien']} ({enrichment_counts['kien']*100/files_processed:.1f}%)")
    print(f"   Source: https://www.immaterieelerfgoed.nl/")


if __name__ == '__main__':
    main()