glam/scripts/export_nde_for_duckdb.py
2025-12-06 19:50:04 +01:00

283 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Export NDE enriched entries to formats suitable for DuckDB browser loading.
Generates:
1. JSON file for direct browser upload
2. Parquet file for efficient loading
3. Summary statistics
Usage:
python scripts/export_nde_for_duckdb.py
"""
import json
import sys
from pathlib import Path
from datetime import datetime

import yaml

# Add project root to path so sibling project packages resolve when this file
# is executed directly as a script (python scripts/export_nde_for_duckdb.py).
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Input: enriched per-institution YAML entries.
NDE_ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
# Output: static data assets served by the frontend (DuckDB browser loads these).
OUTPUT_DIR = PROJECT_ROOT / "frontend" / "public" / "data"
def safe_get(d: dict, *keys, default=None):
    """Walk nested dicts along *keys*; return *default* on the first miss.

    Any intermediate value that is not a dict short-circuits to *default*.
    With no keys, returns *d* unchanged.
    """
    current = d
    for key in keys:
        if not isinstance(current, dict):
            return default
        current = current.get(key, default)
    return current
def extract_institution_record(yaml_data: dict, filename: str) -> dict:
    """Extract a flat record from a YAML entry for DuckDB.

    Args:
        yaml_data: Parsed YAML entry for one enriched NDE institution.
        filename: Source file name; used to recover the Wikidata QID when the
            name follows the ``..._Q<id>.yaml`` convention.

    Returns:
        A flat dict of scalar columns suitable for a DataFrame / DuckDB table.
    """
    # Sections that are explicit YAML nulls deserialize to None, which would
    # crash the .get() calls below — coalesce every section to {} first.
    original = yaml_data.get("original_entry") or {}
    wikidata = yaml_data.get("wikidata_enrichment") or {}
    google = yaml_data.get("google_maps_enrichment") or {}
    osm = yaml_data.get("osm_enrichment") or {}
    ghcid_data = yaml_data.get("ghcid") or {}
    location = yaml_data.get("location") or {}
    location_res = yaml_data.get("location_resolution") or {}

    # Extract Wikidata ID from filename (preferred) or from the enrichment data
    wikidata_id = None
    if "_Q" in filename:
        wikidata_id = filename.split("_Q")[1].replace(".yaml", "")
        wikidata_id = f"Q{wikidata_id}"
    elif wikidata.get("wikidata_entity_id"):
        wikidata_id = wikidata["wikidata_entity_id"]

    # Get coordinates (Wikidata takes precedence over the location section)
    coords = wikidata.get("wikidata_coordinates") or {}
    lat = coords.get("latitude") or location.get("latitude")
    lon = coords.get("longitude") or location.get("longitude")

    # Get Google Maps data
    google_rating = safe_get(google, "rating")
    google_reviews = safe_get(google, "user_ratings_total") or safe_get(google, "reviews_count")

    # "type" may be a list of codes, a bare scalar, or an empty list.
    # The previous original.get("type", [None])[0] raised IndexError on [].
    type_value = original.get("type")
    if isinstance(type_value, list):
        type_code = type_value[0] if type_value else None
    else:
        type_code = type_value

    # Build record
    record = {
        # Identifiers
        "entry_index": yaml_data.get("entry_index"),
        "filename": filename,
        "wikidata_id": wikidata_id,
        "ghcid": ghcid_data.get("ghcid_string") if isinstance(ghcid_data, dict) else None,
        "ghcid_uuid": ghcid_data.get("ghcid_uuid") if isinstance(ghcid_data, dict) else None,
        # Basic info from original entry
        "name": original.get("organisatie") or wikidata.get("wikidata_label_nl"),
        "name_en": wikidata.get("wikidata_label_en"),
        "type_code": type_code,
        "type_description": original.get("type_organisatie"),
        # Location
        "city": original.get("plaatsnaam_bezoekadres") or location.get("city"),
        "address": original.get("straat_en_huisnummer_bezoekadres"),
        "country": location.get("country") or "NL",
        "region": location_res.get("region_code") or location.get("region"),
        "latitude": lat,
        "longitude": lon,
        "geonames_id": location_res.get("geonames_id"),
        # Wikidata enrichment
        "description_nl": wikidata.get("wikidata_description_nl"),
        "description_en": wikidata.get("wikidata_description_en"),
        "website": original.get("webadres_organisatie") or wikidata.get("wikidata_official_website"),
        "image": wikidata.get("wikidata_image"),
        "inception_year": extract_year(wikidata.get("wikidata_inception")),
        # Instance of (type from Wikidata)
        "instance_of": extract_instance_of(wikidata.get("wikidata_instance_of")),
        # Google Maps enrichment
        "google_rating": google_rating,
        "google_reviews_count": google_reviews,
        "google_place_id": safe_get(google, "place_id"),
        # OSM enrichment
        "osm_id": safe_get(osm, "osm_id"),
        "osm_type": safe_get(osm, "osm_type"),
        # Systems and standards
        "collection_system": original.get("systeem"),
        "in_museum_register": original.get("museum_register") == "ja",
        "has_linked_data": original.get("linked_data") == "ja",
        # Processing metadata
        "processing_timestamp": yaml_data.get("processing_timestamp"),
        "enrichment_status": yaml_data.get("enrichment_status", "unknown"),
    }
    return record
def extract_year(inception_data):
    """Extract the year from Wikidata inception data.

    Args:
        inception_data: The Wikidata time value, normally a dict like
            ``{"time": "+1959-00-00T00:00:00Z", ...}``; a raw time string
            is accepted as well. Falsy input yields None.

    Returns:
        The year as an int, or None when it cannot be parsed.
    """
    if not inception_data:
        return None
    if isinstance(inception_data, dict):
        # "time" may itself be null in the YAML — coalesce to "".
        time_str = inception_data.get("time") or ""
    elif isinstance(inception_data, str):
        time_str = inception_data
    else:
        return None
    if not time_str:
        return None
    # Wikidata time format: +1959-00-00T00:00:00Z (signed, zero-padded).
    # isdigit() guards the int() conversion, so no broad try/except is needed
    # (the previous bare `except:` also swallowed unrelated errors).
    year_part = time_str.split("-")[0].replace("+", "").strip()
    return int(year_part) if year_part.isdigit() else None
def extract_instance_of(instance_of_list):
    """Return the primary instance_of label (English preferred, else Dutch).

    Expects a list of Wikidata claim dicts; anything else yields None.
    """
    if not isinstance(instance_of_list, list) or not instance_of_list:
        return None
    primary = instance_of_list[0]
    if not isinstance(primary, dict):
        return None
    return primary.get("label_en") or primary.get("label_nl")
def main():
    """Export all enriched NDE YAML entries to JSON (plus Parquet/CSV when
    pandas is available) and write a metadata descriptor for the frontend.

    Side effects: creates OUTPUT_DIR and writes nde_institutions.json,
    optionally nde_institutions.parquet / .csv, and nde_metadata.json;
    progress and summary statistics are printed to stdout.
    """
    print("=" * 60)
    print("NDE to DuckDB Export")
    print("=" * 60)
    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Collect all YAML files (sorted for deterministic record order)
    yaml_files = sorted(NDE_ENTRIES_DIR.glob("*.yaml"))
    print(f"\nFound {len(yaml_files)} YAML files in {NDE_ENTRIES_DIR}")
    records = []
    errors = []
    for yaml_file in yaml_files:
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
            # Empty/whitespace-only YAML parses to None — skip silently.
            if data:
                record = extract_institution_record(data, yaml_file.name)
                records.append(record)
        except Exception as e:
            # Best-effort export: one malformed file must not abort the run;
            # failures are collected and reported below.
            errors.append((yaml_file.name, str(e)))
            print(f" Error processing {yaml_file.name}: {e}")
    print(f"\nProcessed {len(records)} records successfully")
    if errors:
        print(f" {len(errors)} errors encountered")
    # Export to JSON
    json_output = OUTPUT_DIR / "nde_institutions.json"
    with open(json_output, "w", encoding="utf-8") as f:
        # default=str stringifies non-JSON-native values (dates etc.)
        # instead of raising TypeError.
        json.dump(records, f, indent=2, ensure_ascii=False, default=str)
    print(f"\nExported JSON: {json_output}")
    print(f" Size: {json_output.stat().st_size / 1024:.1f} KB")
    # Try to export to Parquet if pandas/pyarrow available
    try:
        # Imported lazily so the JSON export works without pandas installed.
        import pandas as pd
        df = pd.DataFrame(records)
        # Parquet export
        parquet_output = OUTPUT_DIR / "nde_institutions.parquet"
        df.to_parquet(parquet_output, index=False)
        print(f"\nExported Parquet: {parquet_output}")
        print(f" Size: {parquet_output.stat().st_size / 1024:.1f} KB")
        # CSV export for easy inspection
        csv_output = OUTPUT_DIR / "nde_institutions.csv"
        df.to_csv(csv_output, index=False)
        print(f"\nExported CSV: {csv_output}")
        print(f" Size: {csv_output.stat().st_size / 1024:.1f} KB")
        # Print summary statistics
        print("\n" + "=" * 60)
        print("Summary Statistics")
        print("=" * 60)
        print(f"Total institutions: {len(df)}")
        print(f"With Wikidata ID: {df['wikidata_id'].notna().sum()}")
        print(f"With coordinates: {df['latitude'].notna().sum()}")
        print(f"With Google rating: {df['google_rating'].notna().sum()}")
        print(f"With GHCID: {df['ghcid'].notna().sum()}")
        print(f"\nBy country:")
        print(df['country'].value_counts().head(10).to_string())
        print(f"\nBy type code:")
        print(df['type_code'].value_counts().head(10).to_string())
        print(f"\nBy city (top 20):")
        print(df['city'].value_counts().head(20).to_string())
    except ImportError:
        # to_parquet also raises ImportError when pyarrow/fastparquet is absent.
        print("\nNote: pandas/pyarrow not available, skipping Parquet/CSV export")
    # Generate metadata file for frontend
    # NOTE(review): "files" always lists parquet/csv even when the pandas
    # export above was skipped — consumers must tolerate missing files.
    metadata = {
        "generated_at": datetime.now().isoformat(),
        "record_count": len(records),
        "source_directory": str(NDE_ENTRIES_DIR),
        "files": {
            "json": "nde_institutions.json",
            "parquet": "nde_institutions.parquet",
            "csv": "nde_institutions.csv"
        },
        # Logical column types; maintained by hand, keep in sync with
        # extract_institution_record.
        "schema": {
            "entry_index": "integer",
            "filename": "string",
            "wikidata_id": "string",
            "ghcid": "string",
            "ghcid_uuid": "string",
            "name": "string",
            "name_en": "string",
            "type_code": "string",
            "type_description": "string",
            "city": "string",
            "address": "string",
            "country": "string",
            "region": "string",
            "latitude": "float",
            "longitude": "float",
            "geonames_id": "integer",
            "description_nl": "string",
            "description_en": "string",
            "website": "string",
            "image": "string",
            "inception_year": "integer",
            "instance_of": "string",
            "google_rating": "float",
            "google_reviews_count": "integer",
            "google_place_id": "string",
            "osm_id": "string",
            "osm_type": "string",
            "collection_system": "string",
            "in_museum_register": "boolean",
            "has_linked_data": "boolean",
            "processing_timestamp": "datetime",
            "enrichment_status": "string"
        }
    }
    metadata_output = OUTPUT_DIR / "nde_metadata.json"
    with open(metadata_output, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print(f"\nExported metadata: {metadata_output}")
    print("\n" + "=" * 60)
    print("Export complete!")
    print("=" * 60)
    print(f"\nFiles ready for DuckDB in: {OUTPUT_DIR}")
    print("\nTo load in DuckDB browser:")
    print(" 1. Upload nde_institutions.json via the DuckDB panel")
    print(" 2. Or fetch directly: SELECT * FROM 'data/nde_institutions.parquet'")
# Run the export only when executed as a script, not on import.
if __name__ == "__main__":
    main()