#!/usr/bin/env python3
"""
Export NDE enriched entries to formats suitable for DuckDB browser loading.

Generates:
1. JSON file for direct browser upload
2. Parquet file for efficient loading
3. Summary statistics

Usage: python scripts/export_nde_for_duckdb.py
"""

import json
import sys
from datetime import datetime
from pathlib import Path

import yaml

# Add project root to path so project-local imports resolve when run as a script.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

NDE_ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
OUTPUT_DIR = PROJECT_ROOT / "frontend" / "public" / "data"


def safe_get(d: dict, *keys, default=None):
    """Safely get a nested dictionary value.

    Walks `keys` one level at a time; returns `default` as soon as the
    current value is not a dict, otherwise the final looked-up value.
    """
    for key in keys:
        if isinstance(d, dict):
            d = d.get(key, default)
        else:
            return default
    return d


def _coalesce(*values):
    """Return the first value that is not None.

    Unlike chained `or`, this keeps legitimate falsy values such as a
    0.0 coordinate on the equator or prime meridian.
    """
    for value in values:
        if value is not None:
            return value
    return None


def _extract_wikidata_id(filename: str, wikidata: dict):
    """Derive the Wikidata QID from a `..._Q123.yaml` filename, else from the enrichment data."""
    if "_Q" in filename:
        return "Q" + filename.split("_Q")[1].replace(".yaml", "")
    # Falsy values (missing key, empty string) normalize to None.
    return wikidata.get("wikidata_entity_id") or None


def _extract_type_code(raw_type):
    """Normalize the NDE `type` field: lists yield their first element (None when empty)."""
    if isinstance(raw_type, list):
        # Guard the empty-list case, which previously raised IndexError.
        return raw_type[0] if raw_type else None
    return raw_type


def extract_institution_record(yaml_data: dict, filename: str) -> dict:
    """Extract a flat record from a YAML entry for DuckDB.

    Args:
        yaml_data: Parsed YAML mapping for one enriched NDE entry.
        filename: Source YAML filename (used to recover the Wikidata QID).

    Returns:
        A flat dict with one key per DuckDB column; missing data maps to None.
    """
    original = yaml_data.get("original_entry", {})
    wikidata = yaml_data.get("wikidata_enrichment", {})
    google = yaml_data.get("google_maps_enrichment", {})
    osm = yaml_data.get("osm_enrichment", {})
    ghcid_data = yaml_data.get("ghcid", {})
    location = yaml_data.get("location", {})
    location_res = yaml_data.get("location_resolution", {})

    wikidata_id = _extract_wikidata_id(filename, wikidata)

    # Prefer Wikidata coordinates, fall back to the resolved location.
    # None-checks (not `or`) so a valid 0.0 coordinate is not discarded.
    coords = wikidata.get("wikidata_coordinates", {})
    lat = _coalesce(coords.get("latitude"), location.get("latitude"))
    lon = _coalesce(coords.get("longitude"), location.get("longitude"))

    google_rating = safe_get(google, "rating")
    google_reviews = safe_get(google, "user_ratings_total") or safe_get(google, "reviews_count")

    # `ghcid` may be absent or a non-dict placeholder in some entries.
    ghcid_is_dict = isinstance(ghcid_data, dict)

    return {
        # Identifiers
        "entry_index": yaml_data.get("entry_index"),
        "filename": filename,
        "wikidata_id": wikidata_id,
        "ghcid": ghcid_data.get("ghcid_string") if ghcid_is_dict else None,
        "ghcid_uuid": ghcid_data.get("ghcid_uuid") if ghcid_is_dict else None,
        # Basic info from original entry
        "name": original.get("organisatie") or wikidata.get("wikidata_label_nl"),
        "name_en": wikidata.get("wikidata_label_en"),
        "type_code": _extract_type_code(original.get("type")),
        "type_description": original.get("type_organisatie"),
        # Location
        "city": original.get("plaatsnaam_bezoekadres") or location.get("city"),
        "address": original.get("straat_en_huisnummer_bezoekadres"),
        "country": location.get("country") or "NL",
        "region": location_res.get("region_code") or location.get("region"),
        "latitude": lat,
        "longitude": lon,
        "geonames_id": location_res.get("geonames_id"),
        # Wikidata enrichment
        "description_nl": wikidata.get("wikidata_description_nl"),
        "description_en": wikidata.get("wikidata_description_en"),
        "website": original.get("webadres_organisatie") or wikidata.get("wikidata_official_website"),
        "image": wikidata.get("wikidata_image"),
        "inception_year": extract_year(wikidata.get("wikidata_inception")),
        # Instance of (type from Wikidata)
        "instance_of": extract_instance_of(wikidata.get("wikidata_instance_of")),
        # Google Maps enrichment
        "google_rating": google_rating,
        "google_reviews_count": google_reviews,
        "google_place_id": safe_get(google, "place_id"),
        # OSM enrichment
        "osm_id": safe_get(osm, "osm_id"),
        "osm_type": safe_get(osm, "osm_type"),
        # Systems and standards
        "collection_system": original.get("systeem"),
        "in_museum_register": original.get("museum_register") == "ja",
        "has_linked_data": original.get("linked_data") == "ja",
        # Processing metadata
        "processing_timestamp": yaml_data.get("processing_timestamp"),
        "enrichment_status": yaml_data.get("enrichment_status", "unknown"),
    }


def extract_year(inception_data):
    """Extract the year from Wikidata inception data.

    Expects a dict like ``{"time": "+1959-00-00T00:00:00Z"}``.
    Returns the year as an int, or None when absent or malformed
    (including non-dict inputs, which previously raised AttributeError).
    """
    if not isinstance(inception_data, dict):
        return None
    time_str = inception_data.get("time", "")
    if not isinstance(time_str, str) or not time_str:
        return None
    # Wikidata time format: +1959-00-00T00:00:00Z (signed, zero-padded).
    year_part = time_str.split("-")[0].replace("+", "")
    return int(year_part) if year_part.isdigit() else None


def extract_instance_of(instance_of_list):
    """Extract the primary instance_of label (English preferred, Dutch fallback)."""
    if not isinstance(instance_of_list, list) or not instance_of_list:
        return None
    first = instance_of_list[0]
    if isinstance(first, dict):
        return first.get("label_en") or first.get("label_nl")
    return None


def _load_records():
    """Parse every YAML entry file into a flat record.

    Returns:
        (records, errors): list of record dicts and list of
        (filename, error message) tuples for files that failed.
    """
    yaml_files = sorted(NDE_ENTRIES_DIR.glob("*.yaml"))
    print(f"\nFound {len(yaml_files)} YAML files in {NDE_ENTRIES_DIR}")

    records = []
    errors = []
    for yaml_file in yaml_files:
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
            if data:
                records.append(extract_institution_record(data, yaml_file.name))
        except Exception as e:  # one bad file must not abort the batch
            errors.append((yaml_file.name, str(e)))
            print(f" Error processing {yaml_file.name}: {e}")
    return records, errors


def _export_json(records):
    """Write records as pretty-printed JSON for direct browser upload."""
    json_output = OUTPUT_DIR / "nde_institutions.json"
    with open(json_output, "w", encoding="utf-8") as f:
        # default=str stringifies datetimes and other non-JSON-native values.
        json.dump(records, f, indent=2, ensure_ascii=False, default=str)
    print(f"\nExported JSON: {json_output}")
    print(f" Size: {json_output.stat().st_size / 1024:.1f} KB")


def _export_tabular(records):
    """Export Parquet and CSV plus summary stats; best-effort.

    Silently skipped (with a note) when pandas or pyarrow is not installed —
    the broad try covers `to_parquet`, which raises ImportError without pyarrow.
    """
    try:
        import pandas as pd

        df = pd.DataFrame(records)

        parquet_output = OUTPUT_DIR / "nde_institutions.parquet"
        df.to_parquet(parquet_output, index=False)
        print(f"\nExported Parquet: {parquet_output}")
        print(f" Size: {parquet_output.stat().st_size / 1024:.1f} KB")

        # CSV export for easy inspection
        csv_output = OUTPUT_DIR / "nde_institutions.csv"
        df.to_csv(csv_output, index=False)
        print(f"\nExported CSV: {csv_output}")
        print(f" Size: {csv_output.stat().st_size / 1024:.1f} KB")

        print("\n" + "=" * 60)
        print("Summary Statistics")
        print("=" * 60)
        print(f"Total institutions: {len(df)}")
        print(f"With Wikidata ID: {df['wikidata_id'].notna().sum()}")
        print(f"With coordinates: {df['latitude'].notna().sum()}")
        print(f"With Google rating: {df['google_rating'].notna().sum()}")
        print(f"With GHCID: {df['ghcid'].notna().sum()}")
        print("\nBy country:")
        print(df['country'].value_counts().head(10).to_string())
        print("\nBy type code:")
        print(df['type_code'].value_counts().head(10).to_string())
        print("\nBy city (top 20):")
        print(df['city'].value_counts().head(20).to_string())
    except ImportError:
        print("\nNote: pandas/pyarrow not available, skipping Parquet/CSV export")


def _write_metadata(record_count: int):
    """Write the metadata/schema descriptor consumed by the frontend."""
    metadata = {
        "generated_at": datetime.now().isoformat(),
        "record_count": record_count,
        "source_directory": str(NDE_ENTRIES_DIR),
        "files": {
            "json": "nde_institutions.json",
            "parquet": "nde_institutions.parquet",
            "csv": "nde_institutions.csv"
        },
        "schema": {
            "entry_index": "integer",
            "filename": "string",
            "wikidata_id": "string",
            "ghcid": "string",
            "ghcid_uuid": "string",
            "name": "string",
            "name_en": "string",
            "type_code": "string",
            "type_description": "string",
            "city": "string",
            "address": "string",
            "country": "string",
            "region": "string",
            "latitude": "float",
            "longitude": "float",
            "geonames_id": "integer",
            "description_nl": "string",
            "description_en": "string",
            "website": "string",
            "image": "string",
            "inception_year": "integer",
            "instance_of": "string",
            "google_rating": "float",
            "google_reviews_count": "integer",
            "google_place_id": "string",
            "osm_id": "string",
            "osm_type": "string",
            "collection_system": "string",
            "in_museum_register": "boolean",
            "has_linked_data": "boolean",
            "processing_timestamp": "datetime",
            "enrichment_status": "string"
        }
    }
    metadata_output = OUTPUT_DIR / "nde_metadata.json"
    with open(metadata_output, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print(f"\nExported metadata: {metadata_output}")


def main():
    """Run the full export pipeline: load YAML, write JSON/Parquet/CSV and metadata."""
    print("=" * 60)
    print("NDE to DuckDB Export")
    print("=" * 60)

    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    records, errors = _load_records()
    print(f"\nProcessed {len(records)} records successfully")
    if errors:
        print(f" {len(errors)} errors encountered")

    _export_json(records)
    _export_tabular(records)
    _write_metadata(len(records))

    print("\n" + "=" * 60)
    print("Export complete!")
    print("=" * 60)
    print(f"\nFiles ready for DuckDB in: {OUTPUT_DIR}")
    print("\nTo load in DuckDB browser:")
    print(" 1. Upload nde_institutions.json via the DuckDB panel")
    print(" 2. Or fetch directly: SELECT * FROM 'data/nde_institutions.parquet'")


if __name__ == "__main__":
    main()