glam/scripts/export_nde_for_duckdb.py
2025-12-06 19:50:04 +01:00

283 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Export NDE enriched entries to formats suitable for DuckDB browser loading.
Generates:
1. JSON file for direct browser upload
2. Parquet file for efficient loading
3. Summary statistics
Usage:
python scripts/export_nde_for_duckdb.py
"""
import json
import sys
from pathlib import Path
from datetime import datetime

import yaml

# Add project root to path so sibling project packages resolve when this file
# is executed directly as a script (python scripts/export_nde_for_duckdb.py).
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Input: enriched per-institution YAML entries.
NDE_ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
# Output: static data assets served by the frontend (DuckDB browser loads these).
OUTPUT_DIR = PROJECT_ROOT / "frontend" / "public" / "data"
def safe_get(d: dict, *keys, default=None):
    """Walk nested dicts along *keys*; return *default* on the first miss.

    Any intermediate value that is not a dict short-circuits to *default*.
    With no keys, returns *d* unchanged.
    """
    current = d
    for key in keys:
        if not isinstance(current, dict):
            return default
        current = current.get(key, default)
    return current
def extract_institution_record(yaml_data: dict, filename: str) -> dict:
    """Extract a flat record from a YAML entry for DuckDB.

    Args:
        yaml_data: Parsed YAML entry for one enriched NDE institution.
        filename: Source file name; used to recover the Wikidata QID when the
            name follows the ``..._Q<id>.yaml`` convention.

    Returns:
        A flat dict of scalar columns suitable for a DataFrame / DuckDB table.
    """
    # Sections that are explicit YAML nulls deserialize to None, which would
    # crash the .get() calls below — coalesce every section to {} first.
    original = yaml_data.get("original_entry") or {}
    wikidata = yaml_data.get("wikidata_enrichment") or {}
    google = yaml_data.get("google_maps_enrichment") or {}
    osm = yaml_data.get("osm_enrichment") or {}
    ghcid_data = yaml_data.get("ghcid") or {}
    location = yaml_data.get("location") or {}
    location_res = yaml_data.get("location_resolution") or {}

    # Extract Wikidata ID from filename (preferred) or from the enrichment data
    wikidata_id = None
    if "_Q" in filename:
        wikidata_id = filename.split("_Q")[1].replace(".yaml", "")
        wikidata_id = f"Q{wikidata_id}"
    elif wikidata.get("wikidata_entity_id"):
        wikidata_id = wikidata["wikidata_entity_id"]

    # Get coordinates (Wikidata takes precedence over the location section)
    coords = wikidata.get("wikidata_coordinates") or {}
    lat = coords.get("latitude") or location.get("latitude")
    lon = coords.get("longitude") or location.get("longitude")

    # Get Google Maps data
    google_rating = safe_get(google, "rating")
    google_reviews = safe_get(google, "user_ratings_total") or safe_get(google, "reviews_count")

    # "type" may be a list of codes, a bare scalar, or an empty list.
    # The previous original.get("type", [None])[0] raised IndexError on [].
    type_value = original.get("type")
    if isinstance(type_value, list):
        type_code = type_value[0] if type_value else None
    else:
        type_code = type_value

    # Build record
    record = {
        # Identifiers
        "entry_index": yaml_data.get("entry_index"),
        "filename": filename,
        "wikidata_id": wikidata_id,
        "ghcid": ghcid_data.get("ghcid_string") if isinstance(ghcid_data, dict) else None,
        "ghcid_uuid": ghcid_data.get("ghcid_uuid") if isinstance(ghcid_data, dict) else None,
        # Basic info from original entry
        "name": original.get("organisatie") or wikidata.get("wikidata_label_nl"),
        "name_en": wikidata.get("wikidata_label_en"),
        "type_code": type_code,
        "type_description": original.get("type_organisatie"),
        # Location
        "city": original.get("plaatsnaam_bezoekadres") or location.get("city"),
        "address": original.get("straat_en_huisnummer_bezoekadres"),
        "country": location.get("country") or "NL",
        "region": location_res.get("region_code") or location.get("region"),
        "latitude": lat,
        "longitude": lon,
        "geonames_id": location_res.get("geonames_id"),
        # Wikidata enrichment
        "description_nl": wikidata.get("wikidata_description_nl"),
        "description_en": wikidata.get("wikidata_description_en"),
        "website": original.get("webadres_organisatie") or wikidata.get("wikidata_official_website"),
        "image": wikidata.get("wikidata_image"),
        "inception_year": extract_year(wikidata.get("wikidata_inception")),
        # Instance of (type from Wikidata)
        "instance_of": extract_instance_of(wikidata.get("wikidata_instance_of")),
        # Google Maps enrichment
        "google_rating": google_rating,
        "google_reviews_count": google_reviews,
        "google_place_id": safe_get(google, "place_id"),
        # OSM enrichment
        "osm_id": safe_get(osm, "osm_id"),
        "osm_type": safe_get(osm, "osm_type"),
        # Systems and standards
        "collection_system": original.get("systeem"),
        "in_museum_register": original.get("museum_register") == "ja",
        "has_linked_data": original.get("linked_data") == "ja",
        # Processing metadata
        "processing_timestamp": yaml_data.get("processing_timestamp"),
        "enrichment_status": yaml_data.get("enrichment_status", "unknown"),
    }
    return record
def extract_year(inception_data):
    """Extract the year from Wikidata inception data.

    Args:
        inception_data: The Wikidata time value, normally a dict like
            ``{"time": "+1959-00-00T00:00:00Z", ...}``; a raw time string
            is accepted as well. Falsy input yields None.

    Returns:
        The year as an int, or None when it cannot be parsed.
    """
    if not inception_data:
        return None
    if isinstance(inception_data, dict):
        # "time" may itself be null in the YAML — coalesce to "".
        time_str = inception_data.get("time") or ""
    elif isinstance(inception_data, str):
        time_str = inception_data
    else:
        return None
    if not time_str:
        return None
    # Wikidata time format: +1959-00-00T00:00:00Z (signed, zero-padded).
    # isdigit() guards the int() conversion, so no broad try/except is needed
    # (the previous bare `except:` also swallowed unrelated errors).
    year_part = time_str.split("-")[0].replace("+", "").strip()
    return int(year_part) if year_part.isdigit() else None
def extract_instance_of(instance_of_list):
    """Return the primary instance_of label (English preferred, else Dutch).

    Expects a list of Wikidata claim dicts; anything else yields None.
    """
    if not isinstance(instance_of_list, list) or not instance_of_list:
        return None
    primary = instance_of_list[0]
    if not isinstance(primary, dict):
        return None
    return primary.get("label_en") or primary.get("label_nl")
def main():
    """Export all enriched NDE YAML entries to JSON (plus Parquet/CSV when
    pandas is available) and write a metadata descriptor for the frontend.

    Side effects: creates OUTPUT_DIR and writes nde_institutions.json,
    optionally nde_institutions.parquet / .csv, and nde_metadata.json;
    progress and summary statistics are printed to stdout.
    """
    print("=" * 60)
    print("NDE to DuckDB Export")
    print("=" * 60)
    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Collect all YAML files (sorted for deterministic record order)
    yaml_files = sorted(NDE_ENTRIES_DIR.glob("*.yaml"))
    print(f"\nFound {len(yaml_files)} YAML files in {NDE_ENTRIES_DIR}")
    records = []
    errors = []
    for yaml_file in yaml_files:
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
            # Empty/whitespace-only YAML parses to None — skip silently.
            if data:
                record = extract_institution_record(data, yaml_file.name)
                records.append(record)
        except Exception as e:
            # Best-effort export: one malformed file must not abort the run;
            # failures are collected and reported below.
            errors.append((yaml_file.name, str(e)))
            print(f" Error processing {yaml_file.name}: {e}")
    print(f"\nProcessed {len(records)} records successfully")
    if errors:
        print(f" {len(errors)} errors encountered")
    # Export to JSON
    json_output = OUTPUT_DIR / "nde_institutions.json"
    with open(json_output, "w", encoding="utf-8") as f:
        # default=str stringifies non-JSON-native values (dates etc.)
        # instead of raising TypeError.
        json.dump(records, f, indent=2, ensure_ascii=False, default=str)
    print(f"\nExported JSON: {json_output}")
    print(f" Size: {json_output.stat().st_size / 1024:.1f} KB")
    # Try to export to Parquet if pandas/pyarrow available
    try:
        # Imported lazily so the JSON export works without pandas installed.
        import pandas as pd
        df = pd.DataFrame(records)
        # Parquet export
        parquet_output = OUTPUT_DIR / "nde_institutions.parquet"
        df.to_parquet(parquet_output, index=False)
        print(f"\nExported Parquet: {parquet_output}")
        print(f" Size: {parquet_output.stat().st_size / 1024:.1f} KB")
        # CSV export for easy inspection
        csv_output = OUTPUT_DIR / "nde_institutions.csv"
        df.to_csv(csv_output, index=False)
        print(f"\nExported CSV: {csv_output}")
        print(f" Size: {csv_output.stat().st_size / 1024:.1f} KB")
        # Print summary statistics
        print("\n" + "=" * 60)
        print("Summary Statistics")
        print("=" * 60)
        print(f"Total institutions: {len(df)}")
        print(f"With Wikidata ID: {df['wikidata_id'].notna().sum()}")
        print(f"With coordinates: {df['latitude'].notna().sum()}")
        print(f"With Google rating: {df['google_rating'].notna().sum()}")
        print(f"With GHCID: {df['ghcid'].notna().sum()}")
        print(f"\nBy country:")
        print(df['country'].value_counts().head(10).to_string())
        print(f"\nBy type code:")
        print(df['type_code'].value_counts().head(10).to_string())
        print(f"\nBy city (top 20):")
        print(df['city'].value_counts().head(20).to_string())
    except ImportError:
        # to_parquet also raises ImportError when pyarrow/fastparquet is absent.
        print("\nNote: pandas/pyarrow not available, skipping Parquet/CSV export")
    # Generate metadata file for frontend
    # NOTE(review): "files" always lists parquet/csv even when the pandas
    # export above was skipped — consumers must tolerate missing files.
    metadata = {
        "generated_at": datetime.now().isoformat(),
        "record_count": len(records),
        "source_directory": str(NDE_ENTRIES_DIR),
        "files": {
            "json": "nde_institutions.json",
            "parquet": "nde_institutions.parquet",
            "csv": "nde_institutions.csv"
        },
        # Logical column types; maintained by hand, keep in sync with
        # extract_institution_record.
        "schema": {
            "entry_index": "integer",
            "filename": "string",
            "wikidata_id": "string",
            "ghcid": "string",
            "ghcid_uuid": "string",
            "name": "string",
            "name_en": "string",
            "type_code": "string",
            "type_description": "string",
            "city": "string",
            "address": "string",
            "country": "string",
            "region": "string",
            "latitude": "float",
            "longitude": "float",
            "geonames_id": "integer",
            "description_nl": "string",
            "description_en": "string",
            "website": "string",
            "image": "string",
            "inception_year": "integer",
            "instance_of": "string",
            "google_rating": "float",
            "google_reviews_count": "integer",
            "google_place_id": "string",
            "osm_id": "string",
            "osm_type": "string",
            "collection_system": "string",
            "in_museum_register": "boolean",
            "has_linked_data": "boolean",
            "processing_timestamp": "datetime",
            "enrichment_status": "string"
        }
    }
    metadata_output = OUTPUT_DIR / "nde_metadata.json"
    with open(metadata_output, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print(f"\nExported metadata: {metadata_output}")
    print("\n" + "=" * 60)
    print("Export complete!")
    print("=" * 60)
    print(f"\nFiles ready for DuckDB in: {OUTPUT_DIR}")
    print("\nTo load in DuckDB browser:")
    print(" 1. Upload nde_institutions.json via the DuckDB panel")
    print(" 2. Or fetch directly: SELECT * FROM 'data/nde_institutions.parquet'")
# Run the export only when executed as a script, not on import.
if __name__ == "__main__":
    main()