#!/usr/bin/env python3
"""
Export NDE enriched entries to formats suitable for DuckDB browser loading.

Generates:
1. JSON file for direct browser upload
2. Parquet file for efficient loading
3. Summary statistics

Usage:
    python scripts/export_nde_for_duckdb.py
"""

import json
import sys
from pathlib import Path
from datetime import datetime

import yaml

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

NDE_ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
OUTPUT_DIR = PROJECT_ROOT / "frontend" / "public" / "data"


def safe_get(d: dict, *keys, default=None):
    """Safely get a nested dictionary value."""
    for key in keys:
        if isinstance(d, dict):
            d = d.get(key, default)
        else:
            return default
    return d
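
# Illustrative behaviour of safe_get:
#   safe_get({"a": {"b": 1}}, "a", "b")            -> 1
#   safe_get({"a": {"b": 1}}, "a", "x", default=0) -> 0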


def extract_institution_record(yaml_data: dict, filename: str) -> dict:
    """Extract a flat record from a YAML entry for DuckDB."""
    original = yaml_data.get("original_entry", {})
    wikidata = yaml_data.get("wikidata_enrichment", {})
    google = yaml_data.get("google_maps_enrichment", {})
    osm = yaml_data.get("osm_enrichment", {})
    ghcid_data = yaml_data.get("ghcid", {})
    location = yaml_data.get("location", {})
    location_res = yaml_data.get("location_resolution", {})

    # Extract Wikidata ID from filename or data
    wikidata_id = None
    if "_Q" in filename:
        wikidata_id = filename.split("_Q")[1].replace(".yaml", "")
        wikidata_id = f"Q{wikidata_id}"
    elif wikidata.get("wikidata_entity_id"):
        wikidata_id = wikidata["wikidata_entity_id"]
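    # e.g. a hypothetical "some_entry_Q12345.yaml" yields wikidata_id == "Q12345"
    # (assumes the pipeline's "..._Q<id>.yaml" naming convention)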

    # Get coordinates
    coords = wikidata.get("wikidata_coordinates", {})
    lat = coords.get("latitude") or location.get("latitude")
    lon = coords.get("longitude") or location.get("longitude")

    # Get Google Maps data
    google_rating = safe_get(google, "rating")
    google_reviews = safe_get(google, "user_ratings_total") or safe_get(google, "reviews_count")

    # Normalise the type field, which may be a scalar or a (possibly empty) list
    type_value = original.get("type")
    if isinstance(type_value, list):
        type_value = type_value[0] if type_value else None

    # Build record
    record = {
        # Identifiers
        "entry_index": yaml_data.get("entry_index"),
        "filename": filename,
        "wikidata_id": wikidata_id,
        "ghcid": ghcid_data.get("ghcid_string") if isinstance(ghcid_data, dict) else None,
        "ghcid_uuid": ghcid_data.get("ghcid_uuid") if isinstance(ghcid_data, dict) else None,

        # Basic info from original entry
        "name": original.get("organisatie") or wikidata.get("wikidata_label_nl"),
        "name_en": wikidata.get("wikidata_label_en"),
        "type_code": type_value,
        "type_description": original.get("type_organisatie"),

        # Location
        "city": original.get("plaatsnaam_bezoekadres") or location.get("city"),
        "address": original.get("straat_en_huisnummer_bezoekadres"),
        "country": location.get("country") or "NL",
        "region": location_res.get("region_code") or location.get("region"),
        "latitude": lat,
        "longitude": lon,
        "geonames_id": location_res.get("geonames_id"),

        # Wikidata enrichment
        "description_nl": wikidata.get("wikidata_description_nl"),
        "description_en": wikidata.get("wikidata_description_en"),
        "website": original.get("webadres_organisatie") or wikidata.get("wikidata_official_website"),
        "image": wikidata.get("wikidata_image"),
        "inception_year": extract_year(wikidata.get("wikidata_inception")),

        # Instance of (type from Wikidata)
        "instance_of": extract_instance_of(wikidata.get("wikidata_instance_of")),

        # Google Maps enrichment
        "google_rating": google_rating,
        "google_reviews_count": google_reviews,
        "google_place_id": safe_get(google, "place_id"),

        # OSM enrichment
        "osm_id": safe_get(osm, "osm_id"),
        "osm_type": safe_get(osm, "osm_type"),

        # Systems and standards
        "collection_system": original.get("systeem"),
        "in_museum_register": original.get("museum_register") == "ja",
        "has_linked_data": original.get("linked_data") == "ja",

        # Processing metadata
        "processing_timestamp": yaml_data.get("processing_timestamp"),
        "enrichment_status": yaml_data.get("enrichment_status", "unknown"),
    }

    return record
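
# A produced record is flat, e.g. (illustrative values only):
#   {"entry_index": 1, "name": "Some Museum", "wikidata_id": "Q12345",
#    "city": "Amsterdam", "latitude": 52.37, "longitude": 4.9, ...}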


def extract_year(inception_data):
    """Extract a year from Wikidata inception data."""
    if not isinstance(inception_data, dict):
        return None
    time_str = inception_data.get("time", "")
    if time_str:
        # Wikidata time format: +1959-00-00T00:00:00Z
        try:
            year_part = time_str.split("-")[0].replace("+", "")
            return int(year_part) if year_part.isdigit() else None
        except (AttributeError, ValueError):
            return None
    return None
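
# Example: extract_year({"time": "+1959-00-00T00:00:00Z"}) -> 1959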


def extract_instance_of(instance_of_list):
    """Extract primary instance_of label."""
    if not instance_of_list or not isinstance(instance_of_list, list):
        return None
    first = instance_of_list[0]
    if isinstance(first, dict):
        return first.get("label_en") or first.get("label_nl")
    return None
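
# Example: extract_instance_of([{"label_en": "museum", "label_nl": "museum"}]) -> "museum"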


def main():
    print("=" * 60)
    print("NDE to DuckDB Export")
    print("=" * 60)

    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Collect all YAML files
    yaml_files = sorted(NDE_ENTRIES_DIR.glob("*.yaml"))
    print(f"\nFound {len(yaml_files)} YAML files in {NDE_ENTRIES_DIR}")

    records = []
    errors = []

    for yaml_file in yaml_files:
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)

            if data:
                record = extract_institution_record(data, yaml_file.name)
                records.append(record)
        except Exception as e:
            errors.append((yaml_file.name, str(e)))
            print(f"  Error processing {yaml_file.name}: {e}")

    print(f"\nProcessed {len(records)} records successfully")
    if errors:
        print(f"  {len(errors)} errors encountered")

    # Export to JSON
    json_output = OUTPUT_DIR / "nde_institutions.json"
    with open(json_output, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False, default=str)
    print(f"\nExported JSON: {json_output}")
    print(f"  Size: {json_output.stat().st_size / 1024:.1f} KB")

    # Try to export to Parquet/CSV if pandas (and a Parquet engine) is available
    try:
        import pandas as pd

        df = pd.DataFrame(records)

        # Parquet export
        parquet_output = OUTPUT_DIR / "nde_institutions.parquet"
        df.to_parquet(parquet_output, index=False)
        print(f"\nExported Parquet: {parquet_output}")
        print(f"  Size: {parquet_output.stat().st_size / 1024:.1f} KB")

        # CSV export for easy inspection
        csv_output = OUTPUT_DIR / "nde_institutions.csv"
        df.to_csv(csv_output, index=False)
        print(f"\nExported CSV: {csv_output}")
        print(f"  Size: {csv_output.stat().st_size / 1024:.1f} KB")

        # Print summary statistics
        print("\n" + "=" * 60)
        print("Summary Statistics")
        print("=" * 60)
        print(f"Total institutions: {len(df)}")
        print(f"With Wikidata ID: {df['wikidata_id'].notna().sum()}")
        print(f"With coordinates: {df['latitude'].notna().sum()}")
        print(f"With Google rating: {df['google_rating'].notna().sum()}")
        print(f"With GHCID: {df['ghcid'].notna().sum()}")
        print("\nBy country:")
        print(df['country'].value_counts().head(10).to_string())
        print("\nBy type code:")
        print(df['type_code'].value_counts().head(10).to_string())
        print("\nBy city (top 20):")
        print(df['city'].value_counts().head(20).to_string())

    except ImportError:
        print("\nNote: pandas/pyarrow not available, skipping Parquet/CSV export")

    # Generate metadata file for frontend
    metadata = {
        "generated_at": datetime.now().isoformat(),
        "record_count": len(records),
        "source_directory": str(NDE_ENTRIES_DIR),
        "files": {
            "json": "nde_institutions.json",
            "parquet": "nde_institutions.parquet",
            "csv": "nde_institutions.csv"
        },
        "schema": {
            "entry_index": "integer",
            "filename": "string",
            "wikidata_id": "string",
            "ghcid": "string",
            "ghcid_uuid": "string",
            "name": "string",
            "name_en": "string",
            "type_code": "string",
            "type_description": "string",
            "city": "string",
            "address": "string",
            "country": "string",
            "region": "string",
            "latitude": "float",
            "longitude": "float",
            "geonames_id": "integer",
            "description_nl": "string",
            "description_en": "string",
            "website": "string",
            "image": "string",
            "inception_year": "integer",
            "instance_of": "string",
            "google_rating": "float",
            "google_reviews_count": "integer",
            "google_place_id": "string",
            "osm_id": "string",
            "osm_type": "string",
            "collection_system": "string",
            "in_museum_register": "boolean",
            "has_linked_data": "boolean",
            "processing_timestamp": "datetime",
            "enrichment_status": "string"
        }
    }

    metadata_output = OUTPUT_DIR / "nde_metadata.json"
    with open(metadata_output, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print(f"\nExported metadata: {metadata_output}")

    print("\n" + "=" * 60)
    print("Export complete!")
    print("=" * 60)
    print(f"\nFiles ready for DuckDB in: {OUTPUT_DIR}")
    print("\nTo load in DuckDB browser:")
    print("  1. Upload nde_institutions.json via the DuckDB panel")
    print("  2. Or fetch directly: SELECT * FROM 'data/nde_institutions.parquet'")


if __name__ == "__main__":
    main()