Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,5 @@ run.out
clickhouse/etc_sudoers.bak
workdir/
timeout-exit-codes.out
*/target
*.lock
13 changes: 13 additions & 0 deletions datafusion/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# db-benchmark solution crate for Apache Arrow DataFusion.
[package]
name = "db-benchmark"
version = "0.1.0"
edition = "2018"

# src/main.rs guards the snmalloc global allocator with
# #[cfg(feature = "snmalloc")]; declare that feature here (on by default)
# so the snmalloc-rs dependency below is actually used.
[features]
default = ["snmalloc"]
snmalloc = []

[dependencies]
# Track DataFusion/Arrow from git master, with SIMD compute kernels enabled.
datafusion = { git = "https://github.com/apache/arrow.git", features = ["simd"] }
arrow = { git = "https://github.com/apache/arrow.git", features = ["simd"] }
tokio = { version = "0.2", features = ["macros", "rt-core", "rt-threaded"] }
snmalloc-rs = "0.2"

[profile.release]
# Link-time optimization for the benchmark binary.
lto = true
5 changes: 5 additions & 0 deletions datafusion/setup-datafusion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# One-time setup for the datafusion solution: install the Rust toolchain
# via rustup, non-interactively (-y accepts the default installation).
set -e

curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

78 changes: 78 additions & 0 deletions datafusion/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::{CsvFile, MemTable};
use datafusion::error::Result;
use datafusion::prelude::*;
use std::env;
use std::time::Instant;

// Use snmalloc as the global allocator when the "snmalloc" feature is enabled.
// NOTE(review): Cargo.toml declares the snmalloc-rs dependency but no
// "snmalloc" feature, so as written this cfg never fires — confirm a
// [features] entry exists (or is added) for it.
#[cfg(feature = "snmalloc")]
#[global_allocator]
static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;

#[tokio::main]
async fn main() -> Result<()> {
    // db-benchmark groupby entry point: load ../data/<SRC_DATANAME>.csv into an
    // in-memory table `t`, then run the five groupby queries q1-q5, printing
    // the wall-clock milliseconds each one took.
    let mut ctx = ExecutionContext::new();
    let dataname = env::var("SRC_DATANAME").expect("SRC_DATANAME env var must be set");
    let data = format!("../data/{}.csv", dataname);

    // Explicit schema for the benchmark CSV: id1-id3 strings, id4-id6 and
    // v1-v2 32-bit ints, v3 a 64-bit float; none nullable.
    let schema = Schema::new(vec![
        Field::new("id1", DataType::Utf8, false),
        Field::new("id2", DataType::Utf8, false),
        Field::new("id3", DataType::Utf8, false),
        Field::new("id4", DataType::Int32, false),
        Field::new("id5", DataType::Int32, false),
        Field::new("id6", DataType::Int32, false),
        Field::new("v1", DataType::Int32, false),
        Field::new("v2", DataType::Int32, false),
        Field::new("v3", DataType::Float64, false),
    ]);
    let options = CsvReadOptions::new().schema(&schema).has_header(true);

    // Materialize the CSV fully in memory first, so per-query timings measure
    // query execution only and not CSV parsing. Errors propagate via `?`
    // (was `.unwrap()`) so failures surface through main's Result.
    let csv = CsvFile::try_new(&data, options)?;
    let batch_size = 65536;
    let memtable = MemTable::load(&csv, batch_size).await?;
    ctx.register_table("t", Box::new(memtable));

    // The five groupby queries of the benchmark, with their labels.
    let queries = [
        ("q1", "SELECT id1, SUM(v1) AS v1 FROM t GROUP BY id1"),
        ("q2", "SELECT id1, id2, SUM(v1) AS v1 FROM t GROUP BY id1, id2"),
        ("q3", "SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM t GROUP BY id3"),
        ("q4", "SELECT id4, AVG(v1) AS v1, AVG(v2) AS v2, AVG(v3) AS v3 FROM t GROUP BY id4"),
        ("q5", "SELECT id6, SUM(v1) AS v1, SUM(v2) AS v2, SUM(v3) AS v3 FROM t GROUP BY id6"),
    ];
    for &(label, sql) in queries.iter() {
        let start = Instant::now();
        // collect() drives the otherwise-lazy plan to completion; the result
        // batches (`ans`) stay alive until after the timing is taken.
        let ans = ctx.sql(sql)?.collect().await?;
        println!("{} took {} ms", label, start.elapsed().as_millis());
        let _ = ans;
    }

    Ok(())
}
8 changes: 8 additions & 0 deletions datafusion/upg-datafusion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Upgrade the datafusion solution: refresh Cargo.lock so the git-tracked
# arrow/datafusion dependencies move to their latest revisions.
# Invoked from the repo root by run.sh, hence the cd into the crate dir.
set -e

cd datafusion

cargo update

cd ../
1 change: 1 addition & 0 deletions datafusion/ver-datafusion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cargo tree | grep "├── datafusion (.*)"
2 changes: 1 addition & 1 deletion run.conf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# task, used in init-setup-iteration.R
export RUN_TASKS="groupby join"
# solution, used in init-setup-iteration.R
export RUN_SOLUTIONS="data.table pydatatable dplyr pandas spark dask juliadf cudf clickhouse polars"
export RUN_SOLUTIONS="data.table pydatatable dplyr pandas spark dask juliadf cudf clickhouse polars datafusion"

# flag to upgrade tools, used in run.sh on init
export DO_UPGRADE=true
Expand Down
2 changes: 2 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/upg-h2o.
if [[ "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/ver-h2o.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/upg-polars.sh; fi;
if [[ "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/ver-polars.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "datafusion" ]]; then ./datafusion/upg-datafusion.sh; fi;
# was: =~ "polars" — copy-paste bug; the datafusion version check must be gated on "datafusion"
if [[ "$RUN_SOLUTIONS" =~ "datafusion" ]]; then ./datafusion/ver-datafusion.sh; fi;

# run
if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH has been interrupted after $(($(date +%s)-$BATCH))s due to 'stop' file" && rm -f ./stop && rm -f ./run.lock && exit; fi;
Expand Down