diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 386a810b..50309f4f 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -36,7 +36,7 @@ jobs: - name: Install libraries shell: bash - run: ./_utils/setup-small.sh + run: ./_setup_utils/setup-small.sh - name: Generate 500mb datasets shell: bash @@ -48,7 +48,7 @@ jobs: - name: Install all solutions shell: bash - run: source path.env && python3 _utils/install_all_solutions.py ${{ matrix.solution }} + run: source path.env && python3 ./_setup_utils/install_all_solutions.py ${{ matrix.solution }} - name: Turn swap off shell: bash @@ -68,14 +68,14 @@ jobs: - name: Run mini GroupBy benchmark shell: bash run: | - python3 _utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }} + python3 _setup_utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }} source path.env TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh - name: Run mini Join benchmark shell: bash run: | - python3 _utils/prep_solutions.py --task=join --solution=${{ matrix.solution }} + python3 _setup_utils/prep_solutions.py --task=join --solution=${{ matrix.solution }} source path.env TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh @@ -123,7 +123,7 @@ jobs: - name: Install libraries shell: bash - run: ./_utils/setup-small.sh + run: ./_setup_utils/setup-small.sh - name: Generate 500mb datasets shell: bash @@ -135,7 +135,7 @@ jobs: - name: Install all solutions shell: bash - run: source path.env && python3 _utils/install_all_solutions.py all + run: source path.env && python3 _setup_utils/install_all_solutions.py all - name: Turn swap off shell: bash @@ -144,14 +144,14 @@ jobs: - name: Run mini GroupBy benchmark shell: bash run: | - python3 _utils/prep_solutions.py --task=groupby --solution=all + python3 _setup_utils/prep_solutions.py --task=groupby --solution=all source path.env TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh - name: Run mini Join benchmark shell: bash run: | - python3 _utils/prep_solutions.py --task=join --solution=all + python3 _setup_utils/prep_solutions.py --task=join --solution=all source path.env TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh diff --git a/_benchplot/benchplot.R b/_benchplot/benchplot.R index 040f1d6e..7b5396da 100644 --- a/_benchplot/benchplot.R +++ b/_benchplot/benchplot.R @@ -1,8 +1,15 @@ ## Based on Matt Dowle scripts from 2014 ## https://github.com/h2oai/db-benchmark/commit/fce1b8c9177afb49471fcf483a438f619f1a992b ## Original grouping benchmark can be found in: https://github.com/Rdatatable/data.table/wiki/Benchmarks-:-Grouping +suppressPackageStartupMessages(library(bit64)) -format_comma = function(x) format(as.integer(x), big.mark=",") +format_comma = function(x) { + if (x == 1e10) { + "10,000,000,000" + } else { + format(as.integer64(x), big.mark=",") + } +} format_num = function(x, digits=3L) { # at least 3+1 chars on output, there is surely some setting to achieve that better with base R but it is not obvious to find that among all features there cx = sprintf("%0.2f", x) int = sapply(strsplit(cx, ".", fixed=TRUE), `[`, 1L) diff --git a/_control/data.csv b/_control/data.csv index d68b271f..864104d9 100644 --- a/_control/data.csv +++ b/_control/data.csv @@ -14,6 +14,7 @@ groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1 groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1 groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1 groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1 +groupby,G1_1e10_1e4_10_0,1e10,1e4,10,0,1 join,J1_1e7_NA_0_0,1e7,NA,0,0,1 
join,J1_1e7_NA_5_0,1e7,NA,5,0,1 join,J1_1e7_NA_0_1,1e7,NA,0,1,1 diff --git a/_control/data_groupby_xlarge.csv b/_control/data_groupby_xlarge.csv new file mode 100644 index 00000000..fe08705c --- /dev/null +++ b/_control/data_groupby_xlarge.csv @@ -0,0 +1,2 @@ +task,data,nrow,k,na,sort,active +groupby,G1_1e10_1e4_10_0,1e10,1e4,10,0,1 \ No newline at end of file diff --git a/_control/data_join_xlarge.csv b/_control/data_join_xlarge.csv new file mode 100644 index 00000000..86d3fc47 --- /dev/null +++ b/_control/data_join_xlarge.csv @@ -0,0 +1,2 @@ +task,data,nrow,k,na,sort,active +join,J1_1e10_NA_0_0,1e10,NA,0,0,1 \ No newline at end of file diff --git a/_control/data_large.csv b/_control/data_large.csv new file mode 100644 index 00000000..db0ba3a2 --- /dev/null +++ b/_control/data_large.csv @@ -0,0 +1,7 @@ +task,data,nrow,k,na,sort,active +groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1 +groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1 +groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1 +groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1 +groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1 +join,J1_1e9_NA_0_0,1e9,NA,0,0,1 \ No newline at end of file diff --git a/_control/data_small.csv b/_control/data_small.csv new file mode 100644 index 00000000..dd0b5378 --- /dev/null +++ b/_control/data_small.csv @@ -0,0 +1,17 @@ +task,data,nrow,k,na,sort,active +groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1 +groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1 +groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1 +groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1 +groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1 +groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1 +groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1 +groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1 +groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1 +groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1 +join,J1_1e7_NA_0_0,1e7,NA,0,0,1 +join,J1_1e7_NA_5_0,1e7,NA,5,0,1 +join,J1_1e7_NA_0_1,1e7,NA,0,1,1 +join,J1_1e8_NA_0_0,1e8,NA,0,0,1 +join,J1_1e8_NA_5_0,1e8,NA,5,0,1 +join,J1_1e8_NA_0_1,1e8,NA,0,1,1 \ No newline at end of file diff --git a/_control/extract_groupby_large.sql b/_control/extract_groupby_large.sql new file mode 100644 index 00000000..9ce180d8 --- /dev/null +++ b/_control/extract_groupby_large.sql @@ -0,0 +1,5 @@ +COPY G1_1e9_1e2_0_0 to 'G1_1e9_1e2_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e9_1e1_0_0 to 'G1_1e9_1e1_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e9_2e0_0_0 to 'G1_1e9_2e0_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e9_1e2_0_1 to 'G1_1e9_1e2_0_1.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e9_1e2_5_0 to 'G1_1e9_1e2_5_0.csv' (FORMAT CSV, HEADER 1); \ No newline at end of file diff --git a/_control/extract_groupby_small.sql b/_control/extract_groupby_small.sql new file mode 100644 index 00000000..7c76a790 --- /dev/null +++ b/_control/extract_groupby_small.sql @@ -0,0 +1,10 @@ +COPY G1_1e7_1e2_0_0 to 'G1_1e7_1e2_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e7_1e1_0_0 to 'G1_1e7_1e1_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e7_2e0_0_0 to 'G1_1e7_2e0_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e7_1e2_0_1 to 'G1_1e7_1e2_0_1.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e7_1e2_5_0 to 'G1_1e7_1e2_5_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e8_1e2_0_0 to 'G1_1e8_1e2_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e8_1e1_0_0 to 'G1_1e8_1e1_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e8_2e0_0_0 to 'G1_1e8_2e0_0_0.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e8_1e2_0_1 to 'G1_1e8_1e2_0_1.csv' (FORMAT CSV, HEADER 1); +COPY G1_1e8_1e2_5_0 to 'G1_1e8_1e2_5_0.csv' (FORMAT CSV, HEADER 1); diff --git a/_control/extract_join_small.sql b/_control/extract_join_small.sql new file mode 100644 index 00000000..72323dc7 --- /dev/null +++ b/_control/extract_join_small.sql 
@@ -0,0 +1,77 @@ +COPY J1_1e7_NA_0_0 to 'J1_1e7_NA_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_NA_5_0 to 'J1_1e7_NA_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_NA_0_1 to 'J1_1e7_NA_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_NA_0_0 to 'J1_1e8_NA_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_NA_5_0 to 'J1_1e8_NA_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_NA_0_1 to 'J1_1e8_NA_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e9_NA_0_0 to 'J1_1e9_NA_0_0.csv' (FORMAT CSV, HEADER 1); + +COPY J1_1e7_1e7_0_1 to 'J1_1e7_1e7_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e5_0_1 to 'J1_1e8_1e5_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e9_1e6_0_0 to 'J1_1e9_1e6_0_0.csv' (FORMAT CSV, HEADER 1); + +COPY J1_1e7_1e7_5_0 to 'J1_1e7_1e7_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e5_5_0 to 'J1_1e8_1e5_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e9_1e9_0_0 to 'J1_1e9_1e9_0_0.csv' (FORMAT CSV, HEADER 1); + +COPY J1_1e7_1e1_0_0 to 'J1_1e7_1e1_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_NA_0_0 to 'J1_1e7_NA_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e8_0_0 to 'J1_1e8_1e8_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e9_NA_0_0 to 'J1_1e9_NA_0_0.csv' (FORMAT CSV, HEADER 1); + +COPY J1_1e7_1e1_0_1 to 'J1_1e7_1e1_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_NA_0_1 to 'J1_1e7_NA_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e8_0_1 to 'J1_1e8_1e8_0_1.csv' (FORMAT CSV, HEADER 1); + +COPY J1_1e7_1e1_5_0 to 'J1_1e7_1e1_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_NA_5_0 to 'J1_1e7_NA_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e8_5_0 to 'J1_1e8_1e8_5_0.csv' (FORMAT CSV, HEADER 1); + + + + +COPY J1_1e7_1e4_0_0 TO 'J1_1e7_1e4_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e2_0_0 TO 'J1_1e8_1e2_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_NA_0_0 TO 'J1_1e8_NA_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_1e4_0_1 TO 'J1_1e7_1e4_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e2_0_1 TO 'J1_1e8_1e2_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_NA_0_1 TO 'J1_1e8_NA_0_1.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_1e4_5_0 TO 'J1_1e7_1e4_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e2_5_0 TO 'J1_1e8_1e2_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_NA_5_0 TO 'J1_1e8_NA_5_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e7_1e7_0_0 TO 'J1_1e7_1e7_0_0.csv' (FORMAT CSV, HEADER 1); +COPY J1_1e8_1e5_0_0 TO 'J1_1e8_1e5_0_0.csv' (FORMAT CSV, HEADER 1); + + +CREATE TABLE J1_1e7_1e4_0_0 as select * from 'J1_1e7_1e4_0_0.csv'; +CREATE TABLE J1_1e8_1e2_0_0 as select * from 'J1_1e8_1e2_0_0.csv'; +CREATE TABLE J1_1e8_NA_0_0 as select * from 'J1_1e8_NA_0_0.csv'; +CREATE TABLE J1_1e7_1e4_0_1 as select * from 'J1_1e7_1e4_0_1.csv'; +CREATE TABLE J1_1e8_1e2_0_1 as select * from 'J1_1e8_1e2_0_1.csv'; +CREATE TABLE J1_1e8_NA_0_1 as select * from 'J1_1e8_NA_0_1.csv'; +CREATE TABLE J1_1e7_1e4_5_0 as select * from 'J1_1e7_1e4_5_0.csv'; +CREATE TABLE J1_1e8_1e2_5_0 as select * from 'J1_1e8_1e2_5_0.csv'; +CREATE TABLE J1_1e8_NA_5_0 as select * from 'J1_1e8_NA_5_0.csv'; +CREATE TABLE J1_1e7_1e7_0_0 as select * from 'J1_1e7_1e7_0_0.csv'; +CREATE TABLE J1_1e8_1e5_0_0 as select * from 'J1_1e8_1e5_0_0.csv'; +CREATE TABLE J1_1e7_1e7_0_1 as select * from 'J1_1e7_1e7_0_1.csv'; +CREATE TABLE J1_1e8_1e5_0_1 as select * from 'J1_1e8_1e5_0_1.csv'; +CREATE TABLE J1_1e7_1e7_5_0 as select * from 'J1_1e7_1e7_5_0.csv'; +CREATE TABLE J1_1e8_1e5_5_0 as select * from 'J1_1e8_1e5_5_0.csv'; +CREATE TABLE J1_1e7_1e1_0_0 as select * from 'J1_1e7_1e1_0_0.csv'; +CREATE TABLE J1_1e7_NA_0_0 as select * from 'J1_1e7_NA_0_0.csv'; +CREATE TABLE J1_1e8_1e8_0_0 as select * from 
'J1_1e8_1e8_0_0.csv'; +CREATE TABLE J1_1e7_1e1_0_1 as select * from 'J1_1e7_1e1_0_1.csv'; +CREATE TABLE J1_1e7_NA_0_1 as select * from 'J1_1e7_NA_0_1.csv'; +CREATE TABLE J1_1e8_1e8_0_1 as select * from 'J1_1e8_1e8_0_1.csv'; +CREATE TABLE J1_1e7_1e1_5_0 as select * from 'J1_1e7_1e1_5_0.csv'; +CREATE TABLE J1_1e7_NA_5_0 as select * from 'J1_1e7_NA_5_0.csv'; +CREATE TABLE J1_1e8_1e8_5_0 as select * from 'J1_1e8_1e8_5_0.csv'; + + + +create table J1_1e9_1e3_0_0 as select * from 'J1_1e9_1e3_0_0.csv'; +CREATE TABLE J1_1e9_NA_0_0 as select * from 'J1_1e9_NA_0_0.csv'; +create table J1_1e9_1e6_0_0 as select * from 'J1_1e9_1e6_0_0.csv'; +create table J1_1e9_1e9_0_0 as select * from 'J1_1e9_1e9_0_0.csv'; + diff --git a/_control/timeout.csv b/_control/timeout.csv index 4860a032..fdbb824c 100644 --- a/_control/timeout.csv +++ b/_control/timeout.csv @@ -2,9 +2,11 @@ task,in_rows,minutes groupby,1e7,10 groupby,1e8,30 groupby,1e9,60 +groupby,1e10,360 join,1e7,10 join,1e8,30 join,1e9,60 +join,1e10,360 groupby2014,1e7,60 groupby2014,1e8,120 groupby2014,1e9,180 diff --git a/_report/index.Rmd b/_report/index.Rmd index b92250ca..b45a74c3 100644 --- a/_report/index.Rmd +++ b/_report/index.Rmd @@ -131,32 +131,29 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except ![](./groupby/G1_1e9_1e2_0_0_advanced.png) -### join {.tabset .tabset-fade .tabset-pills} - -#### 0.5 GB +#### 500 GB ##### **basic questions** -![](./join/J1_1e7_NA_0_0_basic.png) +![](./groupby/G1_1e10_1e4_10_0_basic.png) - +![](./groupby/G1_1e10_1e4_10_0_advanced.png) -#### 5 GB {.active} +### join {.tabset .tabset-fade .tabset-pills} + +#### 0.5 GB ##### **basic questions** -![](./join/J1_1e8_NA_0_0_basic.png) +![](./join/J1_1e7_NA_0_0_basic.png) +#### 5 GB {.active} - +![](./join/J1_1e8_NA_0_0_basic.png) #### 50 GB @@ -166,8 +163,6 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except --- diff --git a/_report/report.R b/_report/report.R index 01b27dd0..8367e458 100644 --- a/_report/report.R +++ b/_report/report.R @@ -13,6 +13,7 @@ get_data_levels = function() { in_rows = c("1e7","1e8","1e9") k_na_sort = c("1e2_0_0","1e1_0_0","2e0_0_0","1e2_0_1","1e2_5_0") groupby = paste("G1", paste(rep(in_rows, each=length(k_na_sort)), k_na_sort, sep="_"), sep="_") + groupby = c(groupby, "G1_1e10_1e4_10_0") ## join in_rows = c("1e7","1e8","1e9") k_na_sort = c("NA_0_0","NA_5_0","NA_0_1") @@ -21,6 +22,7 @@ get_data_levels = function() { in_rows = c("1e7","1e8","1e9") k_na_sort = "1e2_0_0" groupby2014 = paste("G0", paste(rep(in_rows, each=length(k_na_sort)), k_na_sort, sep="_"), sep="_") + list(groupby=groupby, join=join, groupby2014=groupby2014) } get_excluded_batch = function() { diff --git a/_run/partitioned_run.sh b/_run/partitioned_run.sh new file mode 100644 index 00000000..f29ba9b5 --- /dev/null +++ b/_run/partitioned_run.sh @@ -0,0 +1,9 @@ +# set machine type +./_run/run_small_medium.sh + +./_run/run_large.sh + +./_run/run_groupby_xl.sh + + +# call code to rename images \ No newline at end of file diff --git a/_run/run_groupby_xl.sh b/_run/run_groupby_xl.sh new file mode 100755 index 00000000..7bae1d27 --- /dev/null +++ b/_run/run_groupby_xl.sh @@ -0,0 +1,18 @@ +# get groupby x-large (500GB dataset) +aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby-500gb.duckdb data/groupby-500gb.duckdb + + +# expand the x-large (1e10 row) groupby dataset to csv +duckdb data/groupby-500gb.duckdb -c "copy G1_1e10_1e4_10_0 to 'data/G1_1e10_1e4_10_0.csv' (FORMAT CSV)" + + +cp _control/data_groupby_xlarge.csv _control/data.csv
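+# run.sh picks up the active datasets from _control/data.csv, so the copy above restricts this run to the 1e10 groupby dataset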
+ +echo "Running groupby x-large (500GB) datasets" +./run.sh + +### +echo "done..." +echo "removing data files" +rm data/*.csv +rm data/*.duckdb diff --git a/_run/run_large.sh b/_run/run_large.sh new file mode 100755 index 00000000..d5fd5730 --- /dev/null +++ b/_run/run_large.sh @@ -0,0 +1,34 @@ +# download and expand large data + +# get groupby large (0.5GB and 5GB datasets) +aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb +# get join small (0.5GB and 5GB datasets) +aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb + + +# expand groupby-small datasets to csv +duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)" +duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)" + +# expand join-small datasets to csv +duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_NA_0_0.csv' (FORMAT CSV)" +duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_0_0.csv' (FORMAT CSV)" +duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e6_0_0.csv' (FORMAT CSV)" +duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e3_0_0.csv' (FORMAT CSV)" + + +cp _control/data_large.csv _control/data.csv + + +echo "Running all solutions on large (50GB) datasets" +./run.sh + + +### +echo "done..." +echo "removing data files" +rm data/*.csv +rm data/*.duckdb \ No newline at end of file diff --git a/_run/run_small_medium.sh b/_run/run_small_medium.sh new file mode 100755 index 00000000..4b6a14f4 --- /dev/null +++ b/_run/run_small_medium.sh @@ -0,0 +1,60 @@ +# first download and expand small data + +# get groupby small (0.5GB and 5GB datasets) +aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb +# get join small (0.5GB and 5GB datasets) +aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb + + +# expand groupby-small datasets to csv +duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'data/G1_1e7_1e2_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'data/G1_1e7_1e1_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'data/G1_1e7_2e0_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'data/G1_1e7_1e2_0_1.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'data/G1_1e7_1e2_5_0.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'data/G1_1e8_1e2_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'data/G1_1e8_1e1_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'data/G1_1e8_2e0_0_0.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'data/G1_1e8_1e2_0_1.csv' (FORMAT CSV)" +duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'data/G1_1e8_1e2_5_0.csv' (FORMAT CSV)" + +# expand join-small datasets to csv +duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_0 to 'data/J1_1e7_1e1_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_5_0 to 
+duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_1 to 'data/J1_1e7_NA_0_1.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_0 to 'data/J1_1e8_1e5_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_5_0 to 'data/J1_1e8_1e8_5_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_1 to 'data/J1_1e7_1e1_0_1.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_0 to 'data/J1_1e7_1e7_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_NA_5_0 to 'data/J1_1e7_NA_5_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_1 to 'data/J1_1e8_1e5_0_1.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_0 to 'data/J1_1e8_NA_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_5_0 to 'data/J1_1e7_1e1_5_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_1 to 'data/J1_1e7_1e7_0_1.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_0 to 'data/J1_1e8_1e2_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_5_0 to 'data/J1_1e8_1e5_5_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_1 to 'data/J1_1e8_NA_0_1.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_0 to 'data/J1_1e7_1e4_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_5_0 to 'data/J1_1e7_1e7_5_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_1 to 'data/J1_1e8_1e2_0_1.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_0 to 'data/J1_1e8_1e8_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_NA_5_0 to 'data/J1_1e8_NA_5_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_1 to 'data/J1_1e7_1e4_0_1.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_0 to 'data/J1_1e7_NA_0_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_5_0 to 'data/J1_1e8_1e2_5_0.csv' (FORMAT CSV)" +duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_1 to 'data/J1_1e8_1e8_0_1.csv' (FORMAT CSV)" + + +cp _control/data_small.csv _control/data.csv + + +echo "Running all solutions on small (0.5GB and 5GB) datasets" +./run.sh + + +### +echo "done..." +echo "removing small data files" +rm data/*.csv +rm data/*.duckdb + diff --git a/_utils/install_all_solutions.py b/_setup_utils/install_all_solutions.py similarity index 100% rename from _utils/install_all_solutions.py rename to _setup_utils/install_all_solutions.py diff --git a/_setup_utils/mount.sh b/_setup_utils/mount.sh new file mode 100755 index 00000000..72c5faea --- /dev/null +++ b/_setup_utils/mount.sh @@ -0,0 +1,15 @@ +# script to format and mount the benchmark volume and clone the repo. + +# remove a leftover instance mount +rm -rf ~/db-benchmark-metal + +# format the mount +sudo mkfs -t xfs /dev/nvme1n1 + +mkdir ~/db-benchmark-metal +# mount the nvme volume +sudo mount /dev/nvme1n1 ~/db-benchmark-metal +# change ownership of the volume +sudo chown -R ubuntu ~/db-benchmark-metal/ + +git clone https://github.com/duckdblabs/db-benchmark.git ~/db-benchmark-metal \ No newline at end of file diff --git a/_setup_utils/mount_and_install_solutions.sh b/_setup_utils/mount_and_install_solutions.sh new file mode 100755 index 00000000..54166e27 --- /dev/null +++ b/_setup_utils/mount_and_install_solutions.sh @@ -0,0 +1,27 @@ +# script to mount the data volume and install all solutions.
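+# assumes the instance exposes the benchmark volume as /dev/nvme1n1; mount.sh below formats it and mounts it at ~/db-benchmark-metal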
+# mount the data +./_setup_utils/mount.sh + +# setup all the solutions on db-benchmark-metal. +# creates the necessary python virtual environments and creates the r-libraries +# needed +cd ~/db-benchmark-metal && source path.env && python3 _setup_utils/install_all_solutions.py all + + + +# setup mount for clickhouse spill +# sudo mkfs -t xfs /dev/nvme1n1 +# sudo mkdir /var/lib/clickhouse-nvme-mount/ +# sudo mount /dev/nvme1n1 /var/lib/clickhouse-nvme-mount/ +# # not sure if below is necessary. +# sudo cp -a /var/lib/clickhouse/. /var/lib/clickhouse-nvme-mount/ +# # change ownership of new mount to clickhouse +# sudo chown -R clickhouse:clickhouse /var/lib/clickhouse-nvme-mount/ +# sudo chown -R clickhouse:clickhouse /dev/nvme1n1 + +# # add config so clickhouse knows to use the mount to spill data +# sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml + +echo "------------------------------------------" +echo "------------------------------------------" +echo "READY TO RUN BENCHMARK. ./run.sh" diff --git a/_utils/prep_solutions.py b/_setup_utils/prep_solutions.py similarity index 100% rename from _utils/prep_solutions.py rename to _setup_utils/prep_solutions.py diff --git a/_utils/setup-small.sh b/_setup_utils/setup-small.sh similarity index 96% rename from _utils/setup-small.sh rename to _setup_utils/setup-small.sh index e54fb555..c7130cd2 100755 --- a/_utils/setup-small.sh +++ b/_setup_utils/setup-small.sh @@ -14,7 +14,7 @@ sudo apt-get -qq install -y r-base-dev virtualenv sudo apt-get -qq install openjdk-8-jdk sudo apt-get install -y zlib1g-dev -sudo apt-get install -y pandoc +sudo apt-get install -y pandoc unzip # update virtualenv python3 -m pip install virtualenv diff --git a/_utils/format_and_mount.sh b/_utils/format_and_mount.sh deleted file mode 100755 index 92eae753..00000000 --- a/_utils/format_and_mount.sh +++ /dev/null @@ -1,90 +0,0 @@ -# script to format mount and copy data. - -# remove a leftover instance mount -rm -rf ~/db-benchmark-metal - -# format the mount -sudo mkfs -t xfs /dev/nvme0n1 - -mkdir ~/db-benchmark-metal -# mount the nvme volumn -sudo mount /dev/nvme0n1 ~/db-benchmark-metal -# change ownsership of the volume -sudo chown -R ubuntu ~/db-benchmark-metal/ - -git clone https://github.com/duckdblabs/db-benchmark.git ~/db-benchmark-metal - -# if you have an EBS volume, you can generate the data once, save it on the ebs volume, and transfer it -# each time. 
- -if [[ $# -gt 0 ]] -then - echo "Creating data" - mkdir -p ~/db-benchmark-metal/data/ - cd ~/db-benchmark-metal/data/ - echo "Creating 500mb group by datasets" - Rscript ../_data/groupby-datagen.R 1e7 1e2 0 0 - Rscript ../_data/groupby-datagen.R 1e7 1e1 0 0 - Rscript ../_data/groupby-datagen.R 1e7 2e0 0 0 - Rscript ../_data/groupby-datagen.R 1e7 1e2 0 1 - Rscript ../_data/groupby-datagen.R 1e7 1e2 5 0 - echo "Creating 5gb group by datasets" - Rscript ../_data/groupby-datagen.R 1e8 1e2 0 0 - Rscript ../_data/groupby-datagen.R 1e8 1e1 0 0 - Rscript ../_data/groupby-datagen.R 1e8 2e0 0 0 - Rscript ../_data/groupby-datagen.R 1e8 1e2 0 1 - Rscript ../_data/groupby-datagen.R 1e8 1e2 5 0 - echo "Creating 50gb group by datasets" - Rscript ../_data/groupby-datagen.R 1e9 1e2 0 0 - Rscript ../_data/groupby-datagen.R 1e9 1e1 0 0 - Rscript ../_data/groupby-datagen.R 1e9 2e0 0 0 - Rscript ../_data/groupby-datagen.R 1e9 1e2 0 1 - Rscript ../_data/groupby-datagen.R 1e9 1e2 5 0 - echo "Creating 500mb join datasets" - Rscript ../_data/join-datagen.R 1e7 0 0 - Rscript ../_data/join-datagen.R 1e7 5 0 - Rscript ../_data/join-datagen.R 1e7 0 1 - echo "Creating 5gb join datasets" - Rscript ../_data/join-datagen.R 1e8 0 0 - Rscript ../_data/join-datagen.R 1e8 5 0 - Rscript ../_data/join-datagen.R 1e8 0 1 - echo "Creating 50gb join datasets" - Rscript ../_data/join-datagen.R 1e9 0 0 - cd .. -elif [[ ! -d "~/db-benchark/data" ]] -then - echo "no arguments passed. Copying data..." - echo "ERROR: directory ~/db-benchmark/data does not exist" -else - mkdir -p ~/db-benchmark-metal/data/ - cd ~/db-benchmark-metal/data/ - echo "Copying data from ~/db-benchark/data" - cp ~/db-benchmark/data/*.csv - cd ~/db-benchmark-metal -fi - - -./_launcher/setup.sh - -# setup all the solutions on db-benchmark-metal. -# creates the necessary python virtual environments and creates the r-libraries -# needed -source path.env && python3 _utils/install_all_solutions.py all - - -# setup mount for clickhouse spill -sudo mkfs -t xfs /dev/nvme1n1 -sudo mkdir /var/lib/clickhouse-nvme-mount/ -sudo mount /dev/nvme1n1 /var/lib/clickhouse-nvme-mount/ -# not sure if below is necessary. -sudo cp -a /var/lib/clickhouse/. /var/lib/clickhouse-nvme-mount/ -# change ownership of new mount to clickhouse -sudo chown -R clickhouse:clickhouse /var/lib/clickhouse-nvme-mount/ -sudo chown -R clickhouse:clickhouse /dev/nvme1n1 - -# add config so clickhouse knows to use the mount to spill data -sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml - -echo "------------------------------------------" -echo "------------------------------------------" -echo "READY TO RUN BENCHMARK. ./run.sh" diff --git a/_utils/partitioned_run.sh b/_utils/partitioned_run.sh new file mode 100644 index 00000000..9fbb1299 --- /dev/null +++ b/_utils/partitioned_run.sh @@ -0,0 +1,4 @@ +./_run/run_small_medium.sh +./_run/run_large.sh +./_run/run_groupby_xl.sh +./_run/run_join_xl.sh \ No newline at end of file diff --git a/_utils/repro.sh b/_utils/repro.sh index a8df441f..aed446b0 100644 --- a/_utils/repro.sh +++ b/_utils/repro.sh @@ -38,7 +38,6 @@ cd .. 
Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependecies=TRUE, repos="https://cloud.r-project.org")' - source ./pandas/py-pandas/bin/activate python3 -m pip install --upgrade psutil python3 -m pip install --upgrade pandas diff --git a/_utils/sleep_and_run.sh b/_utils/sleep_and_run.sh deleted file mode 100755 index 9d6b2249..00000000 --- a/_utils/sleep_and_run.sh +++ /dev/null @@ -1,9 +0,0 @@ -while [ -f run.lock ] -do - sleep 1800 -done - - -rm run.lock - -./run.sh diff --git a/clickhouse/exec.sh b/clickhouse/exec.sh index 2d085cc1..89b27ca2 100755 --- a/clickhouse/exec.sh +++ b/clickhouse/exec.sh @@ -34,6 +34,7 @@ IS_SORTED=$(clickhouse-client --query "SELECT splitByChar('_','$SRC_DATANAME')[5 ON_DISK=0 if [ $1 == 'groupby' ]; then + ON_DISK=$(clickhouse-client --query "SELECT (splitByChar('_','$SRC_DATANAME')[2])::Float32 >= 1e10::Float32 FORMAT TSV") clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME" if [ $HAS_NULL -eq 1 ]; then if [ $IS_SORTED -eq 1 ]; then diff --git a/collapse/groupby-collapse.R b/collapse/groupby-collapse.R index 171bbdf8..9cdc09b4 100755 --- a/collapse/groupby-collapse.R +++ b/collapse/groupby-collapse.R @@ -13,12 +13,13 @@ task = "groupby" solution = "collapse" fun = "group_by" cache = TRUE -on_disk = FALSE data_name = Sys.getenv("SRC_DATANAME") src_grp = file.path("data", paste(data_name, "csv", sep=".")) cat(sprintf("loading dataset %s\n", data_name)) +on_disk = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])>=1e10 + x = data.table::fread(src_grp, showProgress=FALSE, stringsAsFactors=TRUE, na.strings="", data.table=FALSE) print(nrow(x)) gc() diff --git a/collapse/join-collapse.R b/collapse/join-collapse.R index 534cbe83..4dc18cc7 100755 --- a/collapse/join-collapse.R +++ b/collapse/join-collapse.R @@ -21,6 +21,8 @@ src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names stopifnot(length(src_jn_y)==3L) cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", "))) +on_disk = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])>=1e10 + x = data.table::fread(src_jn_x, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings="") data.table::setDF(x) JN = lapply(sapply(simplify=FALSE, src_jn_y, data.table::fread, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings=""), as.data.frame) diff --git a/dask/setup-dask.sh b/dask/setup-dask.sh index c6fac985..3c24ab07 100755 --- a/dask/setup-dask.sh +++ b/dask/setup-dask.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -virtualenv dask/py-dask --python=python3.10 +virtualenv dask/py-dask --python=python3.12 source dask/py-dask/bin/activate # install binaries diff --git a/datafusion/groupby-datafusion.py b/datafusion/groupby-datafusion.py index af952de7..ee399376 100755 --- a/datafusion/groupby-datafusion.py +++ b/datafusion/groupby-datafusion.py @@ -32,12 +32,23 @@ def ans_shape(batches): on_disk = "FALSE" data_name = os.environ["SRC_DATANAME"] +mount_point = os.environ["MOUNT_POINT"] src_grp = os.path.join("data", data_name + ".csv") print("loading dataset %s" % data_name, flush=True) +scale_factor = data_name.replace("G1_","")[:4].replace("_", "") +on_disk = 'TRUE' if float(scale_factor) >= 1e10 else 'FALSE' + data = pacsv.read_csv(src_grp, convert_options=pacsv.ConvertOptions(auto_dict_encode=True)) + ctx = df.SessionContext() +if on_disk: + runtime = df.RuntimeConfig().with_temp_file_path(f"{mount_point}/datafusion/") + config = (df.SessionConfig()) + ctx = df.SessionContext(config, runtime) 
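+# the runtime configured above lets DataFusion write temporary/spill files under $MOUNT_POINT/datafusion/ instead of keeping all intermediate state in memory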
+ + ctx.register_record_batches("x", [data.to_batches()]) in_rows = data.num_rows diff --git a/datafusion/join-datafusion.py b/datafusion/join-datafusion.py index 1d598757..ee106bd2 100755 --- a/datafusion/join-datafusion.py +++ b/datafusion/join-datafusion.py @@ -32,6 +32,8 @@ def ans_shape(batches): on_disk = "FALSE" data_name = os.environ["SRC_DATANAME"] +mount_point = os.environ["MOUNT_POINT"] + src_jn_x = os.path.join("data", data_name + ".csv") y_data_name = join_to_tbls(data_name) src_jn_y = [os.path.join("data", y_data_name[0] + ".csv"), os.path.join("data", y_data_name[1] + ".csv"), os.path.join("data", y_data_name[2] + ".csv")] @@ -39,9 +41,18 @@ def ans_shape(batches): raise Exception("Something went wrong in preparing files used for join") print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[2] + ", " + y_data_name[2], flush=True) +scale_factor = data_name.replace("J1_","")[:4].replace("_", "") +on_disk = 'TRUE' if float(scale_factor) >= 1e10 else 'FALSE' + ctx = df.SessionContext() +if on_disk: + runtime = df.RuntimeConfig().with_temp_file_path(f"{mount_point}/datafusion/") + config = (df.SessionConfig()) + ctx = df.SessionContext(config, runtime) + + x_data = pacsv.read_csv(src_jn_x, convert_options=pacsv.ConvertOptions(auto_dict_encode=True)) ctx.register_record_batches("x", [x_data.to_batches()]) small_data = pacsv.read_csv(src_jn_y[0], convert_options=pacsv.ConvertOptions(auto_dict_encode=True)) diff --git a/duckdb/groupby-duckdb.R b/duckdb/groupby-duckdb.R index 13703e74..e193dd44 100755 --- a/duckdb/groupby-duckdb.R +++ b/duckdb/groupby-duckdb.R @@ -21,7 +21,7 @@ cat(sprintf("loading dataset %s\n", data_name)) db_file = sprintf('%s-%s-%s.db', solution, task, data_name) -on_disk = FALSE # as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])>=1e9 +on_disk = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])>=1e10 uses_NAs = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][4L])>0 if (on_disk) { print("using disk memory-mapped data storage") diff --git a/duckdb/join-duckdb.R b/duckdb/join-duckdb.R index 7da2f26e..aad3a787 100755 --- a/duckdb/join-duckdb.R +++ b/duckdb/join-duckdb.R @@ -23,6 +23,7 @@ cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=" duckdb_join_db = sprintf('%s_%s_%s.db', solution, task, data_name) on_disk = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])>=1e9 + less_cores = as.numeric(strsplit("J1_1e7_NA_0_0", "_", fixed=TRUE)[[1L]][2L])<=1e7 uses_NAs = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][4L])>0 @@ -41,7 +42,6 @@ if (less_cores) { } invisible(dbExecute(con, sprintf("PRAGMA THREADS=%d", ncores))) -invisible(dbExecute(con, "SET memory_limit='220GB'")) git = dbGetQuery(con, "SELECT source_id FROM pragma_version()")[[1L]] invisible({ @@ -98,8 +98,13 @@ question = "small inner on int" # q1 fun = "inner_join" +table_type = "" +if (on_disk) { + table_type = "TEMP" +} + t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -107,7 +112,7 @@ chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FRO write.log(run=1L, task=task, data=data_name, in_rows=in_nr, 
question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) invisible(dbExecute(con, "DROP TABLE IF EXISTS ans")) t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -122,7 +127,7 @@ fun = "inner_join" t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -130,7 +135,7 @@ chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FRO write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) invisible(dbExecute(con, "DROP TABLE IF EXISTS ans")) t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -144,7 +149,7 @@ question = "medium outer on int" # q3 fun = "left_join" t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -152,7 +157,7 @@ chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FRO write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) invisible(dbExecute(con, "DROP TABLE IF EXISTS ans")) t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium 
USING (id2)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -166,7 +171,7 @@ question = "medium inner on factor" # q4 fun = "inner_join" t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -174,7 +179,7 @@ chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FRO write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) invisible(dbExecute(con, "DROP TABLE IF EXISTS ans")) t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -188,7 +193,7 @@ question = "big inner on int" # q5 fun = "inner_join" t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() @@ -196,7 +201,7 @@ chkt = system.time(chk<-dbGetQuery(con, "SELECT SUM(v1) AS v1, SUM(v2) AS v2 FRO write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) invisible(dbExecute(con, "DROP TABLE IF EXISTS ans")) t = system.time({ - dbExecute(con, "CREATE TEMP TABLE ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)") + dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)", table_type)) print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0")))) })[["elapsed"]] m = memory_usage() diff --git a/juliadf/groupby-juliadf.jl b/juliadf/groupby-juliadf.jl index 9ed5e7cb..b01ea022 100755 --- a/juliadf/groupby-juliadf.jl +++ b/juliadf/groupby-juliadf.jl @@ -17,6 +17,7 @@ solution = 
"juliadf"; fun = "by"; cache = true; on_disk = false; +isondisk(indata) = parse(Float64, split(indata, "_")[2])>=10^10 data_name = ENV["SRC_DATANAME"]; src_grp = string("data/", data_name, ".csv"); @@ -31,6 +32,8 @@ x = CSV.read(src_grp, in_rows = size(x, 1); println(in_rows); flush(stdout); +on_disk = isondisk(data_name) + task_init = time(); print("grouping...\n"); flush(stdout); diff --git a/juliadf/join-juliadf.jl b/juliadf/join-juliadf.jl index 74b16aaa..020aee63 100755 --- a/juliadf/join-juliadf.jl +++ b/juliadf/join-juliadf.jl @@ -16,6 +16,7 @@ solution = "juliadf"; fun = "join"; cache = true; on_disk = false; +isondisk(indata) = parse(Float64, split(indata, "_")[2])>=10^10 data_name = ENV["SRC_DATANAME"]; src_jn_x = string("data/", data_name, ".csv"); @@ -25,6 +26,8 @@ if length(src_jn_y) != 3 error("Something went wrong in preparing files used for join") end; +on_disk = isondisk(data_name) + println(string("loading datasets ", data_name, ", ", y_data_name[1], ", ", y_data_name[2], ", ", y_data_name[3])); flush(stdout); x_df = CSV.read(src_jn_x, diff --git a/juliads/groupby-juliads.jl b/juliads/groupby-juliads.jl index da518cc9..9d4be11b 100755 --- a/juliads/groupby-juliads.jl +++ b/juliads/groupby-juliads.jl @@ -20,6 +20,7 @@ solution = "juliads"; fun = "combine"; cache = true; on_disk = false; +isondisk(indata) = parse(Float64, split(indata, "_")[2])>=10^10 data_name = ENV["SRC_DATANAME"]; src_grp = string("data/", data_name, ".csv"); @@ -28,6 +29,8 @@ println(string("loading dataset ", data_name)); flush(stdout); x = filereader(src_grp, types=[Characters{5}, Characters{5}, Characters{12}, Int32, Int32, Int32, Int32, Int32, Float64]); modify!(x, 1:3 => PooledArray) +on_disk = isondisk(data_name) + in_rows = size(x, 1); println(in_rows); flush(stdout); diff --git a/juliads/join-juliads.jl b/juliads/join-juliads.jl index 594e6049..28b725fb 100755 --- a/juliads/join-juliads.jl +++ b/juliads/join-juliads.jl @@ -21,7 +21,7 @@ solution = "juliads"; fun = "join"; cache = true; on_disk = false; -isondisk(indata) = false # It seems that the new machine has enough memory - parse(Float64, split(indata, "_")[2])>=10^9 +isondisk(indata) = parse(Float64, split(indata, "_")[2])>=10^10 data_name = ENV["SRC_DATANAME"]; src_jn_x = string("data/", data_name, ".csv"); @@ -31,6 +31,8 @@ if length(src_jn_y) != 3 error("Something went wrong in preparing files used for join") end; +on_disk = isondisk(data_name) + println(string("loading datasets ", data_name, ", ", y_data_name[1], ", ", y_data_name[2], ", ", y_data_name[3])); flush(stdout); # temporary file which will be deleted after the run - usually located at /tmp/ diff --git a/path.env b/path.env index 86335799..d4368ee1 100644 --- a/path.env +++ b/path.env @@ -1,4 +1,4 @@ export JULIA_HOME=/opt/julia-1.9.2 export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 export PATH=$PATH:$JULIA_HOME/bin -export MOUNT_POINT=$HOME/db-benchmark-metal \ No newline at end of file +export MOUNT_POINT=$HOME/db-benchmark-metal diff --git a/polars/groupby-polars.py b/polars/groupby-polars.py index df3cb77a..df60d489 100755 --- a/polars/groupby-polars.py +++ b/polars/groupby-polars.py @@ -27,6 +27,9 @@ x = (pl.read_csv(src_grp, schema_overrides={"id4":pl.Int32, "id5":pl.Int32, "id6":pl.Int32, "v1":pl.Int32, "v2":pl.Int32, "v3":pl.Float64}, low_memory=True, rechunk=True) .with_columns(pl.col(["id1", "id2", "id3"]).cast(pl.Categorical))) +scale_factor = data_name.replace("G1_","")[:4].replace("_", "") +on_disk = 'TRUE' if float(scale_factor) >= 1e10 else 'FALSE' + in_rows 
= x.shape[0] x.write_ipc(f"{mount_point}/polars/tmp.ipc") del x diff --git a/polars/join-polars.py b/polars/join-polars.py index 91e793b0..3f05f84b 100755 --- a/polars/join-polars.py +++ b/polars/join-polars.py @@ -24,6 +24,8 @@ if len(src_jn_y) != 3: raise Exception("Something went wrong in preparing files used for join") +on_disk = 'TRUE' if float(scale_factor) >= 1e10 else 'FALSE' + print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[2] + ", " + y_data_name[2], flush=True) with pl.StringCache(): diff --git a/run.conf b/run.conf index c019b15f..f151bf28 100644 --- a/run.conf +++ b/run.conf @@ -16,5 +16,5 @@ export DO_REPORT=true export DO_PUBLISH=false # logging and timing files -export CSV_LOGS_FILE=logs.csv -export CSV_TIME_FILE=time.csv +export CSV_LOGS_FILE="logs.csv" +export CSV_TIME_FILE="time.csv" diff --git a/run.sh b/run.sh index 27a7b141..77337e0b 100755 --- a/run.sh +++ b/run.sh @@ -29,6 +29,8 @@ else Rscript -e 'swap_all<-data.table::fread("free -h | grep Swap", header=FALSE)[, -1L][, as.numeric(gsub("[^0-9.]", "", unlist(.SD)))]; swap_off<-!is.na(s<-sum(swap_all)) && s==0; q("no", status=as.numeric(swap_off))' && echo "# Benchmark run $BATCH aborted. swap is enabled, 'free -h' has to report only 0s for Swap, run 'swapoff -a' before calling 'run.sh'" && exit; fi + + # ensure directories exists mkdir -p ./out if [[ ! -d ./data ]]; then echo "# Benchmark run $BATCH aborted. './data' directory does not exists" && exit; fi;