Changes from all commits
32 commits
837149c
add 500GB runs
Tmonster Sep 4, 2024
24b6e91
add more helper files
Tmonster Sep 4, 2024
69d68bc
added 500GB. need to modify some solutions to go on disk for the 500G…
Tmonster Sep 4, 2024
96bb929
add back in on_disk check for some solutions
Tmonster Sep 4, 2024
758be4b
add script to set up and run benchmark
Tmonster Sep 5, 2024
f7f2a5b
make it easier to choose what gets run
Tmonster Sep 5, 2024
10a90c9
fixed some scripts
Tmonster Sep 5, 2024
28754db
change ~duckdb to duckdb
Tmonster Sep 5, 2024
aa76eb5
fix more path issues
Tmonster Sep 5, 2024
ae508c6
one last fix
Tmonster Sep 5, 2024
07ea245
change permissions on run files
Tmonster Sep 5, 2024
67b1ec5
move -c
Tmonster Sep 5, 2024
c64fd3e
small update
Tmonster Sep 5, 2024
1179bf4
fix polar src_dataname
Tmonster Sep 5, 2024
a637af5
fix polar scale factor again
Tmonster Sep 5, 2024
bcb2831
neato
Tmonster Sep 5, 2024
66dc1a6
modify setup script to be more modular
Tmonster Sep 6, 2024
39fdd45
use aws s3 copy and create a _setup_utils specifically for setting up…
Tmonster Sep 6, 2024
6a4c1bc
fix run.conf and run.sh
Tmonster Sep 6, 2024
aef8e50
modify regression benchmark runner
Tmonster Sep 6, 2024
5b31321
fix some datafusion things
Tmonster Sep 6, 2024
960c07c
datafusion needs better scale factor
Tmonster Sep 6, 2024
1dce3c6
fix datafusion
Tmonster Sep 6, 2024
894eeef
some more updates to duckdb
Tmonster Sep 10, 2024
293d5d0
typo
Tmonster Sep 10, 2024
a3cba57
fix merge conflicts
Tmonster Sep 12, 2024
1a30c75
Merge branch 'add_500GB_run' of github.com:Tmonster/db-benchmark into…
Tmonster Sep 12, 2024
a6dbf35
add new line to path.env
Tmonster Sep 12, 2024
7563d4a
add datafusion ability to go off disk
Tmonster Sep 12, 2024
0b0e43f
some updates
Tmonster Sep 13, 2024
31c1f29
remove code to run 500GB join.
Tmonster Sep 18, 2024
30fa0ed
add code to fix report
Tmonster Sep 18, 2024
16 changes: 8 additions & 8 deletions .github/workflows/regression.yml
@@ -36,7 +36,7 @@ jobs:

- name: Install libraries
shell: bash
run: ./_utils/setup-small.sh
run: ./_setup_utils/setup-small.sh

- name: Generate 500mb datasets
shell: bash
@@ -48,7 +48,7 @@ jobs:

- name: Install all solutions
shell: bash
run: source path.env && python3 _utils/install_all_solutions.py ${{ matrix.solution }}
run: source path.env && python3 ./_setup_utils/install_all_solutions.py ${{ matrix.solution }}

- name: Turn swap off
shell: bash
@@ -68,14 +68,14 @@ jobs:
- name: Run mini GroupBy benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }}
python3 _setup_utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }}
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh

- name: Run mini Join benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=join --solution=${{ matrix.solution }}
python3 _setup_utils/prep_solutions.py --task=join --solution=${{ matrix.solution }}
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh

@@ -123,7 +123,7 @@ jobs:

- name: Install libraries
shell: bash
run: ./_utils/setup-small.sh
run: ./_setup_utils/setup-small.sh

- name: Generate 500mb datasets
shell: bash
@@ -135,7 +135,7 @@ jobs:

- name: Install all solutions
shell: bash
run: source path.env && python3 _utils/install_all_solutions.py all
run: source path.env && python3 _setup_utils/install_all_solutions.py all

- name: Turn swap off
shell: bash
@@ -144,14 +144,14 @@ jobs:
- name: Run mini GroupBy benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=groupby --solution=all
python3 _setup_utils/prep_solutions.py --task=groupby --solution=all
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh

- name: Run mini Join benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=join --solution=all
python3 _setup_utils/prep_solutions.py --task=join --solution=all
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh

9 changes: 8 additions & 1 deletion _benchplot/benchplot.R
@@ -1,8 +1,15 @@
## Based on Matt Dowle scripts from 2014
## https://github.com/h2oai/db-benchmark/commit/fce1b8c9177afb49471fcf483a438f619f1a992b
## Original grouping benchmark can be found in: https://github.com/Rdatatable/data.table/wiki/Benchmarks-:-Grouping
suppressPackageStartupMessages(library(bit64))

format_comma = function(x) format(as.integer(x), big.mark=",")
format_comma = function(x) {
  if (x == 1e10) {
    "10,000,000,000"
  } else {
    format(as.integer64(x), big.mark=",")
  }
}
format_num = function(x, digits=3L) { # at least 3+1 chars on output, there is surely some setting to achieve that better with base R but it is not obvious to find that among all features there
cx = sprintf("%0.2f", x)
int = sapply(strsplit(cx, ".", fixed=TRUE), `[`, 1L)
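For context on the special case above: base R's `as.integer()` is 32-bit and overflows to `NA` above `.Machine$integer.max` (about 2.1e9), so the new 1e10 row count cannot go through the old code path, which is presumably why the function now routes through `bit64::as.integer64()` and hard-codes the 1e10 label. A minimal sketch of the behavior being worked around (the claim that `format()` on an `integer64` does not reliably honor `big.mark` is an assumption inferred from the hard-coded branch):

```r
library(bit64)

# 32-bit integer overflow: 1e10 cannot be represented as a base R integer.
as.integer(1e10)       # NA, with a coercion warning
.Machine$integer.max   # 2147483647

# integer64 holds the value exactly.
as.integer64(1e10)     # 10000000000

# big.mark formatting works for values within 32-bit range...
format(as.integer(1e9), big.mark = ",")   # "1,000,000,000"
# ...while the 1e10 label is emitted via the hard-coded branch above.
```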
1 change: 1 addition & 0 deletions _control/data.csv
@@ -14,6 +14,7 @@ groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1
groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1
groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1
groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1
groupby,G1_1e10_1e4_10_0,1e10,1e4,10,0,1
join,J1_1e7_NA_0_0,1e7,NA,0,0,1
join,J1_1e7_NA_5_0,1e7,NA,5,0,1
join,J1_1e7_NA_0_1,1e7,NA,0,1,1
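The `data` name encodes the remaining columns: `G1_<nrow>_<k>_<na>_<sort>` for groupby (so the new `G1_1e10_1e4_10_0` is 1e10 rows, 1e4 groups, 10% NAs, unsorted) and `J1_<nrow>_<k>_<na>_<sort>` for join. A hedged sketch of a parser for that convention (the helper is hypothetical, not part of this PR):

```r
# Hypothetical helper: unpack a dataset name from _control/data.csv.
parse_data_name <- function(name) {
  parts <- strsplit(name, "_", fixed = TRUE)[[1]]
  list(
    task = if (parts[1] == "G1") "groupby" else "join",  # G1/J1 prefix
    nrow = parts[2],   # e.g. "1e10"
    k    = parts[3],   # group cardinality ("NA" for join datasets)
    na   = parts[4],   # percentage of NA values
    sort = parts[5]    # "1" if presorted, "0" otherwise
  )
}

str(parse_data_name("G1_1e10_1e4_10_0"))
```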
2 changes: 2 additions & 0 deletions _control/data_groupby_xlarge.csv
@@ -0,0 +1,2 @@
task,data,nrow,k,na,sort,active
groupby,G1_1e10_1e4_10_0,1e10,1e4,10,0,1
2 changes: 2 additions & 0 deletions _control/data_join_xlarge.csv
@@ -0,0 +1,2 @@
task,data,nrow,k,na,sort,active
join,J1_1e10_NA_0_0,1e10,NA,0,0,1
7 changes: 7 additions & 0 deletions _control/data_large.csv
@@ -0,0 +1,7 @@
task,data,nrow,k,na,sort,active
groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1
groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1
groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1
groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1
groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1
join,J1_1e9_NA_0_0,1e9,NA,0,0,1
17 changes: 17 additions & 0 deletions _control/data_small.csv
@@ -0,0 +1,17 @@
task,data,nrow,k,na,sort,active
groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1
groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1
groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1
groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1
groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1
groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1
groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1
groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1
groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1
groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1
join,J1_1e7_NA_0_0,1e7,NA,0,0,1
join,J1_1e7_NA_5_0,1e7,NA,5,0,1
join,J1_1e7_NA_0_1,1e7,NA,0,1,1
join,J1_1e8_NA_0_0,1e8,NA,0,0,1
join,J1_1e8_NA_5_0,1e8,NA,5,0,1
join,J1_1e8_NA_0_1,1e8,NA,0,1,1
5 changes: 5 additions & 0 deletions _control/extract_groupby_large.sql
@@ -0,0 +1,5 @@
COPY G1_1e9_1e2_0_0 TO 'G1_1e9_1e2_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e9_1e1_0_0 TO 'G1_1e9_1e1_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e9_2e0_0_0 TO 'G1_1e9_2e0_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e9_1e2_0_1 TO 'G1_1e9_1e2_0_1.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e9_1e2_5_0 TO 'G1_1e9_1e2_5_0.csv' (FORMAT CSV, HEADER 1);
10 changes: 10 additions & 0 deletions _control/extract_groupby_small.sql
@@ -0,0 +1,10 @@
COPY G1_1e7_1e2_0_0 TO 'G1_1e7_1e2_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e7_1e1_0_0 TO 'G1_1e7_1e1_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e7_2e0_0_0 TO 'G1_1e7_2e0_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e7_1e2_0_1 TO 'G1_1e7_1e2_0_1.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e7_1e2_5_0 TO 'G1_1e7_1e2_5_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e8_1e2_0_0 TO 'G1_1e8_1e2_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e8_1e1_0_0 TO 'G1_1e8_1e1_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e8_2e0_0_0 TO 'G1_1e8_2e0_0_0.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e8_1e2_0_1 TO 'G1_1e8_1e2_0_1.csv' (FORMAT CSV, HEADER 1);
COPY G1_1e8_1e2_5_0 TO 'G1_1e8_1e2_5_0.csv' (FORMAT CSV, HEADER 1);
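These extract scripts are mechanical, one `COPY ... TO` per active dataset, so they could plausibly be generated from the control files instead of maintained by hand; a hedged R sketch (the generator is hypothetical, not part of this PR):

```r
# Hypothetical generator for the COPY statements above, driven by a
# _control/data_*.csv control file.
datasets <- read.csv("_control/data_small.csv", colClasses = "character")
active   <- datasets[datasets$active == "1" & datasets$task == "groupby", ]
cat(sprintf("COPY %s TO '%s.csv' (FORMAT CSV, HEADER 1);",
            active$data, active$data),
    sep = "\n")
```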
77 changes: 77 additions & 0 deletions _control/extract_join_small.sql
@@ -0,0 +1,77 @@
COPY J1_1e7_NA_0_0 TO 'J1_1e7_NA_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_NA_5_0 TO 'J1_1e7_NA_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_NA_0_1 TO 'J1_1e7_NA_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_NA_0_0 TO 'J1_1e8_NA_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_NA_5_0 TO 'J1_1e8_NA_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_NA_0_1 TO 'J1_1e8_NA_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e9_NA_0_0 TO 'J1_1e9_NA_0_0.csv' (FORMAT CSV, HEADER 1);

COPY J1_1e7_1e7_0_1 TO 'J1_1e7_1e7_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e5_0_1 TO 'J1_1e8_1e5_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e9_1e6_0_0 TO 'J1_1e9_1e6_0_0.csv' (FORMAT CSV, HEADER 1);

COPY J1_1e7_1e7_5_0 TO 'J1_1e7_1e7_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e5_5_0 TO 'J1_1e8_1e5_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e9_1e9_0_0 TO 'J1_1e9_1e9_0_0.csv' (FORMAT CSV, HEADER 1);

COPY J1_1e7_1e1_0_0 TO 'J1_1e7_1e1_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_NA_0_0 TO 'J1_1e7_NA_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e8_0_0 TO 'J1_1e8_1e8_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e9_NA_0_0 TO 'J1_1e9_NA_0_0.csv' (FORMAT CSV, HEADER 1);

COPY J1_1e7_1e1_0_1 TO 'J1_1e7_1e1_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_NA_0_1 TO 'J1_1e7_NA_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e8_0_1 TO 'J1_1e8_1e8_0_1.csv' (FORMAT CSV, HEADER 1);

COPY J1_1e7_1e1_5_0 TO 'J1_1e7_1e1_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_NA_5_0 TO 'J1_1e7_NA_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e8_5_0 TO 'J1_1e8_1e8_5_0.csv' (FORMAT CSV, HEADER 1);

COPY J1_1e7_1e4_0_0 TO 'J1_1e7_1e4_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e2_0_0 TO 'J1_1e8_1e2_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_NA_0_0 TO 'J1_1e8_NA_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_1e4_0_1 TO 'J1_1e7_1e4_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e2_0_1 TO 'J1_1e8_1e2_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_NA_0_1 TO 'J1_1e8_NA_0_1.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_1e4_5_0 TO 'J1_1e7_1e4_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e2_5_0 TO 'J1_1e8_1e2_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_NA_5_0 TO 'J1_1e8_NA_5_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e7_1e7_0_0 TO 'J1_1e7_1e7_0_0.csv' (FORMAT CSV, HEADER 1);
COPY J1_1e8_1e5_0_0 TO 'J1_1e8_1e5_0_0.csv' (FORMAT CSV, HEADER 1);

CREATE TABLE J1_1e7_1e4_0_0 AS SELECT * FROM 'J1_1e7_1e4_0_0.csv';
CREATE TABLE J1_1e8_1e2_0_0 AS SELECT * FROM 'J1_1e8_1e2_0_0.csv';
CREATE TABLE J1_1e8_NA_0_0 AS SELECT * FROM 'J1_1e8_NA_0_0.csv';
CREATE TABLE J1_1e7_1e4_0_1 AS SELECT * FROM 'J1_1e7_1e4_0_1.csv';
CREATE TABLE J1_1e8_1e2_0_1 AS SELECT * FROM 'J1_1e8_1e2_0_1.csv';
CREATE TABLE J1_1e8_NA_0_1 AS SELECT * FROM 'J1_1e8_NA_0_1.csv';
CREATE TABLE J1_1e7_1e4_5_0 AS SELECT * FROM 'J1_1e7_1e4_5_0.csv';
CREATE TABLE J1_1e8_1e2_5_0 AS SELECT * FROM 'J1_1e8_1e2_5_0.csv';
CREATE TABLE J1_1e8_NA_5_0 AS SELECT * FROM 'J1_1e8_NA_5_0.csv';
CREATE TABLE J1_1e7_1e7_0_0 AS SELECT * FROM 'J1_1e7_1e7_0_0.csv';
CREATE TABLE J1_1e8_1e5_0_0 AS SELECT * FROM 'J1_1e8_1e5_0_0.csv';
CREATE TABLE J1_1e7_1e7_0_1 AS SELECT * FROM 'J1_1e7_1e7_0_1.csv';
CREATE TABLE J1_1e8_1e5_0_1 AS SELECT * FROM 'J1_1e8_1e5_0_1.csv';
CREATE TABLE J1_1e7_1e7_5_0 AS SELECT * FROM 'J1_1e7_1e7_5_0.csv';
CREATE TABLE J1_1e8_1e5_5_0 AS SELECT * FROM 'J1_1e8_1e5_5_0.csv';
CREATE TABLE J1_1e7_1e1_0_0 AS SELECT * FROM 'J1_1e7_1e1_0_0.csv';
CREATE TABLE J1_1e7_NA_0_0 AS SELECT * FROM 'J1_1e7_NA_0_0.csv';
CREATE TABLE J1_1e8_1e8_0_0 AS SELECT * FROM 'J1_1e8_1e8_0_0.csv';
CREATE TABLE J1_1e7_1e1_0_1 AS SELECT * FROM 'J1_1e7_1e1_0_1.csv';
CREATE TABLE J1_1e7_NA_0_1 AS SELECT * FROM 'J1_1e7_NA_0_1.csv';
CREATE TABLE J1_1e8_1e8_0_1 AS SELECT * FROM 'J1_1e8_1e8_0_1.csv';
CREATE TABLE J1_1e7_1e1_5_0 AS SELECT * FROM 'J1_1e7_1e1_5_0.csv';
CREATE TABLE J1_1e7_NA_5_0 AS SELECT * FROM 'J1_1e7_NA_5_0.csv';
CREATE TABLE J1_1e8_1e8_5_0 AS SELECT * FROM 'J1_1e8_1e8_5_0.csv';

CREATE TABLE J1_1e9_1e3_0_0 AS SELECT * FROM 'J1_1e9_1e3_0_0.csv';
CREATE TABLE J1_1e9_NA_0_0 AS SELECT * FROM 'J1_1e9_NA_0_0.csv';
CREATE TABLE J1_1e9_1e6_0_0 AS SELECT * FROM 'J1_1e9_1e6_0_0.csv';
CREATE TABLE J1_1e9_1e9_0_0 AS SELECT * FROM 'J1_1e9_1e9_0_0.csv';

2 changes: 2 additions & 0 deletions _control/timeout.csv
@@ -2,9 +2,11 @@ task,in_rows,minutes
groupby,1e7,10
groupby,1e8,30
groupby,1e9,60
groupby,1e10,360
join,1e7,10
join,1e8,30
join,1e9,60
join,1e10,360
groupby2014,1e7,60
groupby2014,1e8,120
groupby2014,1e9,180
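Timeouts are keyed on task and input row count, so the two new `1e10` rows give the 500GB runs a six-hour budget. A sketch of the lookup this table implies (the reading code is illustrative; the actual runner logic lives elsewhere in the repo):

```r
# Illustrative lookup against _control/timeout.csv; the real runner may differ.
timeouts <- read.csv("_control/timeout.csv", colClasses = "character")
get_timeout_minutes <- function(task, in_rows) {
  hit <- timeouts[timeouts$task == task & timeouts$in_rows == in_rows, ]
  if (nrow(hit) == 0L) stop("no timeout configured for ", task, " ", in_rows)
  as.numeric(hit$minutes)
}

get_timeout_minutes("groupby", "1e10")  # 360
```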
25 changes: 10 additions & 15 deletions _report/index.Rmd
@@ -131,32 +131,29 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except

![](./groupby/G1_1e9_1e2_0_0_advanced.png)

### join {.tabset .tabset-fade .tabset-pills}

#### 0.5 GB
#### 500 GB

##### **basic questions**

![](./join/J1_1e7_NA_0_0_basic.png)
![](./groupby/G1_1e10_1e4_10_0_basic.png)

<!--
##### **advanced questions**

![](./join/J1_1e7_NA_0_0_advanced.png)
-->
![](./groupby/G1_1e10_1e4_10_0_advanced.png)

#### 5 GB {.active}
### join {.tabset .tabset-fade .tabset-pills}

#### 0.5 GB

##### **basic questions**

![](./join/J1_1e8_NA_0_0_basic.png)
![](./join/J1_1e7_NA_0_0_basic.png)

#### 5 GB {.active}

<!--
##### **advanced questions**
##### **basic questions**

![](./join/J1_1e8_NA_0_0_advanced.png)
-->
![](./join/J1_1e8_NA_0_0_basic.png)

#### 50 GB

@@ -166,8 +163,6 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except

<!--
##### **advanced questions**

![](./join/J1_1e9_NA_0_0_advanced.png)
-->

---
2 changes: 2 additions & 0 deletions _report/report.R
@@ -13,6 +13,7 @@ get_data_levels = function() {
in_rows = c("1e7","1e8","1e9")
k_na_sort = c("1e2_0_0","1e1_0_0","2e0_0_0","1e2_0_1","1e2_5_0")
groupby = paste("G1", paste(rep(in_rows, each=length(k_na_sort)), k_na_sort, sep="_"), sep="_")
groupby <- c(groupby, "G1_1e10_1e4_10_0")
## join
in_rows = c("1e7","1e8","1e9")
k_na_sort = c("NA_0_0","NA_5_0","NA_0_1")
@@ -21,6 +22,7 @@ get_data_levels = function() {
in_rows = c("1e7","1e8","1e9")
k_na_sort = "1e2_0_0"
groupby2014 = paste("G0", paste(rep(in_rows, each=length(k_na_sort)), k_na_sort, sep="_"), sep="_")

list(groupby=groupby, join=join, groupby2014=groupby2014)
}
get_excluded_batch = function() {
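Note the new 1e10 level is appended as a single name rather than adding "1e10" to `in_rows`, since only one k/na/sort combination exists at that scale. Reproducing the level construction from `get_data_levels()` above:

```r
# Level construction as in get_data_levels(), with the appended 1e10 entry.
in_rows   <- c("1e7", "1e8", "1e9")
k_na_sort <- c("1e2_0_0", "1e1_0_0", "2e0_0_0", "1e2_0_1", "1e2_5_0")
groupby   <- paste("G1",
                   paste(rep(in_rows, each = length(k_na_sort)),
                         k_na_sort, sep = "_"),
                   sep = "_")
groupby   <- c(groupby, "G1_1e10_1e4_10_0")

tail(groupby, 3)
# [1] "G1_1e9_1e2_0_1"   "G1_1e9_1e2_5_0"   "G1_1e10_1e4_10_0"
```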
9 changes: 9 additions & 0 deletions _run/partitioned_run.sh
@@ -0,0 +1,9 @@
#!/bin/bash

# set machine type
./_run/run_small_medium.sh

./_run/run_large.sh

./_run/run_groupby_xl.sh


# call code to rename images
18 changes: 18 additions & 0 deletions _run/run_groupby_xl.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# get groupby x-large (500GB dataset)
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby-500gb.duckdb data/groupby-500gb.duckdb


# expand the 500GB groupby dataset to csv
duckdb data/groupby-500gb.duckdb -c "copy G1_1e10_1e4_10_0 to 'data/G1_1e10_1e4_10_0.csv' (FORMAT CSV)"


cp _control/data_groupby_xlarge.csv _control/data.csv

echo "Running groupby x-large (500GB) datasets"
./run.sh

###
echo "done..."
echo "removing data files"
rm data/*.csv
rm data/*.duckdb
34 changes: 34 additions & 0 deletions _run/run_large.sh
@@ -0,0 +1,34 @@
#!/bin/bash

# download and expand large data

# get groupby large (50GB datasets)
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb
# get join large (50GB datasets)
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb


# expand groupby-large datasets to csv
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"

# expand join-large datasets to csv
duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"


cp _control/data_large.csv _control/data.csv


echo "Running all solutions on large (50GB) datasets"
./run.sh


###
echo "done..."
echo "removing data files"
rm data/*.csv
rm data/*.duckdb