diff --git a/README.md b/README.md index 0e38ddb1..bb4b91fb 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,105 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Raymond Feng + * [LinkedIn](https://www.linkedin.com/in/raymond-ma-feng/), [personal website](https://www.rfeng.dev/) +* Tested on: Windows 11, i9-9900KF @ 3.60GHz 32GB, NVIDIA GeForce RTX 2070 SUPER (Personal Computer) -### (TODO: Your README) +This project implements the all-prefix sums scan algorithm on the CPU, a naive version on the GPU, a work-efficient version on the GPU, and an implementation using the thrust library. It also implements work-efficient parallel stream compaction. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +## Project Description +### CPU Scan +A sequential, O(n) scan on the CPU was first implemented. This is pretty trivial, looping through every item of the array and adding the previously summed number to the element at the current index. + +### Naive GPU Scan +A O(log2(n)) parallel scan was implemented on the GPU. For each "layer" of the scan, we increase the gaps between the items to be added. For cases where N was not a power of two, the length of the array was extended to the next power of two. + +### Work-Efficient GPU Scan and Stream Compaction +An O(n) work-efficient scan algorithm was also implemented on the GPU. This involves treating the array as a balanced binary tree, and performing an upsweep parallel reduction, then a downsweep to traverse back down the partial sums, performing the scan in place. +Like with the Naive GPU scan, the array was extended for arrays where N was not a power of two. + +Similarly, a parallel compaction algorithm was also implemented in three steps: First, computing a true/false array. Second, running an exclusive scan on the true/false array. Then, scattering the result using a custom scatter function. All three steps run in parallel, giving us a time complexity of O(n). + +### Thrust Scan +Using the thrust library provided with the base files, another scan algorithm was also implemented. This involved converting the input and output to thrust_vectors, and simply running thrust::exclusive_scan on them. + +## Performance Analysis + +### Block Size +I tested various block sizes, ranging from 64 threads, to 128, to 256, and 512. Each of these tests was run on an array size of 2^8 items. As you can see, there's not much of a difference in runtime between them. I decided to use a block size of 128 threads for the rest of my tests. +![](img/image1.1.png) +![](img/image1.png) + +For the following charts, I used this data: +![](img/image6.png) +### Naive GPU Scan +![](img/image2.png) +Here, the CPU scan is actually faster than the GPU scan. + +### Work-Efficient Scan +![](img/image3.png) +Similarly, the CPU scan is faster here. The work-efficient scan is also slower than the Naive scan as well. + +### Thrust +![](img/image4.png) + +### Work-Efficient Stream Compaction +![](img/image5.png) + +## Output +A sample of my output. N=2^8. + +``` +**************** +** SCAN TESTS ** +**************** + [ 8 28 26 35 8 29 6 13 3 36 31 18 45 ... 33 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 8 36 62 97 105 134 140 153 156 192 223 241 ... 6044 6077 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 8 36 62 97 105 134 140 153 156 192 223 241 ... 5968 5977 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.385344ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.07872ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.464288ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.151552ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.862272ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.03824ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 0 3 0 3 2 3 1 0 3 0 3 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 2 2 3 3 2 3 1 3 3 3 3 2 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 2 2 3 3 2 3 1 3 3 3 3 2 1 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0018ms (std::chrono Measured) + [ 2 2 3 3 2 3 1 3 3 3 3 2 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.464704ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.15792ms (CUDA Measured) + passed``` diff --git a/img/image1.1.png b/img/image1.1.png new file mode 100644 index 00000000..b00db9cd Binary files /dev/null and b/img/image1.1.png differ diff --git a/img/image1.png b/img/image1.png new file mode 100644 index 00000000..7b46c7a6 Binary files /dev/null and b/img/image1.png differ diff --git a/img/image2.png b/img/image2.png new file mode 100644 index 00000000..463850d4 Binary files /dev/null and b/img/image2.png differ diff --git a/img/image3.png b/img/image3.png new file mode 100644 index 00000000..074900b7 Binary files /dev/null and b/img/image3.png differ diff --git a/img/image4.png b/img/image4.png new file mode 100644 index 00000000..5f4cae88 Binary files /dev/null and b/img/image4.png differ diff --git a/img/image5.png b/img/image5.png new file mode 100644 index 00000000..1783e796 Binary files /dev/null and b/img/image5.png differ diff --git a/img/image6.png b/img/image6.png new file mode 100644 index 00000000..252beb24 Binary files /dev/null and b/img/image6.png differ diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511caa..1646ccb3 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -19,8 +19,9 @@ list(SORT sources) source_group(Headers FILES ${headers}) source_group(Sources FILES ${sources}) - +find_package(CCCL REQUIRED) add_library(stream_compaction ${sources} ${headers}) +target_link_libraries(stream_compaction CCCL::Thrust) if(CMAKE_VERSION VERSION_LESS "3.23.0") set_target_properties(stream_compaction} PROPERTIES CUDA_ARCHITECTURES OFF) elseif(CMAKE_VERSION VERSION_LESS "3.24.0") diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..adaab689 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,17 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + + if (idata[idx] == 0) { + bools[idx] = 0; + } + else { + bools[idx] = 1; + } } /** @@ -32,7 +42,14 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + + if (bools[idx] == 1) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..c9a91b33 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = idata[i - 1] + odata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,26 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int idx = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[idx] = idata[i]; + idx++; + } + } timer().endCpuTimer(); - return -1; + return n-idx; + } + + int scatter(int n, int* odata, const int* compact, const int* scan, const int* idata) { + int count = 0; + for (int i = 0; i < n; i++) { + if (compact[i] == 1) { + odata[scan[i]] = idata[i]; + count++; + } + } + return n - count; } /** @@ -42,9 +62,28 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int* compact = new int[n]; + int compact_count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + compact[i] = 1; + compact_count++; + } + else { + compact[i] = 0; + } + } + + int* scan_result = new int[n]; + scan_result[0] = 0; + for (int i = 1; i < n; i++) { + scan_result[i] = compact[i - 1] + scan_result[i - 1]; + } + + int items_left = scatter(n, odata, compact, scan_result, idata); + timer().endCpuTimer(); - return -1; + return items_left; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..27bc74be 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,20 @@ #include "common.h" #include "efficient.h" +/***************** +* Configuration * +*****************/ + +#define blockSize 128 + +// buffers +int* dev_scan; + +int* dev_idata; +int* dev_bools; +int* dev_indices; +int* dev_scatter; + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +26,100 @@ namespace StreamCompaction { return timer; } + /** + * Helper functions from testing_helpers.hpp + */ + void printArray(int n, const int* a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); + } + + // kernel for upsweep + __global__ void upsweep_kernel(int n, int d, int* buffer) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + + int div = pow(2, d + 1); + if (idx % div == 0) { + buffer[idx + (int)pow(2, d+1) - 1] += buffer[idx + (int)pow(2, d) - 1]; + } + } + + // kernel for downsweep + __global__ void downsweep_kernel(int n, int d, int* buffer) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + + int div = pow(2, d + 1); + if (idx % div == 0) { + // save left child + int left = buffer[idx + (int)pow(2, d) - 1]; + + // set left child to this node's value + buffer[idx + (int)pow(2, d) - 1] = buffer[idx + (int)pow(2, d + 1) - 1]; + + // set right child to old left value + this node's value + buffer[idx + (int)pow(2, d + 1) - 1] += left; + } + } + + /** + * Helper function for doing scan + */ + void up_down_sweep(dim3 fullBlocksPerGrid, int n2, int* dev_array) { + for (int d = 0; d <= ilog2ceil(n2) - 1; d++) { + upsweep_kernel << > > (n2, d, dev_array); + } + + // set root to 0 + cudaMemset(dev_array + n2 - 1, 0, sizeof(int)); + + // downsweep + for (int d = ilog2ceil(n2) - 1; d >= 0; d--) { + downsweep_kernel << > > (n2, d, dev_array); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + // expand N to next power of 2 + int n2 = pow(2, ilog2ceil(n)); + + // setup block structure + dim3 fullBlocksPerGrid((n2 + blockSize - 1) / blockSize); + + // allocate memory on GPU + cudaMalloc((void**)&dev_scan, n2 * sizeof(int)); + + // copy only first n of input to device + cudaMemcpy(dev_scan, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // upsweep, downsweep + up_down_sweep(fullBlocksPerGrid, n2, dev_scan); + timer().endGpuTimer(); + + // copy output to host + cudaMemcpy(odata, dev_scan, sizeof(int) * n, cudaMemcpyDeviceToHost); + + // free data from GPU + cudaFree(dev_scan); } /** @@ -31,10 +132,57 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + + // expand N to next power of 2 + int n2 = pow(2, ilog2ceil(n)); + + // setup block structure + dim3 fullBlocksPerGrid((n2 + blockSize - 1) / blockSize); + + // initialize host arrays + int* host_bools = new int[n2]; + int* host_indices = new int[n2]; + + // allocate memory on GPU + cudaMalloc((void**)&dev_idata, n2 * sizeof(int)); + cudaMalloc((void**)&dev_bools, n2 * sizeof(int)); + cudaMalloc((void**)&dev_indices, n2 * sizeof(int)); + cudaMalloc((void**)&dev_scatter, n2 * sizeof(int)); + + // copy input to device + cudaMemcpy(dev_idata, idata, sizeof(int) * n2, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // compute temp array containing booleans + Common::kernMapToBoolean << > > (n2, dev_bools, dev_idata); + + // copy dev_bools to dev_indices + cudaMemcpy(dev_indices, dev_bools, sizeof(int) * n2, cudaMemcpyDeviceToDevice); + + // run exclusive scan on bool array + up_down_sweep(fullBlocksPerGrid, n2, dev_indices); + + // run scatter to compute final array + Common::kernScatter << > > (n2, dev_scatter, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + // copy output to host + cudaMemcpy(odata, dev_scatter, sizeof(int) * n2, cudaMemcpyDeviceToHost); + cudaMemcpy(host_indices, dev_indices, sizeof(int) * n2, cudaMemcpyDeviceToHost); + + // free data from GPU + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_scatter); + + // compute remaining elements + if (idata[n - 1] == 0) + return n - host_indices[n - 1]; + else + return n - host_indices[n - 1] - 1; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..50dbefe4 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,24 @@ #include "common.h" #include "naive.h" +/***************** +* Configuration * +*****************/ + +/*! Block size used for CUDA kernel launch. */ +#define blockSize 128 + +/*********************************************** +* Kernel state (pointers are device pointers) * +***********************************************/ + +int numObjects; +dim3 threadsPerBlock(blockSize); + +// buffers +int* dev_in; +int* dev_out; + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +29,101 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + /** + * Helper functions from testing_helpers.hpp + */ + void printArray(int n, const int* a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); + } + + void zeroArray(int n, int* a) { + for (int i = 0; i < n; i++) { + a[i] = 0; + } + } + + // shift right kernel + __global__ void shift_right_kernel(int n, int* output, int* input) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + + if (idx != 0) { + output[idx] = input[idx - 1]; + } + } + + // TODO: __global__ kernel + __global__ void scan_kernel(int n, int layer, int* output, int* input) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + + if (idx >= pow(2, layer-1)) { + output[idx] = input[idx - (int)pow(2, layer - 1)] + input[idx]; + } + else { + output[idx] = input[idx]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + // setup block structure + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // allocate memory on GPU + cudaMalloc((void**)&dev_in, n * sizeof(int)); + cudaMalloc((void**)&dev_out, n * sizeof(int)); + + // copy input to device + cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // run kernel ilog2ceil(n) times + bool swap = true; + for (int d = 1; d <= ilog2ceil(n); d++) { + if (swap) { + scan_kernel << > > (n, d, dev_out, dev_in); + } + else { + scan_kernel << > > (n, d, dev_in, dev_out); + } + swap = !swap; + } + + // shift right to make exclusive and copy output to host + if (swap) { + shift_right_kernel << > > (n, dev_out, dev_in); + cudaMemcpy(odata, dev_out, sizeof(int) * n, cudaMemcpyDeviceToHost); + } + else { + shift_right_kernel << > > (n, dev_in, dev_out); + cudaMemcpy(odata, dev_in, sizeof(int) * n, cudaMemcpyDeviceToHost); + } + + odata[0] = 0; + timer().endGpuTimer(); + + // free data from GPU + cudaFree(dev_in); + cudaFree(dev_out); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..24799087 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + thrust::host_vector host_in(idata, idata + n); + thrust::host_vector host_out(n); + + thrust::device_vector dev_in = (thrust::device_vector)host_in; + thrust::device_vector dev_out = (thrust::device_vector)host_out; + + timer().startGpuTimer(); + + thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin()); + + timer().endGpuTimer(); + + thrust::copy(dev_out.begin(), dev_out.end(), odata); } } }