Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 72 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,78 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* Daniel Chen
  * [LinkedIn](https://www.linkedin.com/in/daniel-c-a02ba2229/)
* Tested on: Windows 11, AMD Ryzen 7 8845HS w/ Radeon 780M Graphics (3.80 GHz), RTX 4070 notebook

# Writeup
This project contains several CPU/GPU prefix sum and stream compaction algorithms to compare their performance.

## Algorithm comparison
![algorithm comparison graphs](/img/graphs.png)

The above log-log plots show the running times of each of the implemented scan algorithms, at power-of-2 array sizes (left) and 3 less than each of those power-of-2 array sizes (right). Initial memory setup using `cudaMalloc`, `cudaMemset`, and `thrust::{host_vector, copy}` is excluded from the timings as per Part 7 of [INSTRUCTION.md](INSTRUCTION.md).

For small arrays, CPU scan was the fastest, followed on average by naive, then work-efficient, then thrust. However, as array sizes reach the low millions, the order reverses: thrust is the fastest, followed by work-efficient, then naive, and then CPU. As the array sizes grow large, all of the curves grow at roughly the same rate (approximately linearly). The reversal of the order is likely due to the overhead of kernel launches at smaller array sizes and/or possible block-size misalignment with the number of threads. As the arrays grow large, however, the benefits of parallelization and GPU memory-access optimization become more pronounced the more times the kernels are launched and the more blocks that are instantiated, even though the asymptotic complexity of all the algorithms remains $O(n)$ (not $O(\log(n))$ for the GPU implementations, since the number of concurrent threads is capped at some fixed number determined by the GPU hardware).

## Extra credit
As requested in Part 5 of [INSTRUCTION.md](INSTRUCTION.md), the work-efficient GPU implementation is over 4 times faster than the CPU implementation for array sizes greater than ~1 million even without further optimization.

## Test output
```

****************
** SCAN TESTS **
****************
[ 29 22 44 14 19 35 38 23 43 43 35 30 6 ... 20 0 ]
==== cpu scan, power-of-two ====
elapsed time: 0.0012ms (std::chrono Measured)
[ 0 29 51 95 109 128 163 201 224 267 310 345 375 ... 6278 6298 ]
==== cpu scan, non-power-of-two ====
elapsed time: 0.0006ms (std::chrono Measured)
[ 0 29 51 95 109 128 163 201 224 267 310 345 375 ... 6216 6219 ]
passed
==== naive scan, power-of-two ====
elapsed time: 1.81264ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.605056ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.677728ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.334272ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 2.58243ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.525216ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 2 2 1 2 0 2 3 0 3 1 1 2 1 ... 2 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 0.0014ms (std::chrono Measured)
[ 2 2 1 2 2 3 3 1 1 2 1 1 3 ... 2 2 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 0.0011ms (std::chrono Measured)
[ 2 2 1 2 2 3 3 1 1 2 1 1 3 ... 2 3 ]
passed
==== cpu compact with scan ====
elapsed time: 0.0057ms (std::chrono Measured)
[ 2 2 1 2 2 3 3 1 1 2 1 1 3 ... 2 2 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.727648ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 1.61846ms (CUDA Measured)
passed
Press any key to continue . . .
```
Binary file added img/graphs.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 29; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down
12 changes: 10 additions & 2 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // Flatten the 1D launch into a global element index; guard the grid
    // tail since the grid rarely divides n evenly.
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        // Collapse each element to a keep/discard flag: 1 = keep (non-zero).
        bools[tid] = (idata[tid] != 0) ? 1 : 0;
    }
}

/**
Expand All @@ -32,7 +35,12 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // NOTE(review): `bools` is never read — the keep test re-reads idata
    // directly, which is why callers may pass NULL for it. Confirm this
    // divergence from the documented interface is intentional.
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= n) return;

    int value = idata[tid];
    if (value != 0) {
        // Compact: write each kept element to its scanned destination slot.
        odata[indices[tid]] = value;
    }
}

}
Expand Down
4 changes: 4 additions & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ namespace StreamCompaction {
PerformanceTimer& operator=(const PerformanceTimer&) = delete;
PerformanceTimer& operator=(PerformanceTimer&&) = delete;

// Reports whether startCpuTimer() has been called without a matching
// endCpuTimer() yet — lets callers avoid starting an already-running timer.
bool cpuTimerStarted() {
    return cpu_timer_started;
}

private:
cudaEvent_t event_start = nullptr;
cudaEvent_t event_end = nullptr;
Expand Down
52 changes: 45 additions & 7 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,22 @@ namespace StreamCompaction {
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
    // Exclusive prefix sum on the CPU:
    //   odata[0] = 0, odata[i] = idata[0] + ... + idata[i-1].
    //
    // Only manage the timer when no outer caller (e.g. compactWithScan)
    // already has it running, so nested use does not double-start it.
    // (The previous version also started and stopped the timer
    // unconditionally before this guard — leftover diff residue that
    // corrupted the measurement.)
    bool startedHere = false;
    if (!timer().cpuTimerStarted()) {
        timer().startCpuTimer();
        startedHere = true;
    }

    if (n > 0) {
        odata[0] = 0;   // exclusive scan: identity in the first slot
    }
    for (int i = 1; i < n; i++) {
        odata[i] = odata[i - 1] + idata[i - 1];
    }

    if (startedHere) {
        timer().endCpuTimer();
    }
}

/**
Expand All @@ -30,9 +43,17 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    // Removes all zero elements from idata, writing survivors to odata.
    // Returns the number of elements kept.
    //
    // Fix: a leftover `return -1;` previously executed before the real
    // return, so the function always reported -1.
    timer().startCpuTimer();
    // Single pass: shift every non-zero element left by the number of
    // zeros seen so far.
    int skipped = 0;
    for (int i = 0; i < n; i++) {
        if (idata[i] == 0) {
            skipped++;
        } else {
            odata[i - skipped] = idata[i];
        }
    }
    timer().endCpuTimer();
    return n - skipped;
}

/**
Expand All @@ -42,9 +63,26 @@ namespace StreamCompaction {
*/
int compactWithScan(int n, int *odata, const int *idata) {
    // Stream compaction via map -> exclusive scan -> scatter, mirroring
    // the GPU algorithm. Returns the number of elements kept.
    //
    // Fixes: (1) a leftover `return -1;` previously made the real return
    // unreachable; (2) the count used only offsets[n-1], which is the
    // EXCLUSIVE scan total and omits the final element's flag, so the
    // result was off by one whenever the last element was non-zero;
    // (3) n == 0 read offsets[-1] (undefined behavior).
    timer().startCpuTimer();

    if (n <= 0) {
        timer().endCpuTimer();
        return 0;
    }

    // 1) Map to 0/1 keep flags (reuse odata as scratch for the flags).
    for (int i = 0; i < n; i++) {
        odata[i] = (idata[i] == 0) ? 0 : 1;
    }

    // 2) Exclusive scan of the flags yields each kept element's output slot.
    int *offsets = new int[n];
    scan(n, offsets, odata);

    // 3) Scatter: overwrite odata with the kept elements in order.
    for (int i = 0; i < n; i++) {
        if (idata[i] == 0) continue;
        odata[offsets[i]] = idata[i];
    }

    // Exclusive scan omits the last flag, so add it back for the count.
    int count = offsets[n - 1] + ((idata[n - 1] != 0) ? 1 : 0);

    delete[] offsets;

    timer().endCpuTimer();
    return count;
}
}
}
115 changes: 113 additions & 2 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <cuda_runtime.h>
#include "common.h"
#include "efficient.h"
#include <iostream>

namespace StreamCompaction {
namespace Efficient {
Expand All @@ -12,13 +13,75 @@ namespace StreamCompaction {
return timer;
}


int* dev_arr;
int *dev_arr2;
int* dev_bools;

__global__ void upsweepStep(int nc, int step, int* arr) {
    // One up-sweep (reduce) level of the work-efficient scan: thread t
    // owns the tree node whose right child sits at index (t+1)*step - 1.
    // 64-bit arithmetic guards the multiply for very large arrays.
    unsigned long long int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned long long int right = (tid + 1) * step - 1;
    if (right >= nc) return;

    unsigned long long int left = right - step / 2;

    // Fold the left child's partial sum into the right child in place.
    arr[right] += arr[left];
}

__global__ void downsweepStep(int nc, int step, int* arr) {
    // One down-sweep level of the work-efficient scan (mirror of the
    // up-sweep): pass the parent's value to the left child and accumulate
    // left + parent into the right child, producing an exclusive scan.
    unsigned long long int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned long long int right = (tid + 1) * step - 1;
    if (right >= nc) return;

    unsigned long long int left = right - step / 2;

    int savedRight = arr[right];
    arr[right] += arr[left];
    arr[left] = savedRight;
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
    // Exclusive prefix sum on the GPU using the work-efficient
    // up-sweep/down-sweep (Blelloch) algorithm.
    //
    // The input is padded with zeros up to the next power of two so the
    // balanced-tree sweeps are well-formed. Only the kernel work is
    // timed; allocation and host<->device copies are excluded.
    //
    // Improvements: uses a local device pointer instead of the shared
    // file-scope one (re-entrant, no cross-call aliasing with compact),
    // guards n <= 0, and drops leftover TODO residue.
    if (n <= 0) {
        return;
    }

    int nc = 1 << ilog2ceil(n);   // padded power-of-two length

    int *dev_data;
    cudaMalloc((void**)&dev_data, nc * sizeof(int));

    cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    // Zero the padding tail so it cannot contaminate the sums.
    cudaMemset(dev_data + n, 0, (nc - n) * sizeof(int));

    timer().startGpuTimer();

    int blockSize = 256;

    // Up-sweep: build partial sums up the binary tree; each level needs
    // half the threads of the previous one.
    for (int step = 2; step <= nc; step <<= 1) {
        int nBlocks = (nc / step + blockSize - 1) / blockSize;
        upsweepStep<<<nBlocks, blockSize>>>(nc, step, dev_data);
    }

    // Clear the root, then down-sweep to convert the reduction tree into
    // an exclusive scan.
    cudaMemset(dev_data + nc - 1, 0, sizeof(int));

    for (int step = nc; step >= 2; step >>= 1) {
        int nBlocks = (nc / step + blockSize - 1) / blockSize;
        downsweepStep<<<nBlocks, blockSize>>>(nc, step, dev_data);
    }

    timer().endGpuTimer();

    // Only the first n results are meaningful; the padding is dropped.
    // (Blocking cudaMemcpy also synchronizes with the kernels above.)
    cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(dev_data);
}

/**
Expand All @@ -31,10 +94,58 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
    // GPU stream compaction: map -> work-efficient exclusive scan ->
    // scatter. Returns the number of elements kept.
    //
    // Fixes: (1) a leftover `return -1;` previously fired right after
    // endGpuTimer(), leaking all three device buffers and always
    // reporting -1; (2) the count read only indices[nc-1], the EXCLUSIVE
    // scan total, which omits the final element's flag — off by one
    // whenever n is a power of two and the last element is non-zero.
    // Also switched to local device pointers (re-entrant).
    if (n == 0) return 0;

    int nc = 1 << ilog2ceil(n);   // padded power-of-two length

    int *dev_in, *dev_out, *dev_indices;
    cudaMalloc((void**)&dev_in, nc * sizeof(int));
    cudaMalloc((void**)&dev_out, n * sizeof(int));
    cudaMalloc((void**)&dev_indices, nc * sizeof(int));

    int blockSize = 256;
    int nBlocksOuter = (nc + blockSize - 1) / blockSize;

    cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    // Zero the padding tail: padded elements map to flag 0 and are dropped.
    cudaMemset(dev_in + n, 0, (nc - n) * sizeof(int));

    timer().startGpuTimer();

    // 1) Map each element to a 0/1 keep flag.
    Common::kernMapToBoolean<<<nBlocksOuter, blockSize>>>(nc, dev_indices, dev_in);

    // 2) In-place exclusive scan of the flags (up-sweep + down-sweep).
    for (int step = 2; step <= nc; step <<= 1) {
        int nBlocks = (nc / step + blockSize - 1) / blockSize;
        upsweepStep<<<nBlocks, blockSize>>>(nc, step, dev_indices);
    }

    cudaMemset(dev_indices + nc - 1, 0, sizeof(int));

    for (int step = nc; step >= 2; step >>= 1) {
        int nBlocks = (nc / step + blockSize - 1) / blockSize;
        downsweepStep<<<nBlocks, blockSize>>>(nc, step, dev_indices);
    }

    // 3) Scatter kept elements to their scanned destinations.
    //    kernScatter re-tests idata != 0 itself, so no bools array is passed.
    Common::kernScatter<<<nBlocksOuter, blockSize>>>(n, dev_out, dev_in, NULL, dev_indices);

    timer().endGpuTimer();

    // Exclusive scan's last slot omits the final flag; when n == nc the
    // last input element is real data, so add its flag back in. (For
    // nc > n the last padded flag is 0 and no correction is needed.)
    int nFinal;
    cudaMemcpy(&nFinal, dev_indices + nc - 1, sizeof(int), cudaMemcpyDeviceToHost);
    if (nc == n && idata[n - 1] != 0) {
        nFinal++;
    }

    cudaMemcpy(odata, dev_out, nFinal * sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(dev_in);
    cudaFree(dev_out);
    cudaFree(dev_indices);

    return nFinal;
}
}
}
Loading