Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added Performance Data Collection.xlsx
Binary file not shown.
94 changes: 89 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,96 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* Yuning Wen
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Tested on: Windows 11, i9-12900H @ 2.50GHz 16GB, NVIDIA GeForce RTX 3060 Laptop GPU (Personal Laptop)

### (TODO: Your README)
### README

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
#### Description

* CPU Scan, Stream Compaction

* Naive GPU Scan Algorithm

* Work-Efficient GPU Scan & Stream Compaction

* Thrust test implemented

#### Performance Analysis (Answering Questions)

* Here are the two graphs for the performances of different GPU scan algorithms (Naive, Work-Efficient, and Thrust) and the serial CPU version of Scan.

![power of 2](./img/po2.png)
![non power of 2](./img/non%20po2.png)

* I have tried to use Nsight Systems only to check if I may find something. But it seems like cuda is mostly use by my own function, which means thrust may just used little stuff in cuda

![general](./img/nsys%20general.png)

But there is some tiny things inside and that might be used by thrust

![focus](./img/nsys%20focus.png)

As a result, I guess thurst functions actually make a great use of parallel calculation in CPU, or have excellent algorithm that runs in just hundred microseconds and finishes the use of CUDA.

#### Bottlenecks

* From the nsight charts above, I think the current bottleneck is that the efficiency of memory usage is too low, so if I may apply shared memory to my algorithm, the efficiency may then increase.

#### Result of running

```
****************
** SCAN TESTS **
****************
[ 32 7 4 15 27 47 43 11 5 18 44 41 47 ... 30 0 ]
==== cpu scan, power-of-two ====
elapsed time: 20.1042ms (std::chrono Measured)
[ 0 32 39 43 58 85 132 175 186 191 209 253 294 ... 205520488 205520518 ]
==== cpu scan, non-power-of-two ====
elapsed time: 19.9824ms (std::chrono Measured)
[ 0 32 39 43 58 85 132 175 186 191 209 253 294 ... 205520437 205520458 ]
passed
==== naive scan, power-of-two ====
elapsed time: 7.2087ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 6.83536ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 2.68765ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 2.97347ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 0.885536ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 1.02086ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 2 1 2 1 1 1 3 3 3 2 0 3 1 ... 2 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 29.0733ms (std::chrono Measured)
[ 2 1 2 1 1 1 3 3 3 2 3 1 2 ... 2 2 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 45.5551ms (std::chrono Measured)
[ 2 1 2 1 1 1 3 3 3 2 3 1 2 ... 1 2 ]
passed
==== cpu compact with scan ====
elapsed time: 83.1376ms (std::chrono Measured)
[ 2 1 2 1 1 1 3 3 3 2 3 1 2 ... 2 2 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 2.79856ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 2.45539ms (CUDA Measured)
passed
```
Binary file added img/non po2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/nsys focus.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/nsys general.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/po2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 23; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down
19 changes: 19 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,17 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {
return;
}

if (idata[index] == 0) {
bools[index] = 0;
}
else {
bools[index] = 1;
}
}

/**
Expand All @@ -33,6 +44,14 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {
return;
}

if (bools[index] == 1) {
odata[indices[index]] = idata[index];
}
}

}
Expand Down
41 changes: 39 additions & 2 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
odata[0] = 0;
for (int i = 1; i < n; i++) {
odata[i] = odata[i - 1] + idata[i - 1];
}
timer().endCpuTimer();
}

Expand All @@ -31,8 +35,15 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int count = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[count] = idata[i];
count++;
}
}
timer().endCpuTimer();
return -1;
return count;
}

/**
Expand All @@ -43,8 +54,34 @@ namespace StreamCompaction {
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
// map
int* map = new int[n];
for (int i = 0; i < n; i++) {
if (idata[i] == 0) {
map[i] = 0;
}
else {
map[i] = 1;
}
}

// scan
int* scan = new int[n];
scan[0] = 0;
for (int i = 1; i < n; i++) {
scan[i] = scan[i - 1] + map[i - 1];
}

// scatter
int count = 0;
for (int i = 0; i < n; i++) {
if (map[i] != 0) {
odata[scan[i]] = idata[i];
count++;
}
}
timer().endCpuTimer();
return -1;
return count;
}
}
}
92 changes: 87 additions & 5 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include "common.h"
#include "efficient.h"

#define blockSize 128

namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -12,13 +14,64 @@ namespace StreamCompaction {
return timer;
}

// up sweep function
__global__ void upSweep(int N, int* data, int pow) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
int end = (index + 1) * (1 << (pow + 1)) - 1;
if (end >= N) {
return;
}
int start = end - (1 << pow);
data[end] += data[start];
}

// down sweep function
__global__ void downSweep(int N, int* data, int pow) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
int end = (index + 1) * (1 << (pow + 1)) - 1;
if (end >= N) {
return;
}
int start = end - (1 << pow);

int temp = data[end];
data[end] += data[start];
data[start] = temp;
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
int* dev_tree;

int depth = ilog2ceil(n);
int N = 1 << depth;

cudaMalloc((void**)&dev_tree, N * sizeof(int));
cudaMemcpy(dev_tree, idata, n * sizeof(int), cudaMemcpyHostToDevice);

timer().startGpuTimer(); // not to include any initial/final memory operations

// up sweep
for (int d = 0; d < depth; d++) {
dim3 curGrid((N / (1 << (d + 1)) + blockSize - 1) / blockSize);
upSweep << <curGrid, blockSize >> > (N, dev_tree, d);
}

// down sweep
//dev_tree[n - 1] = 0;
cudaMemset(dev_tree + N - 1, 0, sizeof(int));
for (int d = depth - 1; d >= 0; d--) {
dim3 curGrid((N / (1 << (d + 1)) + blockSize - 1) / blockSize);
downSweep << <curGrid, blockSize >> > (N, dev_tree, d);
}

timer().endGpuTimer(); // not to include any initial/final memory operations

cudaMemcpy(odata, dev_tree, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_tree);
}

/**
Expand All @@ -31,10 +84,39 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
timer().startGpuTimer();

//timer().startGpuTimer();��
// TODO
timer().endGpuTimer();
return -1;
int* dev_idata;
int* dev_odata;
int* dev_indices;
int* dev_bool;

cudaMalloc((void**)&dev_idata, n * sizeof(int));
cudaMalloc((void**)&dev_odata, n * sizeof(int));
cudaMalloc((void**)&dev_indices, n * sizeof(int));
cudaMalloc((void**)&dev_bool, n * sizeof(int));
cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);

dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);

StreamCompaction::Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, dev_bool, dev_idata);
StreamCompaction::Efficient::scan(n, dev_indices, dev_bool);
StreamCompaction::Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_odata, dev_idata, dev_bool, dev_indices);

int count;
int check;
cudaMemcpy(&count, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&check, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);

cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);

cudaFree(dev_idata);
cudaFree(dev_odata);
cudaFree(dev_indices);
cudaFree(dev_bool);
//timer().endGpuTimer();��
return count + check;
}
}
}
Loading