CIS5650-Fall-2025 · YuningUPenn · Sep 14, 2025 · Sep 17, 2025 · Sep 17, 2025
diff --git a/Performance Data Collection.xlsx b/Performance Data Collection.xlsx
diff --git a/README.md b/README.md
@@ -3,12 +3,96 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
+* Yuning Wen
   * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Tested on: Windows 11, i9-12900H @ 2.50GHz 16GB, NVIDIA GeForce RTX 3060 Laptop GPU (Personal Laptop)
 
-### (TODO: Your README)
+### README
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+#### Description
 
+* CPU Scan, Stream Compaction
+
+* Naive GPU Scan Algorithm
+
+* Work-Efficient GPU Scan & Stream Compaction
+
+* Thrust test implemented
+
+#### Performance Analysis (Answering Questions)
+
+* Here are the two graphs for the performances of different GPU scan algorithms (Naive, Work-Efficient, and Thrust) and the serial CPU version of Scan.
+
+![power of 2](./img/po2.png)
+![non power of 2](./img/non%20po2.png)
+
+* I have tried to use Nsight Systems only to check if I may find something. But it seems like cuda is mostly use by my own function, which means thrust may just used little stuff in cuda
+
+![general](./img/nsys%20general.png)
+
+  But there is some tiny things inside and that might be used by thrust
+
+![focus](./img/nsys%20focus.png)
+
+  As a result, I guess thurst functions actually make a great use of parallel calculation in CPU, or have excellent algorithm that runs in just hundred microseconds and finishes the use of CUDA.
+
+#### Bottlenecks
+
+* From the nsight charts above, I think the current bottleneck is that the efficiency of memory usage is too low, so if I may apply shared memory to my algorithm, the efficiency may then increase.
+
+#### Result of running
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  32   7   4  15  27  47  43  11   5  18  44  41  47 ...  30   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 20.1042ms    (std::chrono Measured)
+    [   0  32  39  43  58  85 132 175 186 191 209 253 294 ... 205520488 205520518 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 19.9824ms    (std::chrono Measured)
+    [   0  32  39  43  58  85 132 175 186 191 209 253 294 ... 205520437 205520458 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 7.2087ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 6.83536ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 2.68765ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 2.97347ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.885536ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 1.02086ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   1   2   1   1   1   3   3   3   2   0   3   1 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 29.0733ms    (std::chrono Measured)
+    [   2   1   2   1   1   1   3   3   3   2   3   1   2 ...   2   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 45.5551ms    (std::chrono Measured)
+    [   2   1   2   1   1   1   3   3   3   2   3   1   2 ...   1   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 83.1376ms    (std::chrono Measured)
+    [   2   1   2   1   1   1   3   3   3   2   3   1   2 ...   2   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 2.79856ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 2.45539ms    (CUDA Measured)
+    passed
+```
diff --git a/img/non po2.png b/img/non po2.png
diff --git a/img/nsys focus.png b/img/nsys focus.png
diff --git a/img/nsys general.png b/img/nsys general.png
diff --git a/img/po2.png b/img/po2.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 23; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];

diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
@@ -24,6 +24,17 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (idata[index] == 0) {
+                bools[index] = 0;
+            }
+            else {
+                bools[index] = 1;
+            }
         }
 
         /**
@@ -33,6 +44,14 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
         }
 
     }

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +35,15 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[count] = idata[i];
+                    count++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -43,8 +54,34 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            // map
+            int* map = new int[n];
+            for (int i = 0; i < n; i++) {
+                if (idata[i] == 0) {
+                    map[i] = 0;
+                }
+                else {
+                    map[i] = 1;
+                }
+            }
+
+            // scan
+            int* scan = new int[n];
+            scan[0] = 0;
+            for (int i = 1; i < n; i++) {
+                scan[i] = scan[i - 1] + map[i - 1];
+            }
+
+            // scatter
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (map[i] != 0) {
+                    odata[scan[i]] = idata[i];
+                    count++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "efficient.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +14,64 @@ namespace StreamCompaction {
             return timer;
         }
 
+        // up sweep function
+        __global__ void upSweep(int N, int* data, int pow) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            int end = (index + 1) * (1 << (pow + 1)) - 1;
+            if (end >= N) {
+                return;
+            }
+            int start = end - (1 << pow);
+            data[end] += data[start];
+        }
+
+        // down sweep function
+        __global__ void downSweep(int N, int* data, int pow) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            int end = (index + 1) * (1 << (pow + 1)) - 1;
+            if (end >= N) {
+                return;
+            }
+            int start = end - (1 << pow);
+
+            int temp = data[end];
+            data[end] += data[start];
+            data[start] = temp;
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // TODO
-            timer().endGpuTimer();
+            int* dev_tree;
+
+            int depth = ilog2ceil(n);
+            int N = 1 << depth;
+
+            cudaMalloc((void**)&dev_tree, N * sizeof(int));
+            cudaMemcpy(dev_tree, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer(); // not to include any initial/final memory operations
+
+            // up sweep
+            for (int d = 0; d < depth; d++) {
+                dim3 curGrid((N / (1 << (d + 1)) + blockSize - 1) / blockSize);
+                upSweep << <curGrid, blockSize >> > (N, dev_tree, d);
+            }
+
+            // down sweep
+            //dev_tree[n - 1] = 0;
+            cudaMemset(dev_tree + N - 1, 0, sizeof(int));
+            for (int d = depth - 1; d >= 0; d--) {
+                dim3 curGrid((N / (1 << (d + 1)) + blockSize - 1) / blockSize);
+                downSweep << <curGrid, blockSize >> > (N, dev_tree, d);
+            }
+
+            timer().endGpuTimer(); // not to include any initial/final memory operations
+
+            cudaMemcpy(odata, dev_tree, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_tree);
         }
 
         /**
@@ -31,10 +84,39 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+
+            //timer().startGpuTimer();��
             // TODO
-            timer().endGpuTimer();
-            return -1;
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_indices;
+            int* dev_bool;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_indices, n * sizeof(int));
+            cudaMalloc((void**)&dev_bool, n * sizeof(int));
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            StreamCompaction::Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, dev_bool, dev_idata);
+            StreamCompaction::Efficient::scan(n, dev_indices, dev_bool);
+            StreamCompaction::Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_odata, dev_idata, dev_bool, dev_indices);
+
+            int count;
+            int check;
+            cudaMemcpy(&count, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&check, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_indices);
+            cudaFree(dev_bool);
+            //timer().endGpuTimer();��
+            return count + check;
         }
     }
 }