diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..3eb1dadd --- /dev/null +++ b/.clang-format @@ -0,0 +1,18 @@ +BasedOnStyle: Chromium + +ColumnLimit: 120 +InsertNewlineAtEOF: true +AllowShortIfStatementsOnASingleLine: WithoutElse +WrapNamespaceBodyWithEmptyLines: Always +SeparateDefinitionBlocks: Always + +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^".*"' + Priority: 1 + - Regex: '^' + Priority: 2 + - Regex: '^' + Priority: 3 + - Regex: '^<.*>' + Priority: 4 \ No newline at end of file diff --git a/.cproject b/.cproject deleted file mode 100644 index 6615a581..00000000 --- a/.cproject +++ /dev/null @@ -1,212 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/.project b/.project deleted file mode 100644 index d95a4e38..00000000 --- a/.project +++ /dev/null @@ -1,27 +0,0 @@ - - - Project2-Stream-Compaction - - - - - - org.eclipse.cdt.managedbuilder.core.genmakebuilder - clean,full,incremental, - - - - - org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder - full,incremental, - - - - - - org.eclipse.cdt.core.cnature - org.eclipse.cdt.core.ccnature - org.eclipse.cdt.managedbuilder.core.managedBuildNature - org.eclipse.cdt.managedbuilder.core.ScannerConfigNature - - diff --git a/GNUmakefile b/GNUmakefile deleted file mode 100644 index 2b433114..00000000 --- a/GNUmakefile +++ /dev/null @@ -1,31 +0,0 @@ -CMAKE_ALT1 := /usr/local/bin/cmake -CMAKE_ALT2 := /Applications/CMake.app/Contents/bin/cmake -CMAKE := $(shell \ - which cmake 2>/dev/null || \ - ([ -e ${CMAKE_ALT1} ] && echo "${CMAKE_ALT1}") || \ - ([ -e ${CMAKE_ALT2} ] && echo "${CMAKE_ALT2}") \ - ) - -all: Release - - -Debug: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - -MinSizeRel: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - -Release: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - -RelWithDebugInfo: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - - -build: - mkdir -p build - -clean: - ((cd build && make clean) 2>&- || true) - -.PHONY: all Debug MinSizeRel Release RelWithDebugInfo clean diff --git a/README.md b/README.md index 0e38ddb1..c2bde24a 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,198 @@ -CUDA Stream Compaction -====================== +**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 2** -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +* Charles Wang + * [LinkedIn](https://linkedin.com/in/zwcharl) + * [Personal website](https://charleszw.com) +* Tested on: + * Windows 11 Pro (26100.4946) + * Ryzen 5 7600X @ 4.7Ghz + * 32 GB RAM + * RTX 5060 Ti 16 GB (Studio Driver 580.97) -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +# CUDA Stream Compaction -### (TODO: Your README) +This project implements multiple commonly used GPU algorithms, which are reduction, computing prefix sums (scan), and stream compaction. Stream compaction uses the scan algorithm under the hood, and one of my implementations for finding prefix sums uses a parallel reduction, so these algorithms are all building on each other. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +The purpose of this project was to understand these algorithms in more detail, and explore how their implementations change when we parallelize them on the GPU. It also taught me more about how CUDA works and how my kernels interact with the physical NVIDIA hardware. +## Implementations + +In order to explore potential performance differences when scaling the input size, this project includes three different versions of the scan and compaction algorithms. + +### CPU (single threaded) + +*Found in [`cpu.cu`](stream_compaction/cpu.cu).* + +These implementations run entirely on the CPU and are written in pure C++. They are single threaded by nature and are extremely simple. Given the input array, we iterate through each element to process it. + +- For the scan, we keep a variable that stores the current sum of all previous elements. We then add the current element and set that as the output. +- The stream compaction algorithm was implemented both with and without using scan. + - Without scan: maintain a separate index that we use to write to the output array, because it is (likely) to be less than the input array size. + - With scan: map the input array elements to $1$ if it's considered valid, and $0$ otherwise. We then run an exclusive scan on this mapping array. This tells us which output array index to write to for each valid element. + +### Naive GPU algorithms + +*Found in [`naive.cu`](stream_compaction/naive.cu).* + +This scan algorithm uses the GPU and is based on the naive algorithm described in [GPU Gems 3, Chapter 39.2.1](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda). Essentially, we process the array in-place across multiple iterations. + +
+ +

Source: GPU Gems 3, Chapter 39.2.1

+
+ +As the figure above demonstrates, each iteration has us add pairs of numbers together and store it in the larger index of the two. Each addition operation is parallelized and performed in a separate thread. + +Some more notes: + +- The stride is calculated via $2^{i-1}$, where $i$ is the iteration and $1 \leq i \leq \text{ceil}(\lg(N))$. $N$ is the array size. Instead of using `pow(2, i - 1)`, I calculated it via bitshifts: `int stride = 1 << iteration - 1`. +- The number of blocks in my kernel dispatch depends on the number of operations needed for the current iteration. +- We need to maintain a separate read and write buffer to avoid potential race conditions. This increases memory usage and potentially affects performance. + +### Work-efficient parallel scan + +*Found in [`efficient.cu`](stream_compaction/efficient.cu).* + +Implementations of the scan and compaction algorithms which theoretically require less operations and therefore should run more efficiently. + +
+ + +

Source: GPU Gems 3, Chapter 39.2.2

+
+ +It is based on the work-efficient parallel scan algorithm described in [GPU Gems 3, Chapter 39.2.2](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda), and involves an "up-sweep" where we build up a balanced binary tree, and then a "down-sweep" where we calculate final terms using the node elements in the tree. + +## Performance benchmarks + +### Methodology + +First, I found the optimal block sizes for the naive and efficient CUDA kernels. This was mostly trial and error; I ended up using 64 for naive and 512 for efficient. + +I then added additional code that would run each `scan()` algorithm a certain number of iterations, and average each of the execution times. For instance, this is what the output looks like for a benchmark where I'm running 10 iterations for each algorithm, on an array size of $2^{30}$: + +``` +******************** +** SCAN BENCHMARK ** +******************** + +- Number of iterations: 10 +- Size of POT array: 1073741824 +- Size of NPOT array: 1073741821 + +[CPU/POT] Average scan() time: 268.988 +[CPU/NPOT] Average scan() time: 287.848 +[Naive/POT] Average scan() time: 1381.91 +[Naive/NPOT] Average scan() time: 1382.35 +[Efficient/POT] Average scan() time: 233.941 +[Efficient/NPOT] Average scan() time: 239.228 +[Thrust/POT] Average scan() time: 23.0701 +[Thrust/NPOT] Executing scan(): 6 of 10... +``` + +The `runBenchmarks` global variable in [`main.cpp`](src/main.cpp) controls this. I've also made available the raw data in the [`analysis`](analysis/) folder. Rows are CPU, naive, work-efficient, and thrust top to bottom. Columns are increasing array sizes left to right. + +### Graphs + +These are my graphs. The left column has array sizes that are powers of two (POT), while the right column subtract 3 from the sizes, therefore making them not powers of two (NPOT). + +|Powers of two|Not powers of two| +|:-:|:-:| +|![](analysis/graphs/scan_pot.png)|![](analysis/graphs/scan_npot.png)| + +### Analysis + +Given that we're working with increasing powers of two here, the exponential curve makes sense. While the naive algorithm is able to stay competitive at smaller array sizes, it is essentially doubles in execution time every time we double the array size. The additional $\log n$ factor is really causing this algorithm to suffer, and deems it highly inefficient for large inputs. + +The CPU and work-efficient algorithm are much closer in execution time, and this can again be explained by their theoretical runtime. As previously explained, the CPU algorithm iterates over each element sequentially, netting us a $O(n)$ runtime. The work-efficient GPU algorithm manages to execute its operations using $O(n)$ operations as well. I believe this explains why they performed essentially the same (although I definitely missed some of the hardware and indexing optimizations to truly make the GPU algorithm faster). + +### Thrust + +To briefly analyze Thrust, I executed the test program with just the Thrust implementations of exclusive scan and compaction. I then profiled the program with Nsight Systems. Here is a screenshot of the overall CUDA utilization: + +
+ +
+ +If I'm not mistaken, this is telling me that over 95% of the time the GPU hardware was simply dealing with memory-related operations. If we zoom in more closely on the timeline, we can confirm this: + +
+ +
+ +Here we can see that most of the time was spent on host-to-device and device-to-host memory operations. Meanwhile, the small blue rectangles between the larger green and red blocks indicate the *actual* time spent in the CUDA kernels. + +This tells me that the Thrust implementations of these algorithms are so highly optimized and efficient that it's not the algorithm that's causing the bottleneck, it's memory bandwidth speeds between the GPU and the rest of the system! + +### Miscellaneous: powers of scale + +Just wanted to share some other fun stuff I encountered while testing. + +I originally was testing with *much* smaller array sizes, like $2^4$ and $2^{12}$. When I tried increasing the array size past $2^{18}$, the program would instantly crash. I was really confused why at first, until I looked at the exception being thrown: *stack overflow*. Because I was using `std::array` for my input and output arrays, I was allocating too much stack memory and literally ran out. Switching to heap allocation solved the issue. + +I then tried testing with array sizes from $2^{18}$ to $2^{30}$, incrementing by 4. This turned out to not be helpful at all; my numbers ranged from 0.068ms using CPU and $2^{18}$ to 1380.23ms using naive and $2^{30}$. Furthermore, my naive at $2^{26}$ ran in 75ms, so there was a ~18× difference between two adjacent data points. This would have translated to a *horrible* graph, so I adjusted the numbers to what I have now. + +Both of these experiences really left me with a newfound appreciation for exponents and the powers of two. It's *scary* how fast numbers can scale. + +## Test output + +This is the complete output for my tests. I used an array size of $2^{16}$ here. + +``` +**************** +** SCAN TESTS ** +**************** + [ 29 10 21 39 47 19 41 42 5 25 49 34 4 ... 32 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0168ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0162ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.246016ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.313984ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.806944ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.3496ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.088352ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.09392ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 2 1 3 3 1 1 0 1 3 1 2 0 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.1223ms (std::chrono Measured) + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0794ms (std::chrono Measured) + passed +==== cpu compact with scan, power-of-two ==== + elapsed time: 0.116ms (std::chrono Measured) + passed +==== cpu compact with scan, non-power-of-two ==== + elapsed time: 0.1174ms (std::chrono Measured) + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.771552ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.634944ms (CUDA Measured) + passed +==== thrust compact, power-of-two ==== + elapsed time: 0.136608ms (CUDA Measured) + passed +==== thrust compact, non-power-of-two ==== + elapsed time: 0.241568ms (CUDA Measured) + passed +``` \ No newline at end of file diff --git a/analysis/graphs/scan_npot.png b/analysis/graphs/scan_npot.png new file mode 100644 index 00000000..95db0859 Binary files /dev/null and b/analysis/graphs/scan_npot.png differ diff --git a/analysis/graphs/scan_pot.png b/analysis/graphs/scan_pot.png new file mode 100644 index 00000000..f449f06e Binary files /dev/null and b/analysis/graphs/scan_pot.png differ diff --git a/analysis/scan_npot.csv b/analysis/scan_npot.csv new file mode 100644 index 00000000..01bfafcf --- /dev/null +++ b/analysis/scan_npot.csv @@ -0,0 +1,4 @@ +8.5,16.9,35.2,67.8,135.0,287.8 +35.6,73.5,152.6,318.7,662.9,1382.4 +7.3,14.4,29.0,63.3,118.1,239.2 +1.3,2.0,3.5,6.6,11.9,23.5 \ No newline at end of file diff --git a/analysis/scan_pot.csv b/analysis/scan_pot.csv new file mode 100644 index 00000000..8b9686ab --- /dev/null +++ b/analysis/scan_pot.csv @@ -0,0 +1,4 @@ +8.5,17.0,38.1,67.8,135.7,269.0 +35.5,74.1,153.4,318.6,663.0,"1,381.9" +7.3,14.4,29.0,58.3,116.8,233.9 +1.3,2.0,3.6,6.5,12.0,23.1 \ No newline at end of file diff --git a/cis565_stream_compaction_test.launch b/cis565_stream_compaction_test.launch deleted file mode 100644 index 4267429a..00000000 --- a/cis565_stream_compaction_test.launch +++ /dev/null @@ -1,27 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/img/figure-39-3.jpg b/img/figure-39-3.jpg new file mode 100644 index 00000000..22c4da0f Binary files /dev/null and b/img/figure-39-3.jpg differ diff --git a/img/thrust_nsight_systems.png b/img/thrust_nsight_systems.png new file mode 100644 index 00000000..6f63594d Binary files /dev/null and b/img/thrust_nsight_systems.png differ diff --git a/img/thrust_nsight_systems_2.png b/img/thrust_nsight_systems_2.png new file mode 100644 index 00000000..ed9fb3da Binary files /dev/null and b/img/thrust_nsight_systems_2.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..a05b359e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,154 +1,354 @@ -/** - * @file main.cpp - * @brief Stream compaction test program - * @authors Kai Ninomiya - * @date 2015 - * @copyright University of Pennsylvania - */ +#include "testing_helpers.hpp" +#include #include +#include #include -#include #include +#include #include -#include "testing_helpers.hpp" +#include +#include + +constexpr int sizePOT = 1 << 26; // feel free to change the size of array +constexpr int sizeNPOT = sizePOT - 3; // Non-Power-Of-Two + +/// If true, run additional simpler tests. +constexpr bool runDebugTests = false; + +/// Run benchmarks instead of tests. +constexpr bool runBenchmarks = false; + +/// Print out resulting arrays from computation. +constexpr bool enablePrintingArrays = false; + +constexpr bool enableCPUScan = true; +constexpr bool enableNaiveScan = true; +constexpr bool enableEfficientScan = true; +constexpr bool enableThrustScan = true; + +constexpr bool enableCPUCompact = true; +constexpr bool enableEfficientCompact = true; +constexpr bool enableThrustCompact = true; + +namespace Perf { + +using TimerFn = std::function; +using ScanFn = std::function; +using CompactionFn = std::function; + +enum class Implementation { CPU, Naive, Efficient, Thrust }; + +constexpr int numIterations = 10; +constexpr int maxValue = 50; + +std::pair getScanImplementation(Implementation implementation) { + using namespace StreamCompaction; + + auto cpu = &Common::PerformanceTimer::getCpuElapsedTimeForPreviousOperation; + auto gpu = &Common::PerformanceTimer::getGpuElapsedTimeForPreviousOperation; + + switch (implementation) { + case Implementation::CPU: + return std::make_pair(CPU::scan, std::bind(cpu, &CPU::timer())); + + case Implementation::Naive: + return std::make_pair(Naive::scan, std::bind(gpu, &Naive::timer())); + + case Implementation::Efficient: + return std::make_pair(Efficient::scan, std::bind(gpu, &Efficient::timer())); + + case Implementation::Thrust: + return std::make_pair(Thrust::scan, std::bind(gpu, &Thrust::timer())); + + default: + throw std::invalid_argument("invalid enum"); + } +} + +void runScanBenchmark(Implementation implementation, int n, std::string_view benchmarkName) { + std::string prefix = "[" + std::string(benchmarkName) + "]"; + const auto [scan, getTime] = getScanImplementation(implementation); + + std::vector elapsedTimes; + std::unique_ptr out = std::make_unique(sizePOT); + std::unique_ptr in = std::make_unique(sizePOT); + genArray(sizePOT, in.get(), maxValue); + + for (int i = 1; i <= numIterations; ++i) { + std::cout << prefix << " Executing scan(): " << i << " of " << numIterations << "...\r"; + + scan(n, out.get(), in.get()); + elapsedTimes.push_back(getTime()); + } + + float average = 0.f; + for (const float& time : elapsedTimes) { + average += time; + } + average /= elapsedTimes.size(); + + std::cout << prefix << " Average scan() time: " << average << " " << std::endl; +} -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +} // namespace Perf + +int* a = new int[sizePOT]; +int* b = new int[sizePOT]; +int* c = new int[sizePOT]; int main(int argc, char* argv[]) { - // Scan tests + if constexpr (runBenchmarks) { + printf("********************\n"); + printf("** SCAN BENCHMARK **\n"); + printf("********************\n\n"); - printf("\n"); + std::cout << "- Number of iterations: " << Perf::numIterations << std::endl; + std::cout << "- Size of POT array: " << sizePOT << std::endl; + std::cout << "- Size of NPOT array: " << sizeNPOT << "\n" << std::endl; + + if constexpr (enableCPUScan) { + Perf::runScanBenchmark(Perf::Implementation::CPU, sizePOT, "CPU/POT"); + Perf::runScanBenchmark(Perf::Implementation::CPU, sizeNPOT, "CPU/NPOT"); + } + + if constexpr (enableNaiveScan) { + Perf::runScanBenchmark(Perf::Implementation::Naive, sizePOT, "Naive/POT"); + Perf::runScanBenchmark(Perf::Implementation::Naive, sizeNPOT, "Naive/NPOT"); + } + + if constexpr (enableEfficientScan) { + Perf::runScanBenchmark(Perf::Implementation::Efficient, sizePOT, "Efficient/POT"); + Perf::runScanBenchmark(Perf::Implementation::Efficient, sizeNPOT, "Efficient/NPOT"); + } + + if constexpr (enableThrustScan) { + Perf::runScanBenchmark(Perf::Implementation::Thrust, sizePOT, "Thrust/POT"); + Perf::runScanBenchmark(Perf::Implementation::Thrust, sizeNPOT, "Thrust/NPOT"); + } + } else { printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(sizePOT - 1, a, 50); // Leave a 0 at the end to test that edge case + a[sizePOT - 1] = 0; + printArray(sizePOT, a, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); + zeroArray(sizePOT, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + StreamCompaction::CPU::scan(sizePOT, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + if constexpr (enablePrintingArrays) printArray(sizePOT, b, true); - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + if constexpr (enableCPUScan) { + zeroArray(sizePOT, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + if constexpr (enablePrintingArrays) printArray(sizeNPOT, c, true); + printCmpResult(sizeNPOT, b, c); + } + + if constexpr (enableNaiveScan) { + zeroArray(sizePOT, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(sizePOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, c, true); + printCmpResult(sizePOT, b, c); + + zeroArray(sizePOT, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, c, true); + printCmpResult(sizeNPOT, b, c); + } + + if constexpr (enableEfficientScan) { + zeroArray(sizePOT, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(sizePOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, c, true); + printCmpResult(sizePOT, b, c); + + zeroArray(sizePOT, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizeNPOT, c, true); + printCmpResult(sizeNPOT, b, c); + } + + if constexpr (enableThrustScan) { + zeroArray(sizePOT, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(sizePOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, c, true); + printCmpResult(sizePOT, b, c); + + zeroArray(sizePOT, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizeNPOT, c, true); + printCmpResult(sizeNPOT, b, c); + } + + // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + if constexpr (runDebugTests) { + printf("\n"); + printf("*************************\n"); + printf("** SCAN TESTS (ALL 1s) **\n"); + printf("*************************\n"); + + onesArray(sizePOT, a); + zeroArray(sizePOT, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(sizePOT, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, b, true); + + if constexpr (enableCPUScan) { + zeroArray(sizePOT, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + if constexpr (enablePrintingArrays) printArray(sizeNPOT, c, true); + printCmpResult(sizeNPOT, b, c); + } + + if constexpr (enableNaiveScan) { + zeroArray(sizePOT, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(sizePOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, c, true); + printCmpResult(sizePOT, b, c); + + zeroArray(sizePOT, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, c, true); + printCmpResult(sizeNPOT, b, c); + } + + if constexpr (enableEfficientScan) { + zeroArray(sizePOT, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(sizePOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizePOT, c, true); + printCmpResult(sizePOT, b, c); + + zeroArray(sizePOT, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(sizeNPOT, c, true); + printCmpResult(sizeNPOT, b, c); + } + } + } + + if constexpr (runBenchmarks) { + } else { printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); printf("*****************************\n"); - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(sizePOT - 1, a, 4); // Leave a 0 at the end to test that edge case + a[sizePOT - 1] = 0; + printArray(sizePOT, a, true); int count, expectedCount, expectedNPOT; // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); + zeroArray(sizePOT, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + count = StreamCompaction::CPU::compactWithoutScan(sizePOT, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedCount = count; - printArray(count, b, true); + if constexpr (enablePrintingArrays) printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); - zeroArray(SIZE, c); + zeroArray(sizePOT, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + count = StreamCompaction::CPU::compactWithoutScan(sizeNPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedNPOT = count; - printArray(count, c, true); + if constexpr (enablePrintingArrays) printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + if constexpr (enableCPUCompact) { + zeroArray(sizePOT, b); + printDesc("cpu compact with scan, power-of-two"); + count = StreamCompaction::CPU::compactWithScan(sizePOT, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + expectedCount = count; + if constexpr (enablePrintingArrays) printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(sizePOT, c); + printDesc("cpu compact with scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithScan(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + expectedNPOT = count; + if constexpr (enablePrintingArrays) printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + } + + if constexpr (enableEfficientCompact) { + zeroArray(sizePOT, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(sizePOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(sizePOT, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + } + + if constexpr (enableThrustCompact) { + zeroArray(sizePOT, c); + printDesc("thrust compact, power-of-two"); + count = StreamCompaction::Thrust::compact(sizePOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(sizePOT, c); + printDesc("thrust compact, non-power-of-two"); + count = StreamCompaction::Thrust::compact(sizeNPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + if constexpr (enablePrintingArrays) printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + } + } - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; + delete[] a; + delete[] b; + delete[] c; } diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94aa..bd63a710 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,76 +1,72 @@ #pragma once -#include #include +#include +#include #include #include -#include -template -int cmpArrays(int n, T *a, T *b) { - for (int i = 0; i < n; i++) { - if (a[i] != b[i]) { - printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]); - return 1; - } +template +int cmpArrays(int n, T* a, T* b) { + for (int i = 0; i < n; i++) { + if (a[i] != b[i]) { + printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]); + return 1; } - return 0; + } + return 0; } -void printDesc(const char *desc) { - printf("==== %s ====\n", desc); +void printDesc(const char* desc) { + printf("==== %s ====\n", desc); } -template -void printCmpResult(int n, T *a, T *b) { - printf(" %s \n", - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); +template +void printCmpResult(int n, T* a, T* b) { + printf(" %s \n", cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } -template -void printCmpLenResult(int n, int expN, T *a, T *b) { - if (n != expN) { - printf(" expected %d elements, got %d\n", expN, n); - } - printf(" %s \n", - (n == -1 || n != expN) ? "FAIL COUNT" : - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); +template +void printCmpLenResult(int n, int expN, T* a, T* b) { + if (n != expN) { + printf(" expected %d elements, got %d\n", expN, n); + } + printf(" %s \n", (n == -1 || n != expN) ? "FAIL COUNT" : cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } -void zeroArray(int n, int *a) { - for (int i = 0; i < n; i++) { - a[i] = 0; - } +void zeroArray(int n, int* a) { + for (int i = 0; i < n; i++) { + a[i] = 0; + } } -void onesArray(int n, int *a) { - for (int i = 0; i < n; i++) { - a[i] = 1; - } +void onesArray(int n, int* a) { + for (int i = 0; i < n; i++) { + a[i] = 1; + } } -void genArray(int n, int *a, int maxval) { - srand(time(nullptr)); +void genArray(int n, int* a, int maxval) { + srand(time(nullptr)); - for (int i = 0; i < n; i++) { - a[i] = rand() % maxval; - } + for (int i = 0; i < n; i++) { + a[i] = rand() % maxval; + } } -void printArray(int n, int *a, bool abridged = false) { - printf(" [ "); - for (int i = 0; i < n; i++) { - if (abridged && i + 2 == 15 && n > 16) { - i = n - 2; - printf("... "); - } - printf("%3d ", a[i]); +void printArray(int n, int* a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); } - printf("]\n"); + printf("%3d ", a[i]); + } + printf("]\n"); } -template -void printElapsedTime(T time, std::string note = "") -{ - std::cout << " elapsed time: " << time << "ms " << note << std::endl; +template +void printElapsedTime(T time, std::string note = "") { + std::cout << " elapsed time: " << time << "ms " << note << std::endl; } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..1f2b7092 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,39 +1,43 @@ #include "common.h" -void checkCUDAErrorFn(const char *msg, const char *file, int line) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } - - fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); +void checkCUDAErrorFn(const char* msg, const char* file, int line) { + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } + + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } +namespace StreamCompaction::Common { + +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ +__global__ void kernMapToBoolean(int n, int* bools, const int* idata) { + int tId = (blockIdx.x * blockDim.x) + threadIdx.x; -namespace StreamCompaction { - namespace Common { + if (tId >= n) return; - /** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO - } + bools[tId] = idata[tId] > 0 ? 1 : 0; +} - /** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. - */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO - } +/** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ +__global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices) { + int tId = (blockIdx.x * blockDim.x) + threadIdx.x; - } + if (tId >= n || !bools[tId]) return; + + odata[indices[tId]] = idata[tId]; } + +} // namespace StreamCompaction::Common diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed9..0dd4d356 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,11 +3,12 @@ #include #include -#include -#include -#include #include #include +#include +#include +#include +#include #include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) @@ -16,117 +17,126 @@ /** * Check for CUDA errors; print and exit if there was a problem. */ -void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); +void checkCUDAErrorFn(const char* msg, const char* file = NULL, int line = -1); inline int ilog2(int x) { - int lg = 0; - while (x >>= 1) { - ++lg; - } - return lg; + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; } inline int ilog2ceil(int x) { - return x == 1 ? 0 : ilog2(x - 1) + 1; + return x == 1 ? 0 : ilog2(x - 1) + 1; +} + +inline void printArray(int n, const int* data) { + std::cout << "[ "; + for (int i = 0; i < n; ++i) { + std::cout << data[i] << " "; + } + std::cout << "]" << std::endl; } namespace StreamCompaction { - namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); - - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); - - /** - * This class is used for timing the performance - * Uncopyable and unmovable - * - * Adapted from WindyDarian(https://github.com/WindyDarian) - */ - class PerformanceTimer - { - public: - PerformanceTimer() - { - cudaEventCreate(&event_start); - cudaEventCreate(&event_end); - } - - ~PerformanceTimer() - { - cudaEventDestroy(event_start); - cudaEventDestroy(event_end); - } - - void startCpuTimer() - { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } - cpu_timer_started = true; - - time_start_cpu = std::chrono::high_resolution_clock::now(); - } - - void endCpuTimer() - { - time_end_cpu = std::chrono::high_resolution_clock::now(); - - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } - - std::chrono::duration duro = time_end_cpu - time_start_cpu; - prev_elapsed_time_cpu_milliseconds = - static_cast(duro.count()); - - cpu_timer_started = false; - } - - void startGpuTimer() - { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } - gpu_timer_started = true; - - cudaEventRecord(event_start); - } - - void endGpuTimer() - { - cudaEventRecord(event_end); - cudaEventSynchronize(event_end); - - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } - - cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); - gpu_timer_started = false; - } - - float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 - { - return prev_elapsed_time_cpu_milliseconds; - } - - float getGpuElapsedTimeForPreviousOperation() //noexcept - { - return prev_elapsed_time_gpu_milliseconds; - } - - // remove copy and move functions - PerformanceTimer(const PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; - - private: - cudaEvent_t event_start = nullptr; - cudaEvent_t event_end = nullptr; - - using time_point_t = std::chrono::high_resolution_clock::time_point; - time_point_t time_start_cpu; - time_point_t time_end_cpu; - - bool cpu_timer_started = false; - bool gpu_timer_started = false; - - float prev_elapsed_time_cpu_milliseconds = 0.f; - float prev_elapsed_time_gpu_milliseconds = 0.f; - }; +namespace Common { + +__global__ void kernMapToBoolean(int n, int* bools, const int* idata); + +__global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices); + +/** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ +class PerformanceTimer { + public: + PerformanceTimer() { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() { + if (cpu_timer_started) { + throw std::runtime_error("CPU timer already started"); } -} + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { + throw std::runtime_error("CPU timer not started"); + } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() { + if (gpu_timer_started) { + throw std::runtime_error("GPU timer already started"); + } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { + throw std::runtime_error("GPU timer not started"); + } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() // noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() // noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + + private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; +}; + +} // namespace Common +} // namespace StreamCompaction diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..cfcf27ad 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,50 +1,102 @@ -#include +#include "common.h" #include "cpu.h" -#include "common.h" +#include namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } - - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } +namespace CPU { + +using StreamCompaction::Common::PerformanceTimer; + +namespace { + +inline void scanImplementation(int n, int* odata, const int* idata) { + int currentSum = 0; + for (int i = 0; i < n; ++i) { + odata[i] = currentSum; + currentSum += idata[i]; + } +} + +} // namespace + +PerformanceTimer& timer() { + static PerformanceTimer timer; + return timer; +} + +/** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function + * first. + */ +void scan(int n, int* odata, const int* idata) { + if (n <= 0) return; + + timer().startCpuTimer(); + scanImplementation(n, odata, idata); + timer().endCpuTimer(); +} + +int compact(int n, int* odata, const int* idata) { + return compactWithScan(n, odata, idata); +} + +/** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ +int compactWithoutScan(int n, int* odata, const int* idata) { + if (n <= 0) return 0; + + timer().startCpuTimer(); + + int outputIndex = 0; + for (int inputIndex = 0; inputIndex < n; ++inputIndex) { + int element = idata[inputIndex]; + + if (element > 0) { + odata[outputIndex] = element; + outputIndex++; } + } + + timer().endCpuTimer(); + + return outputIndex; } + +/** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. + */ +int compactWithScan(int n, int* odata, const int* idata) { + if (n <= 0) return 0; + + std::unique_ptr valid = std::make_unique(n); + std::unique_ptr scanResult = std::make_unique(n); + + timer().startCpuTimer(); + + for (int i = 0; i < n; ++i) { + valid[i] = idata[i] > 0 ? 1 : 0; + } + + scanImplementation(n, scanResult.get(), valid.get()); + + for (int i = 0; i < n; ++i) { + if (valid[i] > 0) { + odata[scanResult[i]] = idata[i]; + } + } + + timer().endCpuTimer(); + + return scanResult[n - 1]; +} + +} // namespace CPU +} // namespace StreamCompaction diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c0476..322f3c92 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -3,13 +3,18 @@ #include "common.h" namespace StreamCompaction { - namespace CPU { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace CPU { - void scan(int n, int *odata, const int *idata); +StreamCompaction::Common::PerformanceTimer& timer(); - int compactWithoutScan(int n, int *odata, const int *idata); +void scan(int n, int* odata, const int* idata); - int compactWithScan(int n, int *odata, const int *idata); - } -} +/// By default, `compact` will use `scan` because that is also true for `Efficient::compact`. +int compact(int n, int* odata, const int* idata); + +int compactWithoutScan(int n, int* odata, const int* idata); + +int compactWithScan(int n, int* odata, const int* idata); + +} // namespace CPU +} // namespace StreamCompaction diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..a39a777f 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,40 +1,208 @@ -#include -#include #include "common.h" #include "efficient.h" +#include +#include + +#include +#include +#include + namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } +namespace Efficient { + +namespace { + +bool enableScanMeasure = true; + +} + +using StreamCompaction::Common::PerformanceTimer; + +/// Number of threads per block. +constexpr int blockSize = 512; + +/// Enable `checkCUDAError()` calls within the performance measuring fence. +constexpr bool checkErrorsDuringTimer = true; + +PerformanceTimer& timer() { + static PerformanceTimer timer; + return timer; +} + +__global__ void kernReduceForLayer(int n, int* data, int layer, int stride) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k >= n) return; + + int offset = k * stride; + int previousStride = 1 << layer; + + int rightChild = offset + stride - 1; + int leftChild = offset + previousStride - 1; + + data[rightChild] += data[leftChild]; +} + +__global__ void kernTraverseDownLayer(int n, int* data, int layer, int stride) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k >= n) return; + + int offset = k * stride; + int previousStride = 1 << layer; + + int rightChild = offset + stride - 1; + int leftChild = offset + previousStride - 1; + + int leftValue = data[leftChild]; + data[leftChild] = data[rightChild]; + data[rightChild] += leftValue; +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int* odata, const int* idata) { + int actualN = n; + size_t numBytes = n * sizeof(int); + + std::unique_ptr actualInputData = std::make_unique(n); + std::memcpy(actualInputData.get(), idata, numBytes); + + // Input array size is not a power of two; we have to pad the left with zeroes + std::optional paddingOpt; + if (int numLeaves = 1 << ilog2ceil(n); n < numLeaves) { + int offset = numLeaves - n; + + // Pad to the next power of two + std::unique_ptr paddedInputData = std::make_unique(numLeaves); + std::memcpy(paddedInputData.get() + offset, idata, numBytes); + + paddingOpt = offset; + actualN = numLeaves; + numBytes = numLeaves * sizeof(int); + actualInputData.swap(paddedInputData); + } + + int* dev_data = nullptr; + cudaMalloc(reinterpret_cast(&dev_data), numBytes); + checkCUDAError("cudaMalloc: dev_data failed!"); + cudaMemcpy(dev_data, actualInputData.get(), numBytes, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy: actualInputData -> dev_data failed!"); + + if (enableScanMeasure) timer().startGpuTimer(); + + // Perform up-sweep via parallel reduction + for (int layer = 0; layer < ilog2(actualN); ++layer) { + int stride = 1 << (layer + 1); + int numDispatches = actualN / stride; + int numBlocks = (numDispatches + blockSize - 1) / blockSize; + + kernReduceForLayer<<>>(numDispatches, dev_data, layer, stride); + } + + // Zero out the root + int zero = 0; + cudaMemcpy(dev_data + (actualN - 1), &zero, sizeof(int), cudaMemcpyHostToDevice); + if constexpr (checkErrorsDuringTimer) checkCUDAError("cudaMemcpy: 0 -> dev_data failed!"); + + for (int layer = ilog2(actualN) - 1; layer >= 0; --layer) { + int stride = 1 << (layer + 1); + int numDispatches = actualN / stride; + int numBlocks = (numDispatches + blockSize - 1) / blockSize; + + kernTraverseDownLayer<<>>(numDispatches, dev_data, layer, stride); + } + + if (enableScanMeasure) timer().endGpuTimer(); + + if (paddingOpt) { + // If previously padded, remove extra zeroes + cudaMemcpy(actualInputData.get(), dev_data, numBytes, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy: dev_data -> actualInputData failed!"); + std::memcpy(odata, actualInputData.get() + paddingOpt.value(), n * sizeof(int)); + } else { + cudaMemcpy(odata, dev_data, numBytes, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy: dev_data -> odata failed!"); + } + + cudaFree(dev_data); + checkCUDAError("cudaFree: dev_data failed!"); } + +/** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ +int compact(int n, int* odata, const int* idata) { + int* dev_bools = nullptr; + int* dev_indices = nullptr; + int* dev_odata = nullptr; + int* dev_idata = nullptr; + + int numBlocks = (n + blockSize - 1) / blockSize; + size_t numBytes = n * sizeof(int); + + cudaMalloc(reinterpret_cast(&dev_bools), numBytes); + checkCUDAError("cudaMalloc: dev_bools failed"); + cudaMalloc(reinterpret_cast(&dev_indices), numBytes); + checkCUDAError("cudaMalloc: dev_indices failed"); + cudaMalloc(reinterpret_cast(&dev_odata), numBytes); + checkCUDAError("cudaMalloc: dev_odata failed"); + cudaMalloc(reinterpret_cast(&dev_idata), numBytes); + checkCUDAError("cudaMalloc: dev_idata failed"); + + std::unique_ptr indices = std::make_unique(n); + std::unique_ptr bools = std::make_unique(n); + + cudaMemcpy(dev_idata, idata, numBytes, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy: idata -> dev_idata failed"); + + timer().startGpuTimer(); + + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + cudaMemcpy(bools.get(), dev_bools, numBytes, cudaMemcpyDeviceToHost); + if constexpr (checkErrorsDuringTimer) checkCUDAError("cudaMemcpy: dev_bools -> bools failed"); + + enableScanMeasure = false; + scan(n, indices.get(), bools.get()); + enableScanMeasure = true; + + cudaMemcpy(dev_indices, indices.get(), numBytes, cudaMemcpyHostToDevice); + if constexpr (checkErrorsDuringTimer) checkCUDAError("cudaMemcpy: indices -> dev_indices failed"); + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_indices, numBytes, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy: dev_indices -> odata failed"); + int numRemaining = odata[n - 1]; + + cudaMemcpy(odata, dev_odata, numBytes, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy: dev_odata -> odata failed"); + + cudaFree(dev_bools); + checkCUDAError("cudaFree: dev_bools failed"); + cudaFree(dev_indices); + checkCUDAError("cudaFree: dev_indices failed"); + cudaFree(dev_odata); + checkCUDAError("cudaFree: dev_odata failed"); + cudaFree(dev_idata); + checkCUDAError("cudaFree: dev_idata failed"); + + // Since we're doing an exclusive scan, we need to manually check if the last element is valid + if (bools[n - 1]) { + return numRemaining + 1; + } else { + return numRemaining; + } +} + +} // namespace Efficient +} // namespace StreamCompaction diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..d64c3426 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -3,11 +3,13 @@ #include "common.h" namespace StreamCompaction { - namespace Efficient { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace Efficient { - void scan(int n, int *odata, const int *idata); +StreamCompaction::Common::PerformanceTimer& timer(); - int compact(int n, int *odata, const int *idata); - } -} +void scan(int n, int* odata, const int* idata); + +int compact(int n, int* odata, const int* idata); + +} // namespace Efficient +} // namespace StreamCompaction diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..477b447b 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,25 +1,84 @@ -#include -#include #include "common.h" #include "naive.h" +#include +#include + namespace StreamCompaction { - namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - // TODO: __global__ - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } +namespace Naive { + +using StreamCompaction::Common::PerformanceTimer; + +/// Number of threads per block. +constexpr int blockSize = 64; + +/// Whether to return an inclusive or exclusive scan. +constexpr bool useExclusiveScan = true; + +PerformanceTimer& timer() { + static PerformanceTimer timer; + return timer; +} + +__global__ void kernSumPairsForIteration(int n, const int* in, int* out, int stride) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k >= n) return; + + int outIndex = stride + k; + + out[outIndex] = in[k] + in[outIndex]; +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int* odata, const int* idata) { + int* dev_dataA = nullptr; + int* dev_dataB = nullptr; + + size_t numBytes = n * sizeof(int); + cudaMalloc(reinterpret_cast(&dev_dataA), numBytes); + checkCUDAError("cudaMalloc: dev_dataA failed!"); + cudaMalloc(reinterpret_cast(&dev_dataB), numBytes); + checkCUDAError("cudaMalloc: dev_dataB failed!"); + + cudaMemcpy(dev_dataA, idata, numBytes, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy: idata -> dev_dataA failed!"); + cudaMemcpy(dev_dataB, dev_dataA, numBytes, cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy: dev_dataA -> dev_dataB failed!"); + + timer().startGpuTimer(); + + for (int iteration = 1; iteration <= ilog2ceil(n); ++iteration) { + int stride = 1 << (iteration - 1); + int numDispatches = n - stride; + int numBlocks = (numDispatches + blockSize - 1) / blockSize; + + kernSumPairsForIteration<<>>(numDispatches, dev_dataA, dev_dataB, stride); + + // Write new results back into A to be read from + cudaMemcpy(dev_dataA, dev_dataB, numBytes, cudaMemcpyDeviceToDevice); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_dataA, numBytes, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy: dev_dataA -> odata failed!"); + + // Convert from inclusive scan to exclusive + if constexpr (useExclusiveScan) { + for (int i = n - 1; i > 0; --i) { + odata[i] = odata[i - 1]; } + odata[0] = 0; + } + + cudaFree(dev_dataA); + checkCUDAError("cudaFree: dev_dataA failed!"); + cudaFree(dev_dataB); + checkCUDAError("cudaFree: dev_dataB failed!"); } + +} // namespace Naive +} // namespace StreamCompaction diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb064..65e709e6 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -3,9 +3,11 @@ #include "common.h" namespace StreamCompaction { - namespace Naive { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace Naive { - void scan(int n, int *odata, const int *idata); - } -} +StreamCompaction::Common::PerformanceTimer& timer(); + +void scan(int n, int* odata, const int* idata); + +} // namespace Naive +} // namespace StreamCompaction diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..6074c0fd 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -1,28 +1,67 @@ +#include "common.h" +#include "thrust.h" + #include #include + #include +#include #include +#include #include -#include "common.h" -#include "thrust.h" namespace StreamCompaction { - namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); - } - } +namespace Thrust { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() { + static PerformanceTimer timer; + return timer; } + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int* odata, const int* idata) { + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + + timer().startGpuTimer(); + + thrust::exclusive_scan(thrust::device, dv_in.begin(), dv_in.end(), dv_out.begin()); + + timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); +} + +struct isNotValid { + __host__ __device__ bool operator()(const int& element) { return element == 0; } +}; + +/** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ +int compact(int n, int* odata, const int* idata) { + thrust::device_vector dv_in(idata, idata + n); + + timer().startGpuTimer(); + + auto newEndIt = thrust::remove_if(thrust::device, dv_in.begin(), dv_in.end(), isNotValid()); + + timer().endGpuTimer(); + + thrust::copy(dv_in.begin(), newEndIt, odata); + + return thrust::distance(dv_in.begin(), newEndIt); +} + +} // namespace Thrust +} // namespace StreamCompaction diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index fe98206b..a9b0d3d0 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -3,9 +3,13 @@ #include "common.h" namespace StreamCompaction { - namespace Thrust { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace Thrust { - void scan(int n, int *odata, const int *idata); - } -} +StreamCompaction::Common::PerformanceTimer& timer(); + +void scan(int n, int* odata, const int* idata); + +int compact(int n, int* odata, const int* idata); + +} // namespace Thrust +} // namespace StreamCompaction