diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69e2cba..63a9de0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,8 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")
+
 # Enable C++11 for host code
 set(CMAKE_CXX_STANDARD 11)
 if(NOT DEFINED CMAKE_CUDA_STANDARD)
@@ -45,4 +47,13 @@ source_group(Sources FILES ${sources})
 
 add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers})
 target_link_libraries(${CMAKE_PROJECT_NAME} stream_compaction)
+if(CMAKE_VERSION VERSION_LESS "3.23.0")
+    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES OFF)
+elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
+    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES all-major)
+else()
+    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native)
+endif()
+target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:Debug,RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-G>")
 set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME})
+
diff --git a/README.md b/README.md
index 0e38ddb..40eba04 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,341 @@
 CUDA Stream Compaction
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**  
+**Author:** Yi Liu
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+## Tested On
+- **OS:** Windows 11 Home Version 24H2 (OS Build 26100.4061)  
+- **CPU:** Intel(R) Core(TM) i9-14900K @ 3.20GHz, 24 cores / 32 threads  
+- **RAM:** 64 GB  
+- **GPU:** NVIDIA GeForce RTX 4090, 24 GB VRAM  
+- **Environment:** Visual Studio 2022, CUDA 12.6, CMake 3.27
+---
 
-### (TODO: Your README)
+## 🧩 Overview
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+This project focuses on implementing and optimizing **stream compaction** using CUDA. Stream compaction is a fundamental parallel algorithm that removes unwanted elements (e.g., zeros) from an array while preserving the order of the remaining elements.
 
+To achieve this, multiple versions of **prefix sum (scan)** were implemented and benchmarked, as scan is a key building block in efficient stream compaction.
+
+The primary goals of this project were:
+- Implement different versions of stream compaction on the GPU.
+- Compare them against a CPU baseline.
+- Investigate performance characteristics and bottlenecks.
+
+---
+
+## ✅ Features Implemented
+
+### 1. **CPU Stream Compaction (Baseline)**
+- Simple sequential compaction for correctness verification and timing comparison.
+- Iterates through input array, writing out only non-zero elements.
+
+### 2. **GPU Stream Compaction with Scan**
+- Performs compaction in three steps:
+  1. **Map to Boolean**: Marks non-zero entries as 1 and zeros as 0.
+  2. **Exclusive Scan**: Computes destination indices for the non-zero values.
+  3. **Scatter**: Writes non-zero values to compacted output using scanned indices.
+
+### 3. **Scan Implementations (for Compaction Support)**
+
+#### • Naive GPU Scan
+- Textbook parallel scan with \( O(n log n) \) operations.
+- New kernel launched at every depth level.
+
+#### • Work-Efficient GPU Scan (Blelloch)
+- Upsweep and downsweep phases for \( O(n) \) total work.
+- Multiple kernel launches for each pass.
+
+#### • Shared Memory Optimized Scan
+- Uses block-level shared memory to reduce global memory traffic.
+- Suitable for small arrays or single-block inputs.
+
+#### • Thrust Library Scan
+- Calls `thrust::exclusive_scan()` as a baseline for optimized GPU performance.
+
+### 4. **GPU Radix Sort**
+- Implements a multi-pass Least Significant Bit (LSB) radix sort for 32-bit integers.
+- Each pass processes a single bit of all elements, classifying values based on 0 or 1 at that bit.
+- Performs bit extraction, scan on inverted bits, and scatter to sort data based on the current bit.
+- Uses the previously implemented work-efficient scan as a prefix sum primitive.
+- Buffers are swapped each round to maintain sorted data across 32 passes.
+- Suitable for sorting integer keys; stable and parallel-friendly.
+
+
+---
+
+## Test Cases Summary
+
+This project includes a comprehensive suite of test cases across **scan**, **stream compaction**, and **radix sort**, covering both correctness and performance. Each test is designed to validate behavior on different input types, sizes (including power-of-two and non-power-of-two), and algorithm variants.
+
+### Scan Tests
+Tested scan variants include CPU, naive GPU, work-efficient GPU, Thrust, and shared memory scans.
+
+| Test Description                               | Input Size         | Notes                                                |
+|------------------------------------------------|--------------------|------------------------------------------------------|
+| CPU scan                                       | 2¹⁹ = 524,288      | Baseline for correctness and performance             |
+| Naive GPU scan (power-of-two / non-power-of-two) | 2¹⁹ and 393,931  | Launches multiple kernels per level                 |
+| Work-efficient GPU scan (power-of-two / NPOT)  | 2¹⁹ and 393,931    | Implements upsweep/downsweep                        |
+| Thrust GPU scan (power-of-two / NPOT)          | 2¹⁹ and 393,931    | Uses `thrust::exclusive_scan`                       |
+| Shared memory naive scan                       | 512, 500, 32       | Small sizes for validating shared mem behavior      |
+| Shared memory efficient scan                   | 512, 32            | Efficient scan in shared memory with loop unrolling |
+
+
+### Stream Compaction Tests
+Tested both CPU and GPU compaction with and without scan, for both power-of-two and non-power-of-two sizes.
+
+| Test Description                               | Input Size         | Notes                                                |
+|------------------------------------------------|--------------------|------------------------------------------------------|
+| CPU compaction without scan (POT / NPOT)       | 524,288 / 393,931  | Sequential traversal                                 |
+| CPU compaction with scan                       | 524,288            | Uses scan + scatter                                  |
+| Work-efficient GPU compaction (POT / NPOT)     | 524,288 / 393,931  | Uses map-to-boolean + scan + scatter                |
+
+### Radix Sort Tests
+Includes correctness tests for various distributions and sizes, with special cases and large arrays.
+
+| Test Case Description                          | Input Size         | Notes                                                |
+|------------------------------------------------|--------------------|------------------------------------------------------|
+| Random values                                  | 10                 | Basic unsorted small input                          |
+| Already sorted                                 | 8                  | Best-case scenario                                  |
+| Reverse sorted                                 | 8                  | Worst-case scenario                                 |
+| Identical values                               | 6                  | Edge case: no change after sort                     |
+| Contains duplicates                            | 9                  | Tests stable ordering                               |
+| Large array (power-of-two)                     | 65,536             | Stress test for GPU sort                            |
+| Large array (non-power-of-two)                 | 65,519             | Non-POT performance and correctness                 |
+| Nearly sorted array with random swaps          | 16,384             | Realistic scenario with small local disorder        |
+
+---
+
+## Test Output Summary
+Below are the results from running the full test suite, including scan, stream compaction, and radix sort performance and correctness checks.
+
+<details>
+<summary><strong>Click to expand full test log</strong></summary>
+
+```plaintext
+****************
+** SCAN TESTS **
+****************
+    [  13  15  38  42  24   6  45   5  44  36  49  23  31 ...   8   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 175.799ms    (std::chrono Measured)
+    [   0  13  28  66 108 132 138 183 188 232 268 317 340 ... -1007801046 -1007801038 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 176.075ms    (std::chrono Measured)
+    [   0  13  28  66 108 132 138 183 188 232 268 317 340 ... -1007801093 -1007801087 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 34.0388ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 33.9692ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 13.3369ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 12.9604ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 1.2496ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 1.21722ms    (CUDA Measured)
+    passed
+==== shared memory naive scan, power-of-two ====
+   elapsed time: 0.005248ms    (CUDA Measured)
+    passed
+==== shared memory naive scan, non-power-of-two ====
+   elapsed time: 0.00512ms    (CUDA Measured)
+    passed
+==== shared memory naive scan, small manual ====
+   elapsed time: 0.004096ms    (CUDA Measured)
+    [   0   0   1   3   6  10  10  11  13  16  20  20  21 ...  60  60 ]
+    passed
+==== shared memory efficient scan, power-of-two ====
+   elapsed time: 0.00512ms    (CUDA Measured)
+    passed
+==== shared memory efficient scan, small manual ====
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   1   2   2   2   0   2   3   1   0   3   3   0 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 239.519ms    (std::chrono Measured)
+    [   2   1   2   2   2   2   3   1   3   3   1   3   1 ...   3   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 238.586ms    (std::chrono Measured)
+    [   2   1   2   2   2   2   3   1   3   3   1   3   1 ...   2   3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 217.626ms    (std::chrono Measured)
+    [   2   1   2   2   2   2   3   1   3   3   1   3   1 ...   3   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 33.9799ms    (CUDA Measured)
+    [   2   1   2   2   2   2   3   1   3   3   1   3   1 ...   3   2 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 33.5489ms    (CUDA Measured)
+    [   2   1   2   2   2   2   3   1   3   3   1   3   1 ...   2   3 ]
+    passed
+
+***********************
+** RADIX SORT TESTS  **
+***********************
+==== radix sort - random ints ====
+    [   0   1   2   3   4   5   6   7   8   9 ]
+    passed
+==== radix sort - already sorted ====
+    [   0   1   2   3   4   5   6   7 ]
+    passed
+==== radix sort - reverse sorted ====
+    [   0   1   2   3   4   5   6   7 ]
+    passed
+==== radix sort - identical elements ====
+    [  42  42  42  42  42  42 ]
+    passed
+==== radix sort - contains duplicates ====
+    [   0   1   1   2   3   3   5   7   9 ]
+    passed
+==== radix sort - large array (pow2) ====
+   elapsed time: 0.174848ms    (CUDA Measured)
+    passed
+==== radix sort - large array (non-pow2) ====
+   elapsed time: 0.200704ms    (CUDA Measured)
+    passed
+==== radix sort - nearly sorted with random swaps ====
+   elapsed time: 0.233664ms    (CUDA Measured)
+    passed
+```
+</details> 
+
+---
+
+## Scan Runtime Analysis
+
+The figure below shows the elapsed time (in milliseconds) of four scan implementations measured across input sizes ranging from \(2^{13}\) to \(2^{27}\), using a fixed block size of 256. All tests were run in **Release mode** to ensure optimized performance, particularly for the Thrust-based implementation.
+
+![Scan Elapsed Time Plot](visualization/scan_size_plot_13_27.png)
+
+### Key Observations
+
+#### Performance at Small Sizes (\(2^{13} → 2^{17}\))
+- **Thrust Scan** performs exceptionally well, benefiting from Release-mode optimizations that eliminate overhead seen in debug builds.
+- **Naive GPU Scan** outperforms the Efficient GPU Scan in this range. This may be due to:
+  - Fewer kernel launches (one per depth level with simpler memory patterns)
+  - Lower overhead from global memory usage per pass
+- **CPU Scan** starts very fast but scales poorly beyond this point.
+
+#### Mid-Range Sizes (\(2^{18} → 2^{20}\))
+- **Thrust Scan** continues to lead, maintaining low latency while other methods begin to scale more steeply.
+- **Naive GPU Scan** begins to show inefficiencies due to redundant memory access and higher algorithmic complexity.
+
+#### Large Sizes (\(2^{21} → 2^{27}\))
+- **Thrust Scan** remains the fastest and scales efficiently, highlighting its well-optimized internal operations.
+- **Naive GPU Scan** slows down due to its \(O(n log n)\) complexity and less efficient memory use.
+- **CPU Scan** becomes the slowest by far, with consistent linear growth.
+
+Overall, Thrust offers the best performance across all input sizes when compiled in Release mode, while the Efficient GPU Scan provides a solid custom alternative with strong performance at small to mid-range sizes. The CPU scan, although fast for small inputs, follows a linear \(O(n)\) time complexity and becomes the slowest as input sizes grow.
+
+---
+
+## Compact Runtime Analysis
+
+The plot below shows the runtime performance (in milliseconds) of three different stream compaction implementations as a function of input size \(N\), ranging from \(2^5\) to \(2^{27}\). All tests were conducted using a fixed CUDA block size of 256 and compiled in **Release mode** to ensure optimal performance.
+
+![Compact Elapsed Time Plot](visualization/compact_size_plot.png)
+
+### Key Observations
+
+#### CPU vs CPU with Scan
+- For small input sizes (\(N < 2^{17}\)), **CPU with Scan** actually performs slightly better than pure CPU compaction.
+- As input size increases beyond \(2^{17}\), the runtimes of both CPU variants gradually converge, showing similar linear growth.
+- Both exhibit consistent linear growth on the log-log plot, confirming the expected **\(O(n)\)** time complexity for serial execution.
+
+#### Efficient GPU Scan
+- The GPU implementation shows near-constant runtime across small and mid-sized inputs (up to \(2^{21}\)), demonstrating its scalability due to parallel execution and efficient memory usage.
+- Beyond \(2^{21}\), the runtime begins to increase gradually. This likely reflects:
+  - The need to process more data in global memory
+  - Increased number of kernel launches
+- Even at \(N = 2^{27}\), GPU runtime remains well under 50 ms — significantly outperforming the CPU implementations, which exceed 100 ms at that scale.
+
+---
+
+## Efficient GPU Scan Runtime Analysis (Global Memory Implementation)
+
+The bar chart below displays the runtime (in milliseconds) of a work-efficient Blelloch-style GPU scan implementation that operates entirely in global memory. No shared memory or warp-level primitives are used. 
+
+![Efficient Scan Bar Plot](visualization/efficient_scan_bar_plot.png)
+
+### Key Observations
+
+- For mid-sized inputs (\(2^7\) to \(2^9\)), the scan shows consistently low runtimes (~0.07 ms), indicating effective thread-level parallelism despite global memory latency.
+- Noticeable spikes occur at sizes \(2^6\), \(2^{10}\), and \(2^{12}\), where runtime increases by 2–3× compared to neighboring sizes. These performance dips likely stem from:
+  - **Uncoalesced memory access** due to thread divergence at these specific sizes
+  - **Extra overhead** from partial warp utilization or thread underpopulation in early/late stages of the scan
+  - **Depth-related kernel launches**: For \(N = 2^{13}\), the number of upsweep and downsweep steps increases, amplifying launch and global memory access costs
+- The runtime drop at \(2^{11}\) suggests that thread/block configuration for that size aligns better with the kernel design, temporarily improving efficiency.
+
+### Best Performance
+
+The most optimal performance is observed between \(2^7\) and \(2^9\), where the runtime stabilizes around **0.07 ms**. This range can be considered the **sweet spot** for this global memory-based scan implementation, offering the lowest latency and most consistent performance across all tested input sizes. 
+
+---
+
+## Why Is The Efficient GPU Scan Sometimes Slower Than CPU and Even Slower Than Naive?
+
+The efficient GPU scan, although theoretically better in terms of work complexity \(O(n)\), is sometimes slower than both the CPU scan and the naive GPU scan. After analyzing my implementation and reviewing how GPUs work, here is my explanation:
+
+### kernel launch overhead becomes significant
+In my implementation, for each depth level of both upsweep and downsweep, I launch a separate kernel. Since the depth is \(log_2(n)\), I am launching multiple kernels even for small input sizes.
+On GPUs, kernel launches have some non-negligible overhead, especially when the amount of computation per launch is small. When input size is small, the kernel launch overhead actually dominates the total runtime.
+In contrast, CPU scan uses a simple loop without any such overhead.
+
+### thread utilization becomes poor at deeper levels
+At each level \(d\), the number of active threads launched is reduced:
+- At the first level, many threads are working.
+- As \(d\) increases, fewer and fewer threads are needed.
+- For very deep levels, I might be launching blocks where only a few threads are doing useful work while many others are idle.
+
+This results in low occupancy and poor utilization of GPU resources. Even though many threads exist, most of them are effectively "lazy" and just sitting idle at deeper levels.
+
+### global memory access pattern is inefficient
+In each kernel, threads read and write data directly to global memory:
+- The access pattern involves strided addresses like `data[k + (1 << d) - 1]` and `data[k + (1 << (d + 1)) - 1]`.
+- As \(d\) increases, these strides get larger, and global memory accesses become more scattered and less coalesced.
+- Uncoalesced memory access wastes memory bandwidth and increases latency.
+
+On the CPU, memory access is sequential and benefits from caching, which makes its memory much more efficient.
+
+## Summary Table
+
+| Factor | CPU Scan | Naive GPU Scan | Efficient GPU Scan |
+|--------|----------|----------------|---------------------|
+| Complexity | \(O(n)\) | \(O(n log n)\) | \(O(n)\) |
+| Memory Pattern | Sequential | Coalesced | Strided (degrades at deeper levels) |
+| Kernel Launches | 1 | \(log n\) | \(2 log n\) |
+| Warp Utilization | N/A | High | Drops significantly at deeper levels |
+| Global Memory Traffic | Low | Moderate | High |
+
+---
+
+## Extra Credit Summary
+
+For the extra credit, I implemented several additional features and optimizations beyond the baseline requirements:
+
+1. **Shared Memory Optimized Naive Scan (`sharednaivemem.cu/h`)**  
+   - Added shared memory support to the naive scan to reduce global memory traffic and improve performance on small input sizes.
+
+2. **Shared Memory Optimized Work-Efficient Scan (`sharedefficientmem.cu/h`)**  
+   - Applied shared memory to the work-efficient scan, optimizing intra-block upsweep and downsweep phases to minimize global memory accesses.
+
+3. **Radix Sort Implementation (`radix.cu/h`)**  
+   - Implemented Least Significant Bit (LSB) Radix Sort using scan as a building block to perform bitwise sorting across 32 passes.
+
+4. **Extra Credit Question: Why Efficient Scan Can Be Slower**  
+   - Provided a explanation on why the efficient scan may perform worse than naive scan or CPU scan for certain input sizes, due to factors such as kernel launch overhead, low thread occupancy at deeper levels, and non-coalesced memory accesses.
diff --git a/nsightoutput.ncu-rep b/nsightoutput.ncu-rep
new file mode 100644
index 0000000..8493b95
Binary files /dev/null and b/nsightoutput.ncu-rep differ
diff --git a/out.ncu-rep b/out.ncu-rep
new file mode 100644
index 0000000..42a7bc5
Binary files /dev/null and b/out.ncu-rep differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..3628d00 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,4 +1,4 @@
-/**
+﻿/**
  * @file      main.cpp
  * @brief     Stream compaction test program
  * @authors   Kai Ninomiya
@@ -11,15 +11,53 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/radix.h>
+#include <stream_compaction/sharednaivemem.h>
+#include <stream_compaction/sharedefficientmem.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <filesystem>  // C++17 required
+#include <direct.h>    // For _getcwd()
+
+
+
+void ensureFileExists(const std::string& filename) {
+    std::ifstream infile(filename);
+    if (!infile.good()) {
+        std::ofstream outfile(filename);  // Create the file
+        if (outfile) {
+            std::cout << "[INFO] File created: " << filename << std::endl;
+        }
+        else {
+            std::cerr << "[ERROR] Failed to create file: " << filename << std::endl;
+        }
+    }
+    else {
+        std::cout << "[INFO] File already exists: " << filename << std::endl;
+    }
+}
+
+
+
+void logTestCaseRuntimeToCSV(const std::string& filename, int size, float cpu, float naive, float efficient, float thrust) {
+    std::ofstream fout(filename, std::ios::app); // append mode
+    if (!fout.is_open()) {
+        std::cerr << "[ERROR] Could not open " << filename << " for logging scan test results." << std::endl;
+        return;
+    }
+
+    fout << size << "," << cpu << "," << naive << "," << efficient << "," << thrust << std::endl;
+    fout.close();
+}
+
+
+void runTests(int* a, int* b, int* c, int SIZE) {
+
+    int NPOT = SIZE - 3;
 
-int main(int argc, char* argv[]) {
     // Scan tests
 
     printf("\n");
@@ -37,7 +75,8 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    float cpu_scan = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    printElapsedTime(cpu_scan, "(std::chrono Measured)");
     printArray(SIZE, b, true);
 
     zeroArray(SIZE, c);
@@ -50,16 +89,11 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    float naive_scan = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+    printElapsedTime(naive_scan, "(CUDA Measured)");
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
-
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
@@ -70,7 +104,8 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    float eff_scan = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+    printElapsedTime(eff_scan, "(CUDA Measured)");
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
@@ -84,10 +119,15 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    float thrust_scan = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+    printElapsedTime(thrust_scan, "(CUDA Measured)");
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
+    const std::string scanLogFile = "C:\\Users\\thero\\OneDrive\\Documents\\GitHub\\Project2-Stream-Compaction\\visualization\\scan_size_results.csv";
+    logTestCaseRuntimeToCSV(scanLogFile, SIZE, cpu_scan, naive_scan, eff_scan, thrust_scan);
+
+
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
@@ -95,6 +135,93 @@ int main(int argc, char* argv[]) {
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
+    const int SAFE_SIZE = 512;     // Power-of-two and ≤ 1024
+    const int SAFE_NPOT = 500;     // Non-power-of-two and ≤ 1024
+
+    // ======= shared memory naive scan, power-of-two =======
+    {
+        int* a = new int[SAFE_SIZE];
+        int* b = new int[SAFE_SIZE];
+        int* c = new int[SAFE_SIZE];
+
+        genArray(SAFE_SIZE, a, 50);
+        StreamCompaction::CPU::scan(SAFE_SIZE, b, a);
+
+        zeroArray(SAFE_SIZE, c);
+        printDesc("shared memory naive scan, power-of-two");
+        StreamCompaction::SharedNaiveMem::scanSharedNaive(SAFE_SIZE, c, a);
+        printElapsedTime(StreamCompaction::SharedNaiveMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        printCmpResult(SAFE_SIZE, b, c);
+
+        delete[] a;
+        delete[] b;
+        delete[] c;
+    }
+
+    // ======= shared memory naive scan, non-power-of-two =======
+    {
+        int* a = new int[SAFE_NPOT];
+        int* b = new int[SAFE_NPOT];
+        int* c = new int[SAFE_NPOT];
+
+        genArray(SAFE_NPOT, a, 50);
+        StreamCompaction::CPU::scan(SAFE_NPOT, b, a);
+
+        zeroArray(SAFE_NPOT, c);
+        printDesc("shared memory naive scan, non-power-of-two");
+        StreamCompaction::SharedNaiveMem::scanSharedNaive(SAFE_NPOT, c, a);
+        printElapsedTime(StreamCompaction::SharedNaiveMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        printCmpResult(SAFE_NPOT, b, c);
+
+        delete[] a;
+        delete[] b;
+        delete[] c;
+    }
+
+
+    {
+        const int N = 32;
+        int input[N], output[N], expected[N];
+        for (int i = 0; i < N; ++i) input[i] = i % 5;
+        printDesc("shared memory naive scan, small manual");
+        StreamCompaction::CPU::scan(N, expected, input);
+        StreamCompaction::SharedNaiveMem::scanSharedNaive(N, output, input);
+        printElapsedTime(StreamCompaction::SharedEfficientMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        printArray(N, output, true);
+        printCmpResult(N, expected, output);
+    }
+
+    {
+        int* a = new int[SAFE_SIZE];
+        int* b = new int[SAFE_SIZE];
+        int* c = new int[SAFE_SIZE];
+
+        genArray(SAFE_SIZE, a, 50);
+        StreamCompaction::CPU::scan(SAFE_SIZE, b, a);
+
+        zeroArray(SAFE_SIZE, c);
+        printDesc("shared memory efficient scan, power-of-two");
+        StreamCompaction::SharedEfficientMem::scanSharedEfficient(SAFE_SIZE, c, a);
+        printElapsedTime(StreamCompaction::SharedEfficientMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        printCmpResult(SAFE_SIZE, b, c);
+
+        delete[] a;
+        delete[] b;
+        delete[] c;
+    }
+
+
+    {
+        const int N = 32;
+        int input[N], output[N], expected[N];
+        for (int i = 0; i < N; ++i) input[i] = i % 4;
+        StreamCompaction::CPU::scan(N, expected, input);
+        StreamCompaction::SharedEfficientMem::scanSharedEfficient(N, output, input);
+        printDesc("shared memory efficient scan, small manual");
+        printCmpResult(N, expected, output);
+    }
+
+
     printf("\n");
     printf("*****************************\n");
     printf("** STREAM COMPACTION TESTS **\n");
@@ -113,7 +240,8 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, b);
     printDesc("cpu compact without scan, power-of-two");
     count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    float cpu_compact = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    printElapsedTime(cpu_compact, "(std::chrono Measured)");
     expectedCount = count;
     printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
@@ -129,26 +257,214 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    float cpu_compact_wScan = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    printElapsedTime(cpu_compact_wScan, "(std::chrono Measured)");
     printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
+
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    float eff_compact = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+    printElapsedTime(eff_compact, "(CUDA Measured)");
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
-    system("pause"); // stop Win32 console from closing on exit
+    const std::string compactLogFile = "C:\\Users\\thero\\OneDrive\\Documents\\GitHub\\Project2-Stream-Compaction\\visualization\\compact_size_results.csv";
+    logTestCaseRuntimeToCSV(compactLogFile, SIZE, cpu_compact, cpu_compact_wScan, eff_compact, 0);
+
+    printf("\n");
+    printf("***********************\n");
+    printf("** RADIX SORT TESTS  **\n");
+    printf("***********************\n");
+
+    // Test 1: Random integers
+    {
+        const int N = 10;
+        int input[N] = { 3, 1, 4, 9, 2, 0, 5, 8, 7, 6 };
+        int output[N], expected[N];
+        memcpy(expected, input, sizeof(int) * N);
+        std::sort(expected, expected + N);
+
+        StreamCompaction::Radix::sort(N, output, input);
+        printDesc("radix sort - random ints");
+        printArray(N, output, true);
+        printCmpResult(N, expected, output);
+    }
+
+    // Test 2: Already sorted
+    {
+        const int N = 8;
+        int input[N] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+        int output[N], expected[N];
+        memcpy(expected, input, sizeof(int) * N);
+
+        StreamCompaction::Radix::sort(N, output, input);
+        printDesc("radix sort - already sorted");
+        printArray(N, output, true);
+        printCmpResult(N, expected, output);
+    }
+
+    // Test 3: Reverse sorted
+    {
+        const int N = 8;
+        int input[N] = { 7, 6, 5, 4, 3, 2, 1, 0 };
+        int output[N], expected[N] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+
+        StreamCompaction::Radix::sort(N, output, input);
+        printDesc("radix sort - reverse sorted");
+        printArray(N, output, true);
+        printCmpResult(N, expected, output);
+    }
+
+    // Test 4: All same
+    {
+        const int N = 6;
+        int input[N] = { 42, 42, 42, 42, 42, 42 };
+        int output[N], expected[N] = { 42, 42, 42, 42, 42, 42 };
+
+        StreamCompaction::Radix::sort(N, output, input);
+        printDesc("radix sort - identical elements");
+        printArray(N, output, true);
+        printCmpResult(N, expected, output);
+    }
+
+    // Test 5: Contains duplicates
+    {
+        const int N = 9;
+        int input[N] = { 5, 3, 3, 7, 1, 1, 2, 9, 0 };
+        int output[N], expected[N];
+        memcpy(expected, input, sizeof(int) * N);
+        std::sort(expected, expected + N);
+
+        StreamCompaction::Radix::sort(N, output, input);
+        printDesc("radix sort - contains duplicates");
+        printArray(N, output, true);
+        printCmpResult(N, expected, output);
+    }
+
+
+    // Test 6: Large array (power-of-two size)
+    {
+        const int N = 1 << 16; // 65536
+        int* input = new int[N];
+        int* output = new int[N];
+        int* expected = new int[N];
+
+        genArray(N, input, 10000);  // Fill with random ints in range [0, 9999]
+        memcpy(expected, input, sizeof(int) * N);
+        std::sort(expected, expected + N);
+
+        printDesc("radix sort - large array (pow2)");
+        StreamCompaction::Radix::sort(N, output, input);
+        printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        printCmpResult(N, expected, output);
+
+        delete[] input;
+        delete[] output;
+        delete[] expected;
+    }
+
+    // Test 7: Large array (non-power-of-two size)
+    {
+        const int N = (1 << 16) - 17; // 65519
+        int* input = new int[N];
+        int* output = new int[N];
+        int* expected = new int[N];
+
+        genArray(N, input, 50000);  // Wider range of input
+        memcpy(expected, input, sizeof(int) * N);
+        std::sort(expected, expected + N);
+
+        printDesc("radix sort - large array (non-pow2)");
+        StreamCompaction::Radix::sort(N, output, input);
+        printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        printCmpResult(N, expected, output);
+
+        delete[] input;
+        delete[] output;
+        delete[] expected;
+    }
+
+    // Test 8: Nearly sorted array with minor disorder
+    {
+        const int N = 1 << 14; // 16384
+        int* input = new int[N];
+        int* output = new int[N];
+        int* expected = new int[N];
+
+        // Generate ascending array
+        for (int i = 0; i < N; ++i) {
+            input[i] = i;
+        }
+
+        // Add small random noise
+        for (int i = 0; i < 100; ++i) {
+            int x = rand() % N;
+            int y = rand() % N;
+            std::swap(input[x], input[y]);
+        }
+
+        memcpy(expected, input, sizeof(int) * N);
+        std::sort(expected, expected + N);
+
+        printDesc("radix sort - nearly sorted with random swaps");
+        StreamCompaction::Radix::sort(N, output, input);
+        printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        printCmpResult(N, expected, output);
+
+        delete[] input;
+        delete[] output;
+        delete[] expected;
+    }
+
     delete[] a;
     delete[] b;
     delete[] c;
+
+}
+
+
+
+int main(int argc, char* argv[]) {
+    const std::string scanLogFile = "C:\\Users\\thero\\OneDrive\\Documents\\GitHub\\Project2-Stream-Compaction\\visualization\\scan_size_results.csv";
+    std::ofstream fout(scanLogFile);
+    fout << "N,CPU(ms),Naive(ms),Efficient(ms),Thrust(ms)" << std::endl;
+    fout.close();
+
+
+
+    const std::string compactLogFile = "C:\\Users\\thero\\OneDrive\\Documents\\GitHub\\Project2-Stream-Compaction\\visualization\\compact_size_results.csv";
+    std::ofstream fout_2(compactLogFile);
+    fout_2 << "N,CPU_NoScan(ms),CPU_WithScan(ms),GPU_Efficient(ms),placeholder" << std::endl;
+    fout_2.close();
+
+
+    //for (int i = 0; i < 5; i++) {
+    //    const int SIZE = 1 << (5 + i); 
+    //    int* a = new int[SIZE];
+    //    int* b = new int[SIZE];
+    //    int* c = new int[SIZE];
+
+    //    runTests(a, b, c, SIZE);
+    //}
+
+
+    const int SIZE = 1 << 20;
+    int* a = new int[SIZE];
+    int* b = new int[SIZE];
+    int* c = new int[SIZE];
+
+    runTests(a, b, c, SIZE);
+
+    system("pause"); // stop Win32 console from closing on exit
+
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 19511ca..e0ec27c 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -21,12 +21,3 @@ source_group(Headers FILES ${headers})
 source_group(Sources FILES ${sources})
 
 add_library(stream_compaction ${sources} ${headers})
-if(CMAKE_VERSION VERSION_LESS "3.23.0")
-    set_target_properties(stream_compaction} PROPERTIES CUDA_ARCHITECTURES OFF)
-elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
-    set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major)
-else()
-    set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES native)
-endif()
-target_compile_options(stream_compaction PRIVATE "$<$<AND:$<CONFIG:Debug,RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>")
-target_compile_options(stream_compaction PRIVATE "$<$<AND:$<CONFIG:Release>,$<COMPILE_LANGUAGE:CUDA>>:-lineinfo;-src-in-ptx>")
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..29fcd44 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,6 +1,6 @@
 #include "common.h"
 
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+void checkCUDAErrorFn(const char* msg, const char* file, int line) {
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess == err) {
         return;
@@ -22,17 +22,25 @@ namespace StreamCompaction {
          * Maps an array to an array of 0s and 1s for stream compaction. Elements
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
-        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+        __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
             // TODO
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i < n) {
+                bools[i] = idata[i] != 0;
+            }
         }
 
         /**
          * Performs scatter on an array. That is, for each element in idata,
          * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
          */
-        __global__ void kernScatter(int n, int *odata,
-                const int *idata, const int *bools, const int *indices) {
+        __global__ void kernScatter(int n, int* odata,
+            const int* idata, const int* bools, const int* indices) {
             // TODO
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i < n && bools[i]) {
+                odata[indices[i]] = idata[i];
+            }
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..3eeb776 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -17,9 +17,20 @@ namespace StreamCompaction {
          * For performance analysis, this is supposed to be a simple for loop.
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
-        void scan(int n, int *odata, const int *idata) {
+        void scan(int n, int* odata, const int* idata) {
             timer().startCpuTimer();
             // TODO
+
+            if (n == 0) {
+                timer().endCpuTimer();
+                return;
+            }
+
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
+
             timer().endCpuTimer();
         }
 
@@ -28,11 +39,18 @@ namespace StreamCompaction {
          *
          * @returns the number of elements remaining after compaction.
          */
-        int compactWithoutScan(int n, int *odata, const int *idata) {
+        int compactWithoutScan(int n, int* odata, const int* idata) {
             timer().startCpuTimer();
             // TODO
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[count++] = idata[i];
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -40,11 +58,31 @@ namespace StreamCompaction {
          *
          * @returns the number of elements remaining after compaction.
          */
-        int compactWithScan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
-            return -1;
+        int compactWithScan(int n, int* odata, const int* idata) {
+            int* bools = new int[n];
+            int* indices = new int[n];
+
+            // Map: mark 1 if non-zero, else 0
+            for (int i = 0; i < n; i++) {
+                bools[i] = (idata[i] != 0) ? 1 : 0;
+            }
+
+            // Scan the bools to get indices
+            scan(n, indices, bools);
+
+            // Scatter: place valid elements in correct position
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (bools[i] == 1) {
+                    odata[indices[i]] = idata[i];
+                    count++;
+                }
+            }
+
+            delete[] bools;
+            delete[] indices;
+
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..109fc12 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,15 +12,73 @@ namespace StreamCompaction {
             return timer;
         }
 
+
+        __global__ void upsweep(int* data, int twod, int twod1, int n) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            int k = index * twod1;
+            if (k + twod1 - 1 < n) {
+                data[k + twod1 - 1] += data[k + twod - 1];
+            }
+        }
+
+
+        __global__ void downsweep(int* data, int twod, int twod1, int n) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            int k = index * twod1;
+            if (k + twod1 - 1 < n) {
+                int t = data[k + twod - 1];
+                data[k + twod - 1] = data[k + twod1 - 1];
+                data[k + twod1 - 1] += t;
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+
+         // Internal version that controls timing
+        void scan_internal(int n, int* odata, const int* idata, bool timing) {
+            if (timing) timer().startGpuTimer();
+
+            int pow2Len = 1 << ilog2ceil(n);
+            int* dev_data;
+            cudaMalloc(&dev_data, pow2Len * sizeof(int));
+            cudaMemset(dev_data, 0, pow2Len * sizeof(int));
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int blockSize = 512;
+            for (int d = 1; d <= ilog2ceil(pow2Len); ++d) {
+                int twod = 1 << (d - 1);
+                int twod1 = 1 << d;
+                int numThreads = pow2Len / twod1;
+                int numBlocks = (numThreads + blockSize - 1) / blockSize;
+                upsweep << <numBlocks, blockSize >> > (dev_data, twod, twod1, pow2Len);
+                cudaDeviceSynchronize();
+            }
+
+            cudaMemset(dev_data + pow2Len - 1, 0, sizeof(int)); // zero for exclusive
+
+            for (int d = ilog2ceil(pow2Len); d >= 1; --d) {
+                int twod = 1 << (d - 1);
+                int twod1 = 1 << d;
+                int numThreads = pow2Len / twod1;
+                int numBlocks = (numThreads + blockSize - 1) / blockSize;
+                downsweep << <numBlocks, blockSize >> > (dev_data, twod, twod1, pow2Len);
+                cudaDeviceSynchronize();
+            }
+
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_data);
+
+            if (timing) timer().endGpuTimer();
+        }
+
+        void scan(int n, int* odata, const int* idata) {
+            scan_internal(n, odata, idata, true);
         }
 
+
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -30,11 +88,52 @@ namespace StreamCompaction {
          * @param idata  The array of elements to compact.
          * @returns      The number of elements remaining after compaction.
          */
-        int compact(int n, int *odata, const int *idata) {
+        int compact(int n, int* odata, const int* idata) {
+            int* dev_idata, * dev_bools, * dev_indices, * dev_odata;
+
+            int pow2n = 1 << ilog2ceil(n);
+            cudaMalloc(&dev_idata, pow2n * sizeof(int));
+            cudaMalloc(&dev_bools, pow2n * sizeof(int));
+            cudaMalloc(&dev_indices, pow2n * sizeof(int));
+            cudaMalloc(&dev_odata, pow2n * sizeof(int));
+            cudaMemset(dev_odata, 0, pow2n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+
+            int blockSize = 128;
+            int numBlocks = (n + blockSize - 1) / blockSize;
+
+            StreamCompaction::Common::kernMapToBoolean<<<numBlocks, blockSize >>>(
+                n, dev_bools, dev_idata);
+            cudaDeviceSynchronize();
+
+            scan_internal(n, dev_indices, dev_bools, false);
+            cudaDeviceSynchronize();
+
+            StreamCompaction::Common::kernScatter<<<numBlocks, blockSize >>> (
+                n, dev_odata, dev_idata, dev_bools, dev_indices);
+            cudaDeviceSynchronize();
+
             timer().endGpuTimer();
-            return -1;
+
+            int count;
+            cudaMemcpy(&count, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            int lastBool;
+            cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            count += lastBool;
+
+            cudaMemcpy(odata, dev_odata, count * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+
+            return count;
         }
+
     }
-}
+}
\ No newline at end of file
diff --git a/stream_compaction/efficient_share.cu b/stream_compaction/efficient_share.cu
new file mode 100644
index 0000000..9721ba6
--- /dev/null
+++ b/stream_compaction/efficient_share.cu
@@ -0,0 +1,304 @@
+#include "common.h"
+#include "efficient_share.h"
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+
+#define blockSize 512
+#define itemPerBlock 1024
+#define CONFLICT_FREE_OFFSET(n) ((n) >> 5)
+
+namespace StreamCompaction {
+    namespace EfficientShare {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        __global__ void kernScan(int n, int* odata, int* sum)
+        {
+            extern __shared__ int s_odata[];
+            int bid = blockIdx.x;
+            int tid = threadIdx.x;
+            int blockOffset = bid * n;
+
+            // copy to shared buffer
+            int ai = tid;
+            int bi = tid + (n >> 1);
+            int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+            int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+            s_odata[ai + bankOffsetA] = odata[blockOffset + ai];
+            s_odata[bi + bankOffsetB] = odata[blockOffset + bi];
+
+            int offset = 1;
+
+            //s_odata[2 * tid] = odata[blockOffset + 2 * tid];
+            //s_odata[2 * tid + 1] = odata[blockOffset + 2 * tid + 1];
+
+            // up sweep
+            #pragma unroll
+            for (int d = itemPerBlock >> 1; d > 0; d >>= 1)
+            {
+                __syncthreads();
+                if (tid < d)
+                {
+                    int ai = offset * (2 * tid + 1) - 1;
+                    int bi = ai + offset;
+                    ai += CONFLICT_FREE_OFFSET(ai);
+                    bi += CONFLICT_FREE_OFFSET(bi);
+
+                    s_odata[bi] += s_odata[ai];
+                    offset <<= 1;
+                }
+            }
+
+            // set tail to zero
+            if (tid == 0)
+            {
+                sum[bid] = s_odata[n - 1 + CONFLICT_FREE_OFFSET(n - 1)];
+                s_odata[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0;
+            }
+
+            // down sweep
+            #pragma unroll
+            for (int d = 1; d < itemPerBlock; d <<= 1)
+            {
+                __syncthreads();
+                if (tid < d)
+                {
+                    offset >>= 1;
+                    int ai = offset * (2 * tid + 1) - 1;
+                    int bi = ai + offset;
+                    ai += CONFLICT_FREE_OFFSET(ai);
+                    bi += CONFLICT_FREE_OFFSET(bi);
+
+                    int t = s_odata[ai];
+                    s_odata[ai] = s_odata[bi];
+                    s_odata[bi] += t;
+                }
+            }
+
+            // write back
+            __syncthreads();
+            odata[blockOffset + ai] = s_odata[ai + bankOffsetA];
+            odata[blockOffset + bi] = s_odata[bi + bankOffsetB];
+            //odata[blockOffset + 2 * tid] = s_odata[2 * tid];
+            //odata[blockOffset + 2 * tid + 1] = s_odata[2 * tid + 1];
+        }
+
+        __global__ void kernScanSmall(int n, int* odata)
+        {
+            extern __shared__ int s_odata[];
+            int tid = threadIdx.x;
+
+            // copy to shared buffer
+            int ai = tid;
+            int bi = tid + (n >> 1);
+            int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+            int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+            if (tid < n)
+            {
+                s_odata[ai + bankOffsetA] = odata[ai];
+                s_odata[bi + bankOffsetB] = odata[bi];
+            }
+            else
+            {
+                s_odata[ai + bankOffsetA] = 0;
+                s_odata[bi + bankOffsetB] = 0;
+            }
+            
+
+            int offset = 1;
+
+            // up sweep
+            for (int d = n >> 1; d > 0; d >>= 1)
+            {
+                __syncthreads();
+                if (tid < d)
+                {
+                    int ai = offset * (2 * tid + 1) - 1;
+                    int bi = ai + offset;
+                    ai += CONFLICT_FREE_OFFSET(ai);
+                    bi += CONFLICT_FREE_OFFSET(bi);
+
+                    s_odata[bi] += s_odata[ai];
+                    offset <<= 1;
+                }
+            }
+
+            // set tail to zero
+            if (tid == 0)
+            {
+                s_odata[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0;
+            }
+
+            // down sweep
+            for (int d = 1; d < n; d <<= 1)
+            {
+                __syncthreads();
+                if (tid < d)
+                {
+                    offset >>= 1;
+                    int ai = offset * (2 * tid + 1) - 1;
+                    int bi = ai + offset;
+                    ai += CONFLICT_FREE_OFFSET(ai);
+                    bi += CONFLICT_FREE_OFFSET(bi);
+
+                    int t = s_odata[ai];
+                    s_odata[ai] = s_odata[bi];
+                    s_odata[bi] += t;
+                }
+            }
+
+            // write back
+            __syncthreads();
+            if (tid < n)
+            {
+                odata[ai] = s_odata[ai + bankOffsetA];
+                odata[bi] = s_odata[bi + bankOffsetB];
+            }
+            
+        }
+
+        __global__ void kernAdd(int n, int* odata, const int* incr)
+        {
+            int bid = blockIdx.x;
+            int tid = threadIdx.x;
+            int blockOffset = bid * n + tid;
+
+            int stride = n >> 2;
+            int base1 = incr[bid];
+
+            bid += gridDim.x;
+            int base2 = incr[bid];
+
+            odata[blockOffset] += base1;
+            odata[blockOffset + 1 * stride] += base1;
+            odata[blockOffset + 2 * stride] += base1;
+            odata[blockOffset + 3 * stride] += base1;
+
+            blockOffset = bid * n + tid;
+            
+            odata[blockOffset] += base2;
+            odata[blockOffset + 1 * stride] += base2;
+            odata[blockOffset + 2 * stride] += base2;
+            odata[blockOffset + 3 * stride] += base2;
+        }
+
+        // assume input is already padded
+        void scan_dev(int n, int* dev_odata)
+        {
+            int blockNum = (n + itemPerBlock - 1) / itemPerBlock;
+            int pot = nextPowerOfTwo(blockNum);
+
+            timer().pauseGpuTimer();
+
+            int* dev_sum;
+            cudaMalloc((void**)&dev_sum, pot * sizeof(int));
+            cudaMemset(dev_sum, 0, pot * sizeof(int));
+
+            timer().continueGpuTimer();
+            
+            kernScan << < blockNum, blockSize, (itemPerBlock + 10) * sizeof(int) >> > (itemPerBlock, dev_odata, dev_sum);
+
+            if (blockNum <= itemPerBlock)
+            {
+                
+                kernScanSmall << < 1, (pot >> 1), pot * sizeof(int) >> > (pot, dev_sum);
+            }
+            else
+            {
+                scan_dev(blockNum, dev_sum);
+            }
+
+            if (blockNum > 1)
+                kernAdd << < (blockNum >> 1), (itemPerBlock >> 2) >> > (itemPerBlock, dev_odata, dev_sum);
+
+            timer().pauseGpuTimer();
+            cudaFree(dev_sum);
+            timer().continueGpuTimer();
+
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            int blockNum = (n + itemPerBlock - 1) / itemPerBlock;
+            int paddedNum = blockNum * itemPerBlock;
+
+            int* dev_obuffer;
+            cudaMalloc((void**)&dev_obuffer, paddedNum * sizeof(int));
+            cudaMemset(dev_obuffer + n, 0, (paddedNum - n) * sizeof(int));
+            cudaMemcpy(dev_obuffer, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            nvtxRangePushA("Efficient Share");
+            timer().startGpuTimer();
+
+            scan_dev(paddedNum, dev_obuffer);
+
+            timer().endGpuTimer();
+            nvtxRangePop();
+
+            cudaMemcpy(odata, dev_obuffer, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            cudaFree(dev_obuffer);
+        }
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n      The number of elements in idata.
+         * @param odata  The array into which to store elements.
+         * @param idata  The array of elements to compact.
+         * @returns      The number of elements remaining after compaction.
+         */
+        int compact(int n, int* odata, const int* idata) {
+
+            int blockNum = (n + itemPerBlock - 1) / itemPerBlock;
+            int paddedNum = blockNum * itemPerBlock;
+
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, paddedNum * sizeof(int));
+            cudaMemset(dev_idata, 0, paddedNum * sizeof(int));
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            int* dev_bools;
+            cudaMalloc((void**)&dev_bools, paddedNum * sizeof(int));
+            cudaMemset(dev_bools, 0, paddedNum * sizeof(int));
+
+            int* dev_odata;
+            cudaMalloc((void**)&dev_odata, paddedNum * sizeof(int));
+            cudaMemset(dev_odata, 0, paddedNum * sizeof(int));
+
+            timer().startGpuTimer();
+
+            // map to bool
+            int gBlockNum = (paddedNum + blockSize - 1) / blockSize;
+            Common::kernMapToBoolean << < gBlockNum, blockSize >> > (paddedNum, dev_bools, dev_idata);
+
+            // scan
+            scan_dev(paddedNum, dev_bools);
+
+            // scatter
+            Common::kernScatter << < gBlockNum, blockSize >> > (paddedNum, dev_odata, dev_idata, nullptr, dev_bools);
+
+            // copy len back to host
+            int len = 0;
+            cudaMemcpy(&len, dev_bools + paddedNum - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, sizeof(int) * len, cudaMemcpyDeviceToHost);
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
+            cudaFree(dev_bools);
+
+
+            return len;
+        }
+    }
+}
\ No newline at end of file
diff --git a/stream_compaction/efficient_share.h b/stream_compaction/efficient_share.h
new file mode 100644
index 0000000..a1a2e96
--- /dev/null
+++ b/stream_compaction/efficient_share.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace EfficientShare {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int* odata, const int* idata);
+
+        void scan_dev(int n, int* dev_odata);
+
+        int compact(int n, int* odata, const int* idata);
+    }
+}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..6cfa412 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#include <iostream>
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +13,53 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
 
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
+        //naive inclusive scan
+        __global__ void naiveScanStep(int n, int d, const int* input, int* output) {
+            int k = threadIdx.x + blockIdx.x * blockDim.x;
+            if (k >= n) return;
+
+            if (k >= (1 << (d - 1))) {
+                output[k] = input[k - (1 << (d - 1))] + input[k];
+            }
+            else {
+                output[k] = input[k];
+            }
+        }
+
+        void scan(int n, int* odata, const int* idata) {
+
+            int* dev_ping;
+            int* dev_pong;
+
+            cudaMalloc(&dev_ping, n * sizeof(int));
+            cudaMalloc(&dev_pong, n * sizeof(int));
+
+            cudaMemcpy(dev_ping, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int blockSize = 1024;
+            int numBlocks = (n + blockSize - 1) / blockSize;
+
+            int depth = ilog2ceil(n);
+
             timer().startGpuTimer();
-            // TODO
+            for (int d = 1; d <= depth; d++) {
+                naiveScanStep << <numBlocks, blockSize >> > (n, d, dev_ping, dev_pong);
+
+                // Swap buffers
+                std::swap(dev_ping, dev_pong);
+            }
+            cudaDeviceSynchronize();
             timer().endGpuTimer();
+
+            // dev_ping now has the inclusive scan result
+            // Convert to exclusive scan
+            cudaMemcpy(odata + 1, dev_ping, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+            odata[0] = 0;
+
+            cudaFree(dev_ping);
+            cudaFree(dev_pong);
+
         }
     }
 }
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 0000000..7355cee
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,73 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "radix.h"
+#include "common.h"
+#include "efficient.h" // For scan()
+#include <iostream>
+
+namespace StreamCompaction {
+    namespace Radix {
+
+        __global__ void kernExtractBit(int n, int bit, int* bitArray, const int* idata) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i < n) {
+                bitArray[i] = (idata[i] >> bit) & 1;
+            }
+        }
+
+        __global__ void kernInvert(int n, int* out, const int* in) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i < n) {
+                out[i] = 1 - in[i];
+            }
+        }
+
+        __global__ void kernScatterBit(int n, int* odata, const int* idata,
+            const int* bitArray, const int* falseScan, int totalFalse) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i < n) {
+                int dest = bitArray[i] == 0 ? falseScan[i] : totalFalse + i - falseScan[i];
+                odata[dest] = idata[i];
+            }
+        }
+
+        void sort(int n, int* odata, const int* idata) {
+            int* dev_in, * dev_out, * dev_bits, * dev_notBits, * dev_scan;
+            int blockSize = 128;
+            int numBlocks = (n + blockSize - 1) / blockSize;
+
+            cudaMalloc(&dev_in, n * sizeof(int));
+            cudaMalloc(&dev_out, n * sizeof(int));
+            cudaMalloc(&dev_bits, n * sizeof(int));
+            cudaMalloc(&dev_notBits, n * sizeof(int));
+            cudaMalloc(&dev_scan, n * sizeof(int));
+
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            for (int bit = 0; bit < 32; ++bit) {
+                kernExtractBit << <numBlocks, blockSize >> > (n, bit, dev_bits, dev_in);
+                kernInvert << <numBlocks, blockSize >> > (n, dev_notBits, dev_bits);
+                StreamCompaction::Efficient::scan(n, dev_scan, dev_notBits);
+
+                int totalFalse;
+                int lastBool;
+                cudaMemcpy(&totalFalse, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                cudaMemcpy(&lastBool, dev_notBits + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                totalFalse += lastBool;
+
+                kernScatterBit << <numBlocks, blockSize >> > (n, dev_out, dev_in,
+                    dev_bits, dev_scan, totalFalse);
+
+                std::swap(dev_in, dev_out);  // next iteration reads from new sorted array
+            }
+
+            cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+            cudaFree(dev_bits);
+            cudaFree(dev_notBits);
+            cudaFree(dev_scan);
+        }
+    }
+}
diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h
new file mode 100644
index 0000000..802fa51
--- /dev/null
+++ b/stream_compaction/radix.h
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace StreamCompaction {
+    namespace Radix {
+        void sort(int n, int* odata, const int* idata);
+    }
+}
diff --git a/stream_compaction/sharedefficientmem.cu b/stream_compaction/sharedefficientmem.cu
new file mode 100644
index 0000000..9b9e775
--- /dev/null
+++ b/stream_compaction/sharedefficientmem.cu
@@ -0,0 +1,156 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include "common.h"
+#include "sharedefficientmem.h"
+
+
+
+namespace StreamCompaction {
+    namespace SharedEfficientMem {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        __global__ void scanSharedEfficientKernel(int* odata, const int* idata, int n) {
+            extern __shared__ int temp[]; // allocated on invocation
+            int thid = threadIdx.x;
+            int offset = 1;
+
+            int ai = thid;
+            int bi = thid + (n / 2);
+
+            // Load input into shared memory
+            temp[ai] = idata[ai];
+            temp[bi] = idata[bi];
+
+            // Up-Sweep (Reduce) phase
+            for (int d = n >> 1; d > 0; d >>= 1) {
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    temp[bi] += temp[ai];
+                }
+                offset *= 2;
+            }
+
+            // Clear the last element
+            if (thid == 0) {
+                temp[n - 1] = 0;
+            }
+
+            // Down-Sweep phase
+            for (int d = 1; d < n; d *= 2) {
+                offset >>= 1;
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+
+                    int t = temp[ai];
+                    temp[ai] = temp[bi];
+                    temp[bi] += t;
+                }
+            }
+            __syncthreads();
+
+            // Write results to device memory
+            odata[ai] = temp[ai];
+            odata[bi] = temp[bi];
+        }
+
+        void scanSharedEfficient(int n, int* odata, const int* idata) {
+            if (n > 1024 || (n & (n - 1)) != 0) {
+                std::cerr << "[ERROR] scanSharedEfficient only supports up to 1024 elements and power-of-two sizes." << std::endl;
+                return;
+            }
+
+            int* dev_idata = nullptr;
+            int* dev_odata = nullptr;
+
+            cudaMalloc(&dev_idata, n * sizeof(int));
+            cudaMalloc(&dev_odata, n * sizeof(int));
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int blockSize = n / 2;
+            int sharedMemSize = n * sizeof(int);
+
+            timer().startGpuTimer();
+            scanSharedEfficientKernel<<<1, blockSize, sharedMemSize>>>(dev_odata, dev_idata, n);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+        }
+
+
+
+        __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i >= n) return;
+            bools[i] = (idata[i] != 0) ? 1 : 0;
+        }
+
+        __global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i >= n) return;
+            if (bools[i] == 1) {
+                odata[indices[i]] = idata[i];
+            }
+        }
+
+        int compactEfficient(int n, int* odata, const int* idata) {
+            if (n > 1024 || (n & (n - 1)) != 0) {
+                std::cerr << "[ERROR] compactEfficient only supports up to 1024 power-of-two elements." << std::endl;
+                return 0;
+            }
+
+            int* dev_idata, * dev_bools, * dev_indices, * dev_odata;
+            cudaMalloc(&dev_idata, n * sizeof(int));
+            cudaMalloc(&dev_bools, n * sizeof(int));
+            cudaMalloc(&dev_indices, n * sizeof(int));
+            cudaMalloc(&dev_odata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int blockSize = 1024;
+            int numBlocks = (n + blockSize - 1) / blockSize;
+
+            timer().startGpuTimer();
+
+            // 1. Map to boolean
+            kernMapToBoolean<<<numBlocks, blockSize>>>(n, dev_bools, dev_idata);
+            cudaDeviceSynchronize();
+
+            // 2. Scan boolean array
+            scanSharedEfficient(n, dev_indices, dev_bools);
+            cudaDeviceSynchronize();
+
+            // 3. Scatter
+            kernScatter<<<numBlocks, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+            cudaDeviceSynchronize();
+
+            timer().endGpuTimer();
+
+            int lastBool, lastIndex;
+            cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int total = lastBool + lastIndex;
+
+            cudaMemcpy(odata, dev_odata, total * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+
+            return total;
+        }
+    }
+}
diff --git a/stream_compaction/sharedefficientmem.h b/stream_compaction/sharedefficientmem.h
new file mode 100644
index 0000000..41856b2
--- /dev/null
+++ b/stream_compaction/sharedefficientmem.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace SharedEfficientMem {
+
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scanSharedEfficient(int n, int* odata, const int* idata);
+
+        int compactEfficient(int n, int* odata, const int* idata);
+    }
+}
diff --git a/stream_compaction/sharednaivemem.cu b/stream_compaction/sharednaivemem.cu
new file mode 100644
index 0000000..d1c4198
--- /dev/null
+++ b/stream_compaction/sharednaivemem.cu
@@ -0,0 +1,138 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include "common.h"
+#include "sharednaivemem.h"
+
+namespace StreamCompaction {
+    namespace SharedNaiveMem {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        __global__ void scanSharedNaiveKernel(int* odata, const int* idata, int n) {
+            extern __shared__ int temp[];  // size = 2 * blockDim.x * sizeof(int)
+            int tid = threadIdx.x;
+            int pout = 0, pin = 1;
+
+            if (tid < n) {
+                // exclusive scan: shift input right and insert 0 at beginning
+                temp[pout * n + tid] = (tid > 0) ? idata[tid - 1] : 0;
+            }
+            __syncthreads();
+
+            for (int offset = 1; offset < n; offset *= 2) {
+                pout = 1 - pout;  // swap ping-pong buffers
+                pin = 1 - pout;
+
+                if (tid < n) {
+                    if (tid >= offset)
+                        temp[pout * n + tid] = temp[pin * n + tid - offset] + temp[pin * n + tid];
+                    else
+                        temp[pout * n + tid] = temp[pin * n + tid];
+                }
+                __syncthreads();
+            }
+
+            if (tid < n) {
+                odata[tid] = temp[pout * n + tid];
+            }
+        }
+
+
+        __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i >= n) return;
+            bools[i] = (idata[i] != 0) ? 1 : 0;
+        }
+
+        __global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i >= n) return;
+            if (bools[i] == 1) {
+                odata[indices[i]] = idata[i];
+            }
+        }
+
+
+        void scanSharedNaive(int n, int* odata, const int* idata) {
+            if (n > 1024) {
+                std::cerr << "[ERROR] scanSharedNaive only supports up to 1024 elements (1 block)." << std::endl;
+                return;
+            }
+
+            int* dev_idata = nullptr;
+            int* dev_odata = nullptr;
+
+            cudaMalloc(&dev_idata, n * sizeof(int));
+            cudaMalloc(&dev_odata, n * sizeof(int));
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int blockSize = n;
+            int sharedMemSize = 2 * blockSize * sizeof(int);  // double buffer
+
+            timer().startGpuTimer();
+            scanSharedNaiveKernel<<<1, blockSize, sharedMemSize>>>(dev_odata, dev_idata, n);
+            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+        }
+
+        int compactNaive(int n, int* odata, const int* idata) {
+            if (n > 1024) {
+                std::cerr << "[ERROR] compactNaive only supports up to 1024 elements (1 block)." << std::endl;
+                return 0;
+            }
+
+            int* dev_idata, * dev_bools, * dev_indices, * dev_odata;
+            cudaMalloc(&dev_idata, n * sizeof(int));
+            cudaMalloc(&dev_bools, n * sizeof(int));
+            cudaMalloc(&dev_indices, n * sizeof(int));
+            cudaMalloc(&dev_odata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int blockSize = 1024;
+            int numBlocks = (n + blockSize - 1) / blockSize;
+
+
+            timer().startGpuTimer();
+
+            // 1. Map to boolean
+            kernMapToBoolean<<<numBlocks, blockSize>>>(n, dev_bools, dev_idata);
+            cudaDeviceSynchronize();
+
+            // 2. Scan boolean array
+            scanSharedNaive(n, dev_indices, dev_bools);
+            cudaDeviceSynchronize();
+
+            // 3. Scatter
+            kernScatter<<<numBlocks, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+            cudaDeviceSynchronize();
+
+            timer().endGpuTimer();
+
+            // Copy back
+            int lastBool = 0, lastIndex = 0;
+            cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int total = lastBool + lastIndex;
+
+            cudaMemcpy(odata, dev_odata, total * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+
+            return total;
+        }
+
+
+    } // namespace SharedNaiveMem
+} // namespace StreamCompaction
diff --git a/stream_compaction/sharednaivemem.h b/stream_compaction/sharednaivemem.h
new file mode 100644
index 0000000..b4b9b9b
--- /dev/null
+++ b/stream_compaction/sharednaivemem.h
@@ -0,0 +1,13 @@
+#pragma once
+
+namespace StreamCompaction {
+    namespace SharedNaiveMem {
+
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scanSharedNaive(int n, int* odata, const int* idata);
+
+        int compactNaive(int n, int* odata, const int* idata);
+
+    }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..b221ee7 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -17,12 +17,22 @@ namespace StreamCompaction {
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+        void scan(int n, int* odata, const int* idata) {
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+            // Copy input from raw pointer to Thrust device_vector
+            thrust::device_vector<int> d_input(idata, idata + n);
+            thrust::device_vector<int> d_output(n);
+
+            timer().startGpuTimer();
+            thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
             timer().endGpuTimer();
+
+            // Copy result back to output pointer
+            thrust::copy(d_output.begin(), d_output.end(), odata);
         }
     }
+
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index fe98206..0a957d8 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -6,6 +6,6 @@ namespace StreamCompaction {
     namespace Thrust {
         StreamCompaction::Common::PerformanceTimer& timer();
 
-        void scan(int n, int *odata, const int *idata);
+        void scan(int n, int* odata, const int* idata);
     }
 }
diff --git a/visualization/best_bs_effScan.py b/visualization/best_bs_effScan.py
new file mode 100644
index 0000000..fc793df
--- /dev/null
+++ b/visualization/best_bs_effScan.py
@@ -0,0 +1,23 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+df = pd.read_csv("scan_size_results.csv")  # Make sure this filename matches yours
+
+x_labels = [f'$2^{{{int(np.log2(n))}}}$' for n in df['N']]
+
+
+plt.figure(figsize=(12, 6))
+plt.bar(x_labels[1:8], df['Efficient(ms)'][1
+                                           :8], color='green')
+
+
+plt.xlabel('Input Size (N)')
+plt.ylabel('Efficient GPU Scan Time (ms)')
+plt.title('Efficient GPU Scan Runtime Across Input Sizes (Best Observed)')
+plt.xticks(rotation=45)
+plt.grid(axis='y', linestyle='--', linewidth=0.5)
+plt.tight_layout()
+
+plt.savefig('efficient_scan_bar_plot.png')
+plt.show()
diff --git a/visualization/compact_plot.py b/visualization/compact_plot.py
new file mode 100644
index 0000000..a6dceba
--- /dev/null
+++ b/visualization/compact_plot.py
@@ -0,0 +1,26 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+
+
+df = pd.read_csv("compact_size_results.csv")
+
+plt.figure(figsize=(10, 6))
+plt.yscale('log')
+
+plt.plot(df['N'], df['CPU_NoScan(ms)'], label='CPU', marker='o')
+plt.plot(df['N'], df['CPU_WithScan(ms)'], label='CPU with Scan', marker='o')
+plt.plot(df['N'], df['GPU_Efficient(ms)'], label='Efficient GPU Scan', marker='o')
+
+
+plt.xscale('log', base=2)
+plt.xticks(df['N'], labels=[f'2^{int(np.log2(n))}' for n in df['N']], rotation=45)
+plt.xlabel('Input Size (N)')
+plt.ylabel('Elapsed Time (ms)')
+plt.title('Compact Elapsed Time vs Input Size (Block Size = 256)')
+plt.grid(True, which="both", linestyle='--', linewidth=0.5)
+plt.legend()
+plt.tight_layout()
+
+plt.savefig('compact_size_plot.png')
+plt.show()
diff --git a/visualization/compact_size_plot.png b/visualization/compact_size_plot.png
new file mode 100644
index 0000000..d9f9b54
Binary files /dev/null and b/visualization/compact_size_plot.png differ
diff --git a/visualization/compact_size_results.csv b/visualization/compact_size_results.csv
new file mode 100644
index 0000000..ba1804b
--- /dev/null
+++ b/visualization/compact_size_results.csv
@@ -0,0 +1,2 @@
+N,CPU_NoScan(ms),CPU_WithScan(ms),GPU_Efficient(ms),placeholder
+1048576,1.8459,1.8084,1.89664,0
diff --git a/visualization/efficient_scan_bar_plot.png b/visualization/efficient_scan_bar_plot.png
new file mode 100644
index 0000000..96fe821
Binary files /dev/null and b/visualization/efficient_scan_bar_plot.png differ
diff --git a/visualization/scan_plot.py b/visualization/scan_plot.py
new file mode 100644
index 0000000..cd63b26
--- /dev/null
+++ b/visualization/scan_plot.py
@@ -0,0 +1,27 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+
+df = pd.read_csv("scan_size_results.csv")
+
+
+plt.figure(figsize=(10, 6))
+plt.yscale('log')
+
+plt.plot(df['N'], df['CPU(ms)'], label='CPU Scan', marker='o')
+plt.plot(df['N'], df['Naive(ms)'], label='Naive GPU Scan', marker='o')
+plt.plot(df['N'], df['Efficient(ms)'], label='Efficient GPU Scan', marker='o')
+plt.plot(df['N'], df['Thrust(ms)'], label='Thrust Scan', marker='o')
+
+
+plt.xscale('log', base=2)
+plt.xticks(df['N'], labels=[f'2^{int(np.log2(n))}' for n in df['N']], rotation=45)
+plt.xlabel('Input Size (N)')
+plt.ylabel('Elapsed Time (ms)')
+plt.title('Scan Elapsed Time vs Input Size (Block Size = 256)')
+plt.grid(True, which="both", linestyle='--', linewidth=0.5)
+plt.legend()
+plt.tight_layout()
+
+plt.savefig('scan_size_plot.png')
+plt.show()
diff --git a/visualization/scan_size_plot_13_27.png b/visualization/scan_size_plot_13_27.png
new file mode 100644
index 0000000..ec5e48c
Binary files /dev/null and b/visualization/scan_size_plot_13_27.png differ
diff --git a/visualization/scan_size_results.csv b/visualization/scan_size_results.csv
new file mode 100644
index 0000000..62761ce
--- /dev/null
+++ b/visualization/scan_size_results.csv
@@ -0,0 +1,2 @@
+N,CPU(ms),Naive(ms),Efficient(ms),Thrust(ms)
+1048576,1.3921,0.996192,1.72554,0.310272