Commits
40 commits
728aaf1  Implement the new tuning API for DeviceReduce (bernhardmgruber, Nov 6, 2025)
7b08903  bra (bernhardmgruber, Nov 19, 2025)
9498ad3  cleanup (bernhardmgruber, Nov 19, 2025)
49adfdd  format (bernhardmgruber, Nov 19, 2025)
d607c97  printing (bernhardmgruber, Nov 19, 2025)
24d6f8f  OPERATORS (bernhardmgruber, Nov 19, 2025)
36a0f24  wtf (bernhardmgruber, Nov 19, 2025)
efc6c6b  format check (bernhardmgruber, Nov 19, 2025)
6813c5b  reviewer feedback (bernhardmgruber, Nov 19, 2025)
23e73a0  fix (bernhardmgruber, Nov 19, 2025)
8cf8edf  More operator<< (bernhardmgruber, Nov 19, 2025)
d9b3bd7  Fix policy check (bernhardmgruber, Nov 19, 2025)
5416079  arch_id (bernhardmgruber, Nov 19, 2025)
5a5d65f  use correct memory resource (bernhardmgruber, Nov 19, 2025)
5245838  name (bernhardmgruber, Nov 19, 2025)
0c95529  fix archid (bernhardmgruber, Nov 19, 2025)
f4b9345  refactoring (bernhardmgruber, Nov 19, 2025)
720ef57  docs (bernhardmgruber, Nov 19, 2025)
0d4d9e4  fix (bernhardmgruber, Nov 19, 2025)
40be1e1  some reverts (bernhardmgruber, Nov 19, 2025)
a79ea88  arch_id for nondet reduce (bernhardmgruber, Nov 19, 2025)
d7aff8a  refactor tuning (bernhardmgruber, Nov 19, 2025)
c3a1b08  fix (bernhardmgruber, Nov 19, 2025)
83770f9  launch bounds templ param (bernhardmgruber, Nov 19, 2025)
de7a34a  fix concepts (bernhardmgruber, Nov 19, 2025)
81ba358  Revert to C++17 (bernhardmgruber, Nov 19, 2025)
b6ab425  cudaError_t (bernhardmgruber, Nov 19, 2025)
dd5d56f  needless cast (bernhardmgruber, Nov 19, 2025)
b057a19  Revert "launch bounds templ param" (bernhardmgruber, Nov 19, 2025)
6bb61c6  CI fixes (bernhardmgruber, Nov 19, 2025)
785739f  please GCC (bernhardmgruber, Nov 19, 2025)
b2302fe  Fix segmented_reduce (bernhardmgruber, Nov 19, 2025)
0df6910  Add cub::detail::ptx_arch_id (bernhardmgruber, Nov 20, 2025)
f9ce2e2  Fix transform_reduce (bernhardmgruber, Nov 20, 2025)
3d826be  drop comment (bernhardmgruber, Nov 20, 2025)
f06dea4  Fix warning: may be used uninitlaized (bernhardmgruber, Nov 20, 2025)
51e8cea  port transform_reduce benchmark as well (bernhardmgruber, Nov 20, 2025)
49c1d59  fix (bernhardmgruber, Nov 20, 2025)
1171bb7  fix CI (bernhardmgruber, Nov 20, 2025)
8309270  Fix concept checks (bernhardmgruber, Nov 20, 2025)
132 changes: 61 additions & 71 deletions c/parallel/src/reduce.cu
@@ -10,9 +10,7 @@

#include <cub/detail/choose_offset.cuh>
#include <cub/detail/launcher/cuda_driver.cuh>
#include <cub/detail/ptx-json-parser.h>
#include <cub/device/device_reduce.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

#include <cuda/std/algorithm>
@@ -44,29 +42,6 @@ static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "O

namespace reduce
{
struct reduce_runtime_tuning_policy
{
cub::detail::RuntimeReduceAgentPolicy single_tile;
cub::detail::RuntimeReduceAgentPolicy reduce;

auto SingleTile() const
{
return single_tile;
}
auto Reduce() const
{
return reduce;
}

using MaxPolicy = reduce_runtime_tuning_policy;

template <typename F>
cudaError_t Invoke(int, F& op)
{
return op.template Invoke<reduce_runtime_tuning_policy>(*this);
}
};

static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init)
{
// TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet
Expand Down Expand Up @@ -179,7 +154,6 @@ CUresult cccl_device_reduce_build_ex(
{
const char* name = "device_reduce";

const int cc = cc_major * 10 + cc_minor;
const cccl_type_info accum_t = reduce::get_accumulator_type(op, input_it, init);
const auto accum_cpp = cccl_type_enum_to_name(accum_t.type);

@@ -193,7 +167,43 @@

const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);

auto policy_hub_expr = std::format("cub::detail::reduce::policy_hub<{}, {}, {}>", accum_cpp, offset_t, op_name);
const auto cub_arch_policies = [&] {
using namespace cub::detail::reduce;

auto accum_type = accum_type::other;
if (accum_t.type == CCCL_FLOAT32)
{
accum_type = accum_type::float32;
}
else if (accum_t.type == CCCL_FLOAT64)
{
accum_type = accum_type::double32;
}

auto operation_t = op_type::unknown;
switch (op.type)
{
case CCCL_PLUS:
operation_t = op_type::plus;
break;
case CCCL_MINIMUM:
case CCCL_MAXIMUM:
operation_t = op_type::min_or_max;
break;
default:
break;
}

const int offset_size = int{sizeof(OffsetT)};
return arch_policies{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
}();

// TODO(bgruber): drop this if tuning policies become formattable
std::stringstream cub_arch_policies_str;
cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));

auto policy_hub_expr =
std::format("cub::detail::reduce::arch_policies_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);

std::string final_src = std::format(
R"XXX(
Expand All @@ -206,21 +216,19 @@ struct __align__({2}) storage_t {{
{3}
{4}
{5}
using device_reduce_policy = {6}::MaxPolicy;

#include <cub/detail/ptx-json/json.h>
__device__ consteval auto& policy_generator() {{
return ptx_json::id<ptx_json::string("device_reduce_policy")>()
= cub::detail::reduce::ReducePolicyWrapper<device_reduce_policy::ActivePolicy>::EncodedPolicy();
}};
using device_reduce_policy = {6};
using namespace cub;
using namespace cub::detail::reduce;
static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7}, "Host generated and JIT compiled policy mismatch");
)XXX",
jit_template_header_contents, // 0
input_it.value_type.size, // 1
input_it.value_type.alignment, // 2
input_iterator_src, // 3
output_iterator_src, // 4
op_src, // 5
policy_hub_expr); // 6
policy_hub_expr, // 6
cub_arch_policies_str.view()); // 7

#if false // CCCL_DEBUGGING_SWITCH
fflush(stderr);
@@ -249,7 +257,6 @@ __device__ consteval auto& policy_generator() {{
"-rdc=true",
"-dlto",
"-DCUB_DISABLE_CDP",
"-DCUB_ENABLE_POLICY_PTX_JSON",
"-std=c++20"};

// Add user's extra flags if config is provided
@@ -286,18 +293,11 @@ __device__ consteval auto& policy_generator() {{
&build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name.c_str()));
check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name.c_str()));

nlohmann::json runtime_policy =
cub::detail::ptx_json::parse("device_reduce_policy", {result.data.get(), result.size});

using cub::detail::RuntimeReduceAgentPolicy;
auto reduce_policy = RuntimeReduceAgentPolicy::from_json(runtime_policy, "ReducePolicy");
auto st_policy = RuntimeReduceAgentPolicy::from_json(runtime_policy, "SingleTilePolicy");

build->cc = cc;
build->cc = cc_major * 10 + cc_minor;
build->cubin = (void*) result.data.release();
build->cubin_size = result.size;
build->accumulator_size = accum_t.size;
build->runtime_policy = new reduce::reduce_runtime_tuning_policy{st_policy, reduce_policy};
build->runtime_policy = new cub::detail::reduce::arch_policies{cub_arch_policies};
}
catch (const std::exception& exc)
{
@@ -330,30 +330,19 @@ CUresult cccl_device_reduce(
CUdevice cu_device;
check(cuCtxGetDevice(&cu_device));

auto exec_status = cub::DispatchReduce<
indirect_arg_t, // InputIteratorT
indirect_arg_t, // OutputIteratorT
::cuda::std::size_t, // OffsetT
indirect_arg_t, // ReductionOpT
indirect_arg_t, // InitT
void, // AccumT
::cuda::std::identity, // TransformOpT
reduce::reduce_runtime_tuning_policy, // PolicyHub
reduce::reduce_kernel_source, // KernelSource
cub::detail::CudaDriverLauncherFactory>:: // KernelLauncherFactory
Dispatch(
d_temp_storage,
*temp_storage_bytes,
d_in,
d_out,
num_items,
op,
init,
stream,
{},
{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc},
*reinterpret_cast<reduce::reduce_runtime_tuning_policy*>(build.runtime_policy));
auto exec_status = cub::detail::reduce::dispatch<void>(
d_temp_storage,
*temp_storage_bytes,
indirect_arg_t{d_in}, // could be indirect_iterator_t, but CUB does not need to increment it
indirect_arg_t{d_out}, // could be indirect_iterator_t, but CUB does not need to increment it
static_cast<OffsetT>(num_items),
indirect_arg_t{op},
indirect_arg_t{init},
stream,
::cuda::std::identity{},
*static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
reduce::reduce_kernel_source{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});

error = static_cast<CUresult>(exec_status);
}
@@ -383,8 +372,9 @@ CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* build_ptr
return CUDA_ERROR_INVALID_VALUE;
}

std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(build_ptr->cubin));
std::unique_ptr<char[]> policy(reinterpret_cast<char*>(build_ptr->runtime_policy));
using namespace cub::detail::reduce;
std::unique_ptr<char[]> cubin(static_cast<char*>(build_ptr->cubin));
std::unique_ptr<arch_policies> policy(static_cast<arch_policies*>(build_ptr->runtime_policy));
check(cuLibraryUnload(build_ptr->library));
}
catch (const std::exception& exc)
133 changes: 78 additions & 55 deletions cub/benchmarks/bench/reduce/base.cuh
@@ -5,88 +5,111 @@

#include <cub/device/device_reduce.cuh>

#ifndef TUNE_BASE
# define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
#endif
#include <cuda/__device/all_devices.h>
#include <cuda/__memory_resource/device_memory_pool.h>

#include <nvbench_helper.cuh>

#if !TUNE_BASE
template <typename AccumT, typename OffsetT>
struct policy_hub_t
struct arch_policies
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
{
static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK;
static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD;
static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;

using ReducePolicy =
cub::AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
cub::LOAD_DEFAULT>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

using MaxPolicy = policy_t;
const auto policy = cub::agent_reduce_policy{
TUNE_THREADS_PER_BLOCK,
TUNE_ITEMS_PER_THREAD,
1 << TUNE_ITEMS_PER_VEC_LOAD_POW2,
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
cub::LOAD_DEFAULT};
return {policy, policy, policy, policy};
}
};
#endif // !TUNE_BASE

template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using accum_t = T;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using output_t = T;
using init_t = T;
using dispatch_t = cub::DispatchReduce<
input_it_t,
output_it_t,
offset_t,
op_t,
init_t,
accum_t
#if !TUNE_BASE
,
::cuda::std::identity, // pass the default TransformOpT which due to policy_hub_t instantiation is not deduced
// automatically
policy_hub_t<accum_t, offset_t>
#endif // !TUNE_BASE
>;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using init_t = T;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
const auto elements = static_cast<offset_t>(state.get_int64("Elements{io}"));

thrust::device_vector<T> in = generate(elements);
thrust::device_vector<T> out(1);

input_it_t d_in = thrust::raw_pointer_cast(in.data());
output_it_t d_out = thrust::raw_pointer_cast(out.data());
auto d_in = thrust::raw_pointer_cast(in.data());
Review comment (Contributor):
Question: prior to this change d_in had the type const T*, but with this the const is gone. Could this affect performance?

Reply (Contributor Author):
I would say it shouldn't. But I have to do a benchmark or SASS test anyway, so we will see.
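For reference, a minimal sketch of the alternative discussed in this thread (not part of the PR; it merely restores the const qualification the benchmark used before this change):

// Hypothetical variant keeping the input pointer const-qualified; `in` is the
// thrust::device_vector<T> defined in the benchmark above.
const T* d_in = thrust::raw_pointer_cast(in.data());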

auto d_out = thrust::raw_pointer_cast(out.data());

// Enable throughput calculations and add "Size" column to results.
state.add_element_count(elements);
state.add_global_memory_reads<T>(elements, "Size");
state.add_global_memory_writes<T>(1);

// Allocate temporary storage:
// FIXME(bgruber): the previous implementation did target cub::DispatchReduce, and provided T as accumulator type.
// This is not realistic, since a user cannot override the accumulator type the same way at the public API. For
// example, reducing I8 over cuda::std::plus deduces accumulator type I32 at the public API, but the benchmark forces
// it to I8. This skews the MemBoundScaling, leading to 20% regression for the same tuning when the public API is
// called (with accum_t I32) over the benchmark (forced accum_t of I8). See also:
// https://github.com/NVIDIA/cccl/issues/6576
#if 0
auto mr = cuda::device_default_memory_pool(cuda::devices[0]);
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
auto env = ::cuda::std::execution::env{
::cuda::stream_ref{launch.get_stream().get_stream()},
::cuda::std::execution::prop{::cuda::mr::__get_memory_resource, mr}
# if !TUNE_BASE
,
::cuda::std::execution::prop{
::cuda::execution::__get_tuning_t,
::cuda::std::execution::env{
::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, arch_policies{}}}}
# endif
};
static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
(void) cub::DeviceReduce::Reduce(d_in, d_out, elements, op_t{}, init_t{}, env);
});
#endif

// So for now, we have to call into the dispatcher again to override the accumulator type:
auto transform_op = ::cuda::std::identity{};

std::size_t temp_size;
dispatch_t::Dispatch(
nullptr, temp_size, d_in, d_out, static_cast<offset_t>(elements), op_t{}, init_t{}, 0 /* stream */);
cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
nullptr,
temp_size,
d_in,
d_out,
elements,
op_t{},
init_t{},
0 /* stream */,
transform_op
#if !TUNE_BASE
,
arch_policies{}
#endif
);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
temp_storage, temp_size, d_in, d_out, static_cast<offset_t>(elements), op_t{}, init_t{}, launch.get_stream());
cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
temp_storage,
temp_size,
d_in,
d_out,
elements,
op_t{},
init_t{},
launch.get_stream(),
transform_op
#if !TUNE_BASE
,
arch_policies{}
#endif
);
});
}
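As background for the FIXME above about accumulator deduction, a minimal sketch (not part of the diff; it assumes op_t is a transparent functor such as cuda::std::plus<>) of why the public API ends up with a wider accumulator for small integer types, which is what the OverrideAccumT template argument works around in this benchmark:

#include <cstdint>

#include <cuda/std/functional>
#include <cuda/std/type_traits>

// At the public API the accumulator is deduced from the result of op(init, *d_in). For I8
// inputs with a transparent plus, the usual arithmetic conversions promote the result to
// int (I32), whereas the benchmark forces the accumulator back to T.
using deduced_accum_t = decltype(::cuda::std::plus<>{}(::std::int8_t{}, ::std::int8_t{}));
static_assert(::cuda::std::is_same_v<deduced_accum_t, int>, "I8 + I8 promotes to int");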
