Implement the new tuning API for DeviceReduce #6544
base: main
Changes from all commits
```diff
@@ -5,88 +5,111 @@
 #include <cub/device/device_reduce.cuh>

-#ifndef TUNE_BASE
-#  define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
-#endif
+#include <cuda/__device/all_devices.h>
+#include <cuda/__memory_resource/device_memory_pool.h>

 #include <nvbench_helper.cuh>

 #if !TUNE_BASE
-template <typename AccumT, typename OffsetT>
-struct policy_hub_t
+struct arch_policies
 {
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
+  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
   {
-    static constexpr int threads_per_block  = TUNE_THREADS_PER_BLOCK;
-    static constexpr int items_per_thread   = TUNE_ITEMS_PER_THREAD;
-    static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;
-
-    using ReducePolicy =
-      cub::AgentReducePolicy<threads_per_block,
-                             items_per_thread,
-                             AccumT,
-                             items_per_vec_load,
-                             cub::BLOCK_REDUCE_WARP_REDUCTIONS,
-                             cub::LOAD_DEFAULT>;
-
-    // SingleTilePolicy
-    using SingleTilePolicy = ReducePolicy;
-
-    // SegmentedReducePolicy
-    using SegmentedReducePolicy = ReducePolicy;
-  };
-
-  using MaxPolicy = policy_t;
+    const auto policy = cub::agent_reduce_policy{
+      TUNE_THREADS_PER_BLOCK,
+      TUNE_ITEMS_PER_THREAD,
+      1 << TUNE_ITEMS_PER_VEC_LOAD_POW2,
+      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+      cub::LOAD_DEFAULT};
+    return {policy, policy, policy, policy};
+  }
 };
 #endif // !TUNE_BASE
```

bernhardmgruber marked a review conversation on the `return {policy, policy, policy, policy};` line as resolved.
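To make the new shape concrete, here is a sketch of what the tuned provider boils down to for one hypothetical tuning variant. The numeric values stand in for the `-D` macros the tuning harness injects and are invented for illustration; only names already visible in the diff (`cub::agent_reduce_policy`, `cub::reduce_arch_policy`, `cuda::arch_id`) are relied on.

```cpp
// Hypothetical tuning variant, e.g. compiled with:
//   -DTUNE_THREADS_PER_BLOCK=256 -DTUNE_ITEMS_PER_THREAD=16 -DTUNE_ITEMS_PER_VEC_LOAD_POW2=2
struct arch_policies
{
  // The provider is a callable mapping the target architecture to a bundle of agent policies;
  // this sketch ignores the architecture and returns the same policy for every slot.
  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
  {
    const auto policy = cub::agent_reduce_policy{
      256,                               // threads per block         (TUNE_THREADS_PER_BLOCK)
      16,                                // items per thread          (TUNE_ITEMS_PER_THREAD)
      1 << 2,                            // items per vectorized load (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
      cub::BLOCK_REDUCE_WARP_REDUCTIONS, // block reduction algorithm
      cub::LOAD_DEFAULT};                // cache load modifier
    return {policy, policy, policy, policy};
  }
};
```

Unlike the old `policy_hub_t`, which encoded the tuning in a `ChainedPolicy` type, the new provider is an ordinary constexpr callable, so it no longer needs the accumulator or offset type as template parameters.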
```diff
 template <typename T, typename OffsetT>
 void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
 {
-  using accum_t     = T;
-  using input_it_t  = const T*;
-  using output_it_t = T*;
-  using offset_t    = cub::detail::choose_offset_t<OffsetT>;
-  using output_t    = T;
-  using init_t      = T;
-  using dispatch_t  = cub::DispatchReduce<
-    input_it_t,
-    output_it_t,
-    offset_t,
-    op_t,
-    init_t,
-    accum_t
-#if !TUNE_BASE
-    ,
-    ::cuda::std::identity, // pass the default TransformOpT which due to policy_hub_t instantiation is not deduced
-                           // automatically
-    policy_hub_t<accum_t, offset_t>
-#endif // !TUNE_BASE
-    >;
+  using offset_t = cub::detail::choose_offset_t<OffsetT>;
+  using init_t   = T;

   // Retrieve axis parameters
-  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
+  const auto elements = static_cast<offset_t>(state.get_int64("Elements{io}"));

   thrust::device_vector<T> in = generate(elements);
   thrust::device_vector<T> out(1);

-  input_it_t d_in   = thrust::raw_pointer_cast(in.data());
-  output_it_t d_out = thrust::raw_pointer_cast(out.data());
+  auto d_in  = thrust::raw_pointer_cast(in.data());
```
Review thread on this line:

Contributor: Question: prior to this change …

Author: I would say it shouldn't. But I have to do a benchmark or SASS test anyway, so we will see.
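Presumably the question is about the deduced pointer type: `input_it_t` used to pin `d_in` to `const T*`, while `auto` now deduces a mutable `T*` from the non-const `device_vector`. A tiny sketch of just that difference, outside this PR and purely for illustration:

```cpp
#include <thrust/device_vector.h>

#include <cuda/std/type_traits>

void pointer_type_difference()
{
  thrust::device_vector<int> in(1);
  const int* d_in_before = thrust::raw_pointer_cast(in.data()); // old benchmark: input_it_t = const T*
  auto d_in_after        = thrust::raw_pointer_cast(in.data()); // new benchmark: deduces int* (non-const)
  static_assert(!::cuda::std::is_same_v<decltype(d_in_before), decltype(d_in_after)>, "constness differs");
  (void) d_in_before;
  (void) d_in_after;
}
```

Whether the missing `const` changes generated code is what the benchmark or SASS comparison mentioned in the reply would confirm.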
```diff
+  auto d_out = thrust::raw_pointer_cast(out.data());

   // Enable throughput calculations and add "Size" column to results.
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements, "Size");
   state.add_global_memory_writes<T>(1);

-  // Allocate temporary storage:
+  // FIXME(bgruber): the previous implementation did target cub::DispatchReduce, and provided T as accumulator type.
+  // This is not realistic, since a user cannot override the accumulator type the same way at the public API. For
+  // example, reducing I8 over cuda::std::plus deduces accumulator type I32 at the public API, but the benchmark forces
+  // it to I8. This skews the MemBoundScaling, leading to 20% regression for the same tuning when the public API is
+  // called (with accum_t I32) over the benchmark (forced accum_t of I8). See also:
+  // https://github.com/NVIDIA/cccl/issues/6576
+#if 0
+  auto mr = cuda::device_default_memory_pool(cuda::devices[0]);
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    auto env = ::cuda::std::execution::env{
+      ::cuda::stream_ref{launch.get_stream().get_stream()},
+      ::cuda::std::execution::prop{::cuda::mr::__get_memory_resource, mr}
+#  if !TUNE_BASE
+      ,
+      ::cuda::std::execution::prop{
+        ::cuda::execution::__get_tuning_t,
+        ::cuda::std::execution::env{
+          ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, arch_policies{}}}}
+#  endif
+    };
+    static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
+    (void) cub::DeviceReduce::Reduce(d_in, d_out, elements, op_t{}, init_t{}, env);
+  });
+#endif
```
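The I8/I32 mismatch described in the FIXME follows from plain C++ integer promotion; the snippet below only sketches the deduction the comment refers to and does not claim to be the exact trait CUB uses internally.

```cpp
#include <cuda/std/cstdint>
#include <cuda/std/functional>
#include <cuda/std/type_traits>

// With the transparent cuda::std::plus<>, int8_t operands are promoted before the addition,
// so the result type, and with it the accumulator the public API ends up using, is int (I32),
// whereas the benchmark used to force the accumulator to int8_t (I8).
using in_t    = ::cuda::std::int8_t;
using accum_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<::cuda::std::plus<>, in_t, in_t>>;
static_assert(::cuda::std::is_same_v<accum_t, int>, "I8 inputs accumulate in I32 through the public API");
```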
```diff
+  // So for now, we have to call into the dispatcher again to override the accumulator type:
+  auto transform_op = ::cuda::std::identity{};

   std::size_t temp_size;
-  dispatch_t::Dispatch(
-    nullptr, temp_size, d_in, d_out, static_cast<offset_t>(elements), op_t{}, init_t{}, 0 /* stream */);
+  cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+    nullptr,
+    temp_size,
+    d_in,
+    d_out,
+    elements,
+    op_t{},
+    init_t{},
+    0 /* stream */,
+    transform_op
+#if !TUNE_BASE
+    ,
+    arch_policies{}
+#endif
+  );

-  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
   auto* temp_storage = thrust::raw_pointer_cast(temp.data());

   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      temp_storage, temp_size, d_in, d_out, static_cast<offset_t>(elements), op_t{}, init_t{}, launch.get_stream());
+    cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+      temp_storage,
+      temp_size,
+      d_in,
+      d_out,
+      elements,
+      op_t{},
+      init_t{},
+      launch.get_stream(),
+      transform_op
+#if !TUNE_BASE
+      ,
+      arch_policies{}
+#endif
+    );
   });
 }
```
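The null-pointer size query followed by the real call mirrors the long-standing two-phase pattern of the public CUB API. For reference, a minimal public-API sketch of the same pattern (the function and variable names here are illustrative and not part of this PR):

```cpp
#include <cub/device/device_reduce.cuh>

#include <cuda/std/functional>

#include <cstddef>

// Classic two-phase CUB usage: the first call only reports the required temporary
// storage size, the second call performs the reduction.
void sum_on_device(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
{
  void* d_temp           = nullptr;
  std::size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, num_items, ::cuda::std::plus<>{}, 0, stream);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, num_items, ::cuda::std::plus<>{}, 0, stream);
  cudaFree(d_temp);
}
```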