
Commit 7a5a806

Switch back to calling dispatch routine
1 parent a7eb660 commit 7a5a806

File tree

1 file changed: +48 −4 lines changed


cub/benchmarks/bench/reduce/base.cuh

Lines changed: 48 additions & 4 deletions
@@ -23,6 +23,7 @@ struct policy_hub_t
 };
 #endif // !TUNE_BASE
 
+#if 0
 struct caching_last_alloc_mr
 {
   void* last_ptr = nullptr;
@@ -101,6 +102,7 @@ struct caching_last_alloc_mr
 };
 
 static_assert(cuda::mr::resource<caching_last_alloc_mr>);
+#endif
 
 template <typename T, typename OffsetT>
 void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
@@ -126,24 +128,66 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   // This is not realistic, since a user cannot set the accumulator type the same way at the public API. For example,
   // reducing I8 over cuda::std::plus deduces accumulator type I32 at the public API, but the benchmark forces it to I8.
   // This skews the MemBoundScaling, leading to 20% regression for the same tuning when the public API is called (with
-  // accum_t I32) over the benchmark (forced accum_t of I8).
-
+  // accum_t I32) over the benchmark (forced accum_t of I8). See also: https://github.com/NVIDIA/cccl/issues/6576
+#if 0
   caching_last_alloc_mr mr;
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
     auto env = ::cuda::std::execution::env{
       ::cuda::stream_ref{launch.get_stream().get_stream()},
       ::cuda::std::execution::prop{::cuda::mr::__get_memory_resource, mr}
-#if !TUNE_BASE
+# if !TUNE_BASE
       ,
       ::cuda::std::execution::prop{
        ::cuda::execution::__get_tuning_t,
        ::cuda::std::execution::env{
          ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, policy_hub_t{}}}}
-#endif
+# endif
    };
    static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
    (void) cub::DeviceReduce::Reduce(d_in, d_out, elements, op_t{}, init_t{}, env);
  });
+#endif
+
+  // So for now, we have to call into the dispatcher again to override the accumulator type:
+  auto transform_op = ::cuda::std::identity{};
+
+  std::size_t temp_size;
+  cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+    nullptr,
+    temp_size,
+    d_in,
+    d_out,
+    static_cast<offset_t>(elements),
+    op_t{},
+    init_t{},
+    0 /* stream */,
+    transform_op
+#if !TUNE_BASE
+    ,
+    policy_hub_t{}
+#endif
+  );
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+      temp_storage,
+      temp_size,
+      d_in,
+      d_out,
+      static_cast<offset_t>(elements),
+      op_t{},
+      init_t{},
+      launch.get_stream(),
+      transform_op
+#if !TUNE_BASE
+      ,
+      policy_hub_t{}
+#endif
+    );
+  });
 }
 
 NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
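
Note on the accumulator remark in the comment above: with the transparent ::cuda::std::plus<>, adding two 8-bit values triggers the usual C++ integral promotions, so result-type deduction yields int (I32) rather than int8_t (I8). A minimal standalone sketch of that promotion (the in_t/accum_t aliases are illustrative, and the exact trait CUB uses internally may differ):

#include <cuda/std/functional>
#include <cuda/std/type_traits>

// Transparent plus: operator()(int8_t, int8_t) integral-promotes both
// operands, so the deduced result type is int (I32), not int8_t (I8).
using in_t    = signed char;
using accum_t = ::cuda::std::invoke_result_t<::cuda::std::plus<>, in_t, in_t>;
static_assert(::cuda::std::is_same_v<accum_t, int>, "I8 over plus deduces an I32 accumulator");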
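
The new benchmark body follows CUB's usual two-phase calling convention: the first dispatch call receives a null temp-storage pointer and only writes the required scratch size into temp_size; the second call, with storage allocated, performs the reduction. A rough sketch of the same pattern against the public cub::DeviceReduce::Reduce API (run_reduce and the int element type are illustrative; the commit itself goes through the internal dispatch routine in order to force the accumulator type):

#include <cub/device/device_reduce.cuh>
#include <thrust/device_vector.h>
#include <cuda/std/functional>
#include <cstddef>

void run_reduce(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
{
  // Phase 1: null temp-storage pointer; only computes temp_bytes.
  std::size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(nullptr, temp_bytes, d_in, d_out, num_items,
                            ::cuda::std::plus<>{}, 0, stream);

  // Phase 2: allocate scratch space and run the actual reduction.
  thrust::device_vector<unsigned char> temp(temp_bytes);
  cub::DeviceReduce::Reduce(thrust::raw_pointer_cast(temp.data()), temp_bytes,
                            d_in, d_out, num_items, ::cuda::std::plus<>{}, 0, stream);
}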
