@@ -23,6 +23,7 @@ struct policy_hub_t
 };
 #endif // !TUNE_BASE
 
+#if 0
 struct caching_last_alloc_mr
 {
   void* last_ptr = nullptr;
@@ -101,6 +102,7 @@ struct caching_last_alloc_mr
 };
 
 static_assert(cuda::mr::resource<caching_last_alloc_mr>);
+#endif
 
 template <typename T, typename OffsetT>
 void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
@@ -126,24 +128,66 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   // This is not realistic, since a user cannot set the accumulator type the same way at the public API. For example,
   // reducing I8 over cuda::std::plus deduces accumulator type I32 at the public API, but the benchmark forces it to I8.
   // This skews the MemBoundScaling, leading to 20% regression for the same tuning when the public API is called (with
-  // accum_t I32) over the benchmark (forced accum_t of I8).
-
+  // accum_t I32) over the benchmark (forced accum_t of I8). See also: https://github.com/NVIDIA/cccl/issues/6576
+# if 0
   caching_last_alloc_mr mr;
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
     auto env = ::cuda::std::execution::env{
       ::cuda::stream_ref{launch.get_stream().get_stream()},
       ::cuda::std::execution::prop{::cuda::mr::__get_memory_resource, mr}
-#if !TUNE_BASE
+# if !TUNE_BASE
       ,
       ::cuda::std::execution::prop{
         ::cuda::execution::__get_tuning_t,
         ::cuda::std::execution::env{
           ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, policy_hub_t{}}}}
-#endif
+# endif
     };
     static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
     (void) cub::DeviceReduce::Reduce(d_in, d_out, elements, op_t{}, init_t{}, env);
   });
+#endif
+
+  // So for now, we have to call into the dispatcher again to override the accumulator type:
+  auto transform_op = ::cuda::std::identity{};
+
+  std::size_t temp_size;
+  cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+    nullptr,
+    temp_size,
+    d_in,
+    d_out,
+    static_cast<offset_t>(elements),
+    op_t{},
+    init_t{},
+    0 /* stream */,
+    transform_op
+#if !TUNE_BASE
+    ,
+    policy_hub_t{}
+#endif
+  );
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+      temp_storage,
+      temp_size,
+      d_in,
+      d_out,
+      static_cast<offset_t>(elements),
+      op_t{},
+      init_t{},
+      launch.get_stream(),
+      transform_op
+#if !TUNE_BASE
+      ,
+      policy_hub_t{}
+#endif
+    );
+  });
 }
 
 NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
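
For context on the accumulator deduction mentioned in the comment: with the transparent `cuda::std::plus<>`, `int8_t` operands undergo integral promotion, so the invocation result, and hence the accumulator deduced at the public API, is `int` (I32). A minimal host-side sketch of that deduction, assuming only libcu++ headers:

```cpp
#include <cuda/std/functional>
#include <cuda/std/type_traits>

#include <cstdint>

// int8_t + int8_t promotes both operands to int, so the transparent
// functor's result type (and the deduced accumulator) is int, not int8_t.
using result_t = decltype(::cuda::std::plus<>{}(std::int8_t{}, std::int8_t{}));
static_assert(::cuda::std::is_same_v<result_t, int>, "I8 over plus deduces an I32 accumulator");
```

This is the mismatch the comment describes: the benchmark pins accum_t to T (I8), while a user calling the public API with the same inputs gets I32, so the two paths can select different tunings.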
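The replacement code follows CUB's usual two-phase workspace protocol, which the public API uses as well: a first call with a null workspace pointer only computes the required temporary-storage size, and a second call performs the reduction. A sketch of the same pattern against the public `cub::DeviceReduce::Sum` (`sum_two_phase` and its parameters are illustrative, not part of this change):

```cpp
#include <cub/device/device_reduce.cuh>

#include <thrust/device_vector.h>

#include <cstddef>

void sum_two_phase(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
{
  std::size_t temp_bytes = 0;
  // Phase 1: null workspace pointer, so only the required size is written.
  cub::DeviceReduce::Sum(nullptr, temp_bytes, d_in, d_out, num_items, stream);

  thrust::device_vector<unsigned char> temp(temp_bytes);

  // Phase 2: same arguments plus the allocated workspace; the reduction runs on `stream`.
  cub::DeviceReduce::Sum(thrust::raw_pointer_cast(temp.data()), temp_bytes, d_in, d_out, num_items, stream);
}
```

The benchmark calls the internal `cub::detail::reduce::dispatch` instead of this public entry point only because the public API offers no way to force the accumulator type.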