
Commit 7a5a806

Switch back to calling dispatch routine
1 parent a7eb660 commit 7a5a806

File tree

1 file changed: +48 −4 lines changed


cub/benchmarks/bench/reduce/base.cuh

Lines changed: 48 additions & 4 deletions
@@ -23,6 +23,7 @@ struct policy_hub_t
 };
 #endif // !TUNE_BASE
 
+#if 0
 struct caching_last_alloc_mr
 {
   void* last_ptr = nullptr;
@@ -101,6 +102,7 @@ struct caching_last_alloc_mr
 };
 
 static_assert(cuda::mr::resource<caching_last_alloc_mr>);
+#endif
 
 template <typename T, typename OffsetT>
 void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
@@ -126,24 +128,66 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   // This is not realistic, since a user cannot set the accumulator type the same way at the public API. For example,
   // reducing I8 over cuda::std::plus deduces accumulator type I32 at the public API, but the benchmark forces it to I8.
   // This skews the MemBoundScaling, leading to 20% regression for the same tuning when the public API is called (with
-  // accum_t I32) over the benchmark (forced accum_t of I8).
-
+  // accum_t I32) over the benchmark (forced accum_t of I8). See also: https://github.com/NVIDIA/cccl/issues/6576
+#if 0
   caching_last_alloc_mr mr;
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
     auto env = ::cuda::std::execution::env{
       ::cuda::stream_ref{launch.get_stream().get_stream()},
       ::cuda::std::execution::prop{::cuda::mr::__get_memory_resource, mr}
-#if !TUNE_BASE
+# if !TUNE_BASE
       ,
       ::cuda::std::execution::prop{
        ::cuda::execution::__get_tuning_t,
        ::cuda::std::execution::env{
          ::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, policy_hub_t{}}}}
-#endif
+# endif
    };
    static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
    (void) cub::DeviceReduce::Reduce(d_in, d_out, elements, op_t{}, init_t{}, env);
  });
+#endif
+
+  // So for now, we have to call into the dispatcher again to override the accumulator type:
+  auto transform_op = ::cuda::std::identity{};
+
+  std::size_t temp_size;
+  cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+    nullptr,
+    temp_size,
+    d_in,
+    d_out,
+    static_cast<offset_t>(elements),
+    op_t{},
+    init_t{},
+    0 /* stream */,
+    transform_op
+#if !TUNE_BASE
+    ,
+    policy_hub_t{}
+#endif
+  );
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
+      temp_storage,
+      temp_size,
+      d_in,
+      d_out,
+      static_cast<offset_t>(elements),
+      op_t{},
+      init_t{},
+      launch.get_stream(),
+      transform_op
+#if !TUNE_BASE
+      ,
+      policy_hub_t{}
+#endif
+    );
+  });
 }
 
 NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
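
Note on the accumulator remark in the comment above: with the transparent ::cuda::std::plus<>, adding two 8-bit values triggers the usual C++ integral promotions, so result-type deduction yields int (I32) rather than int8_t (I8). A minimal standalone sketch of that promotion (the in_t/accum_t aliases are illustrative, and the exact trait CUB uses internally may differ):

#include <cuda/std/functional>
#include <cuda/std/type_traits>

// Transparent plus: operator()(int8_t, int8_t) integral-promotes both
// operands, so the deduced result type is int (I32), not int8_t (I8).
using in_t    = signed char;
using accum_t = ::cuda::std::invoke_result_t<::cuda::std::plus<>, in_t, in_t>;
static_assert(::cuda::std::is_same_v<accum_t, int>, "I8 over plus deduces an I32 accumulator");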
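
The new benchmark body follows CUB's usual two-phase calling convention: the first dispatch call receives a null temp-storage pointer and only writes the required scratch size into temp_size; the second call, with storage allocated, performs the reduction. A rough sketch of the same pattern against the public cub::DeviceReduce::Reduce API (run_reduce and the int element type are illustrative; the commit itself goes through the internal dispatch routine in order to force the accumulator type):

#include <cub/device/device_reduce.cuh>
#include <thrust/device_vector.h>
#include <cuda/std/functional>
#include <cstddef>

void run_reduce(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
{
  // Phase 1: null temp-storage pointer; only computes temp_bytes.
  std::size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(nullptr, temp_bytes, d_in, d_out, num_items,
                            ::cuda::std::plus<>{}, 0, stream);

  // Phase 2: allocate scratch space and run the actual reduction.
  thrust::device_vector<unsigned char> temp(temp_bytes);
  cub::DeviceReduce::Reduce(thrust::raw_pointer_cast(temp.data()), temp_bytes,
                            d_in, d_out, num_items, ::cuda::std::plus<>{}, 0, stream);
}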
