Commits
40 commits
728aaf1  Implement the new tuning API for DeviceReduce (bernhardmgruber, Nov 6, 2025)
7b08903  bra (bernhardmgruber, Nov 19, 2025)
9498ad3  cleanup (bernhardmgruber, Nov 19, 2025)
49adfdd  format (bernhardmgruber, Nov 19, 2025)
d607c97  printing (bernhardmgruber, Nov 19, 2025)
24d6f8f  OPERATORS (bernhardmgruber, Nov 19, 2025)
36a0f24  wtf (bernhardmgruber, Nov 19, 2025)
efc6c6b  format check (bernhardmgruber, Nov 19, 2025)
6813c5b  reviewer feedback (bernhardmgruber, Nov 19, 2025)
23e73a0  fix (bernhardmgruber, Nov 19, 2025)
8cf8edf  More operator<< (bernhardmgruber, Nov 19, 2025)
d9b3bd7  Fix policy check (bernhardmgruber, Nov 19, 2025)
5416079  arch_id (bernhardmgruber, Nov 19, 2025)
5a5d65f  use correct memory resource (bernhardmgruber, Nov 19, 2025)
5245838  name (bernhardmgruber, Nov 19, 2025)
0c95529  fix archid (bernhardmgruber, Nov 19, 2025)
f4b9345  refactoring (bernhardmgruber, Nov 19, 2025)
720ef57  docs (bernhardmgruber, Nov 19, 2025)
0d4d9e4  fix (bernhardmgruber, Nov 19, 2025)
40be1e1  some reverts (bernhardmgruber, Nov 19, 2025)
a79ea88  arch_id for nondet reduce (bernhardmgruber, Nov 19, 2025)
d7aff8a  refactor tuning (bernhardmgruber, Nov 19, 2025)
c3a1b08  fix (bernhardmgruber, Nov 19, 2025)
83770f9  launch bounds templ param (bernhardmgruber, Nov 19, 2025)
de7a34a  fix concepts (bernhardmgruber, Nov 19, 2025)
81ba358  Revert to C++17 (bernhardmgruber, Nov 19, 2025)
b6ab425  cudaError_t (bernhardmgruber, Nov 19, 2025)
dd5d56f  needless cast (bernhardmgruber, Nov 19, 2025)
b057a19  Revert "launch bounds templ param" (bernhardmgruber, Nov 19, 2025)
6bb61c6  CI fixes (bernhardmgruber, Nov 19, 2025)
785739f  please GCC (bernhardmgruber, Nov 19, 2025)
b2302fe  Fix segmented_reduce (bernhardmgruber, Nov 19, 2025)
0df6910  Add cub::detail::ptx_arch_id (bernhardmgruber, Nov 20, 2025)
f9ce2e2  Fix transform_reduce (bernhardmgruber, Nov 20, 2025)
3d826be  drop comment (bernhardmgruber, Nov 20, 2025)
f06dea4  Fix warning: may be used uninitlaized (bernhardmgruber, Nov 20, 2025)
51e8cea  port transform_reduce benchmark as well (bernhardmgruber, Nov 20, 2025)
49c1d59  fix (bernhardmgruber, Nov 20, 2025)
1171bb7  fix CI (bernhardmgruber, Nov 20, 2025)
8309270  Fix concept checks (bernhardmgruber, Nov 20, 2025)
132 changes: 61 additions & 71 deletions c/parallel/src/reduce.cu
@@ -10,9 +10,7 @@

#include <cub/detail/choose_offset.cuh>
#include <cub/detail/launcher/cuda_driver.cuh>
#include <cub/detail/ptx-json-parser.h>
#include <cub/device/device_reduce.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

#include <cuda/std/algorithm>
@@ -44,29 +42,6 @@ static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "O

namespace reduce
{
struct reduce_runtime_tuning_policy
{
cub::detail::RuntimeReduceAgentPolicy single_tile;
cub::detail::RuntimeReduceAgentPolicy reduce;

auto SingleTile() const
{
return single_tile;
}
auto Reduce() const
{
return reduce;
}

using MaxPolicy = reduce_runtime_tuning_policy;

template <typename F>
cudaError_t Invoke(int, F& op)
{
return op.template Invoke<reduce_runtime_tuning_policy>(*this);
}
};

static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init)
{
// TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet
Expand Down Expand Up @@ -179,7 +154,6 @@ CUresult cccl_device_reduce_build_ex(
{
const char* name = "device_reduce";

const int cc = cc_major * 10 + cc_minor;
const cccl_type_info accum_t = reduce::get_accumulator_type(op, input_it, init);
const auto accum_cpp = cccl_type_enum_to_name(accum_t.type);

@@ -193,7 +167,43 @@

const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);

auto policy_hub_expr = std::format("cub::detail::reduce::policy_hub<{}, {}, {}>", accum_cpp, offset_t, op_name);
const auto cub_arch_policies = [&] {
using namespace cub::detail::reduce;

auto accum_type = accum_type::other;
if (accum_t.type == CCCL_FLOAT32)
{
accum_type = accum_type::float32;
}
else if (accum_t.type == CCCL_FLOAT64)
{
accum_type = accum_type::double32;
}

auto operation_t = op_type::unknown;
switch (op.type)
{
case CCCL_PLUS:
operation_t = op_type::plus;
break;
case CCCL_MINIMUM:
case CCCL_MAXIMUM:
operation_t = op_type::min_or_max;
break;
default:
break;
}

const int offset_size = int{sizeof(OffsetT)};
return arch_policies{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
}();

// TODO(bgruber): drop this if tuning policies become formattable
std::stringstream cub_arch_policies_str;
cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));

auto policy_hub_expr =
std::format("cub::detail::reduce::arch_policies_from_types<{}, {}, {}>", accum_cpp, offset_t, op_name);

std::string final_src = std::format(
R"XXX(
Expand All @@ -206,21 +216,19 @@ struct __align__({2}) storage_t {{
{3}
{4}
{5}
using device_reduce_policy = {6}::MaxPolicy;

#include <cub/detail/ptx-json/json.h>
__device__ consteval auto& policy_generator() {{
return ptx_json::id<ptx_json::string("device_reduce_policy")>()
= cub::detail::reduce::ReducePolicyWrapper<device_reduce_policy::ActivePolicy>::EncodedPolicy();
}};
using device_reduce_policy = {6};
using namespace cub;
using namespace cub::detail::reduce;
static_assert(device_reduce_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {7}, "Host generated and JIT compiled policy mismatch");
)XXX",
jit_template_header_contents, // 0
input_it.value_type.size, // 1
input_it.value_type.alignment, // 2
input_iterator_src, // 3
output_iterator_src, // 4
op_src, // 5
policy_hub_expr); // 6
policy_hub_expr, // 6
cub_arch_policies_str.view()); // 7

#if false // CCCL_DEBUGGING_SWITCH
fflush(stderr);
@@ -249,7 +257,6 @@ __device__ consteval auto& policy_generator() {{
"-rdc=true",
"-dlto",
"-DCUB_DISABLE_CDP",
"-DCUB_ENABLE_POLICY_PTX_JSON",
"-std=c++20"};

// Add user's extra flags if config is provided
@@ -286,18 +293,11 @@ __device__ consteval auto& policy_generator() {{
&build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name.c_str()));
check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name.c_str()));

nlohmann::json runtime_policy =
cub::detail::ptx_json::parse("device_reduce_policy", {result.data.get(), result.size});

using cub::detail::RuntimeReduceAgentPolicy;
auto reduce_policy = RuntimeReduceAgentPolicy::from_json(runtime_policy, "ReducePolicy");
auto st_policy = RuntimeReduceAgentPolicy::from_json(runtime_policy, "SingleTilePolicy");

build->cc = cc;
build->cc = cc_major * 10 + cc_minor;
build->cubin = (void*) result.data.release();
build->cubin_size = result.size;
build->accumulator_size = accum_t.size;
build->runtime_policy = new reduce::reduce_runtime_tuning_policy{st_policy, reduce_policy};
build->runtime_policy = new cub::detail::reduce::arch_policies{cub_arch_policies};
}
catch (const std::exception& exc)
{
@@ -330,30 +330,19 @@ CUresult cccl_device_reduce(
CUdevice cu_device;
check(cuCtxGetDevice(&cu_device));

auto exec_status = cub::DispatchReduce<
indirect_arg_t, // InputIteratorT
indirect_arg_t, // OutputIteratorT
::cuda::std::size_t, // OffsetT
indirect_arg_t, // ReductionOpT
indirect_arg_t, // InitT
void, // AccumT
::cuda::std::identity, // TransformOpT
reduce::reduce_runtime_tuning_policy, // PolicyHub
reduce::reduce_kernel_source, // KernelSource
cub::detail::CudaDriverLauncherFactory>:: // KernelLauncherFactory
Dispatch(
d_temp_storage,
*temp_storage_bytes,
d_in,
d_out,
num_items,
op,
init,
stream,
{},
{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc},
*reinterpret_cast<reduce::reduce_runtime_tuning_policy*>(build.runtime_policy));
auto exec_status = cub::detail::reduce::dispatch<void>(
d_temp_storage,
*temp_storage_bytes,
indirect_arg_t{d_in}, // could be indirect_iterator_t, but CUB does not need to increment it
indirect_arg_t{d_out}, // could be indirect_iterator_t, but CUB does not need to increment it
static_cast<OffsetT>(num_items),
indirect_arg_t{op},
indirect_arg_t{init},
stream,
::cuda::std::identity{},
*static_cast<cub::detail::reduce::arch_policies*>(build.runtime_policy),
reduce::reduce_kernel_source{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});

error = static_cast<CUresult>(exec_status);
}
@@ -383,8 +372,9 @@ CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* build_ptr
return CUDA_ERROR_INVALID_VALUE;
}

std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(build_ptr->cubin));
std::unique_ptr<char[]> policy(reinterpret_cast<char*>(build_ptr->runtime_policy));
using namespace cub::detail::reduce;
std::unique_ptr<char[]> cubin(static_cast<char*>(build_ptr->cubin));
std::unique_ptr<arch_policies> policy(static_cast<arch_policies*>(build_ptr->runtime_policy));
check(cuLibraryUnload(build_ptr->library));
}
catch (const std::exception& exc)
133 changes: 78 additions & 55 deletions cub/benchmarks/bench/reduce/base.cuh
@@ -5,88 +5,111 @@

#include <cub/device/device_reduce.cuh>

#ifndef TUNE_BASE
# define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
#endif
#include <cuda/__device/all_devices.h>
#include <cuda/__memory_resource/device_memory_pool.h>

#include <nvbench_helper.cuh>

#if !TUNE_BASE
template <typename AccumT, typename OffsetT>
struct policy_hub_t
struct arch_policies
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> ::cub::reduce_arch_policy
{
static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK;
static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD;
static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;

using ReducePolicy =
cub::AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
cub::LOAD_DEFAULT>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

using MaxPolicy = policy_t;
const auto policy = cub::agent_reduce_policy{
TUNE_THREADS_PER_BLOCK,
TUNE_ITEMS_PER_THREAD,
1 << TUNE_ITEMS_PER_VEC_LOAD_POW2,
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
cub::LOAD_DEFAULT};
return {policy, policy, policy, policy};
}
};
#endif // !TUNE_BASE

template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using accum_t = T;
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using output_t = T;
using init_t = T;
using dispatch_t = cub::DispatchReduce<
input_it_t,
output_it_t,
offset_t,
op_t,
init_t,
accum_t
#if !TUNE_BASE
,
::cuda::std::identity, // pass the default TransformOpT which due to policy_hub_t instantiation is not deduced
// automatically
policy_hub_t<accum_t, offset_t>
#endif // !TUNE_BASE
>;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using init_t = T;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
const auto elements = static_cast<offset_t>(state.get_int64("Elements{io}"));

thrust::device_vector<T> in = generate(elements);
thrust::device_vector<T> out(1);

input_it_t d_in = thrust::raw_pointer_cast(in.data());
output_it_t d_out = thrust::raw_pointer_cast(out.data());
auto d_in = thrust::raw_pointer_cast(in.data());
Review comment (Contributor):
Question: prior to this change d_in had the type const T*, but with this the const is gone. Could this affect performance?

Reply (Contributor Author):
I would say it shouldn't. But I have to do a benchmark or SASS test anyway, so we will see.
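For reference, a minimal sketch of the alternative discussed in this thread (not part of the PR; it merely restores the const qualification the benchmark used before this change):

// Hypothetical variant keeping the input pointer const-qualified; `in` is the
// thrust::device_vector<T> defined in the benchmark above.
const T* d_in = thrust::raw_pointer_cast(in.data());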

auto d_out = thrust::raw_pointer_cast(out.data());

// Enable throughput calculations and add "Size" column to results.
state.add_element_count(elements);
state.add_global_memory_reads<T>(elements, "Size");
state.add_global_memory_writes<T>(1);

// Allocate temporary storage:
// FIXME(bgruber): the previous implementation did target cub::DispatchReduce, and provided T as accumulator type.
// This is not realistic, since a user cannot override the accumulator type the same way at the public API. For
// example, reducing I8 over cuda::std::plus deduces accumulator type I32 at the public API, but the benchmark forces
// it to I8. This skews the MemBoundScaling, leading to 20% regression for the same tuning when the public API is
// called (with accum_t I32) over the benchmark (forced accum_t of I8). See also:
// https://github.com/NVIDIA/cccl/issues/6576
#if 0
auto mr = cuda::device_default_memory_pool(cuda::devices[0]);
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
auto env = ::cuda::std::execution::env{
::cuda::stream_ref{launch.get_stream().get_stream()},
::cuda::std::execution::prop{::cuda::mr::__get_memory_resource, mr}
# if !TUNE_BASE
,
::cuda::std::execution::prop{
::cuda::execution::__get_tuning_t,
::cuda::std::execution::env{
::cuda::std::execution::prop{::cub::detail::reduce::get_tuning_query_t, arch_policies{}}}}
# endif
};
static_assert(::cuda::std::execution::__queryable_with<decltype(env), ::cuda::mr::__get_memory_resource_t>);
(void) cub::DeviceReduce::Reduce(d_in, d_out, elements, op_t{}, init_t{}, env);
});
#endif

// So for now, we have to call into the dispatcher again to override the accumulator type:
auto transform_op = ::cuda::std::identity{};

std::size_t temp_size;
dispatch_t::Dispatch(
nullptr, temp_size, d_in, d_out, static_cast<offset_t>(elements), op_t{}, init_t{}, 0 /* stream */);
cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
nullptr,
temp_size,
d_in,
d_out,
elements,
op_t{},
init_t{},
0 /* stream */,
transform_op
#if !TUNE_BASE
,
arch_policies{}
#endif
);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
temp_storage, temp_size, d_in, d_out, static_cast<offset_t>(elements), op_t{}, init_t{}, launch.get_stream());
cub::detail::reduce::dispatch</* OverrideAccumT = */ T>(
temp_storage,
temp_size,
d_in,
d_out,
elements,
op_t{},
init_t{},
launch.get_stream(),
transform_op
#if !TUNE_BASE
,
arch_policies{}
#endif
);
});
}
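As background for the FIXME above about accumulator deduction, a minimal sketch (not part of the diff; it assumes op_t is a transparent functor such as cuda::std::plus<>) of why the public API ends up with a wider accumulator for small integer types, which is what the OverrideAccumT template argument works around in this benchmark:

#include <cstdint>

#include <cuda/std/functional>
#include <cuda/std/type_traits>

// At the public API the accumulator is deduced from the result of op(init, *d_in). For I8
// inputs with a transparent plus, the usual arithmetic conversions promote the result to
// int (I32), whereas the benchmark forces the accumulator back to T.
using deduced_accum_t = decltype(::cuda::std::plus<>{}(::std::int8_t{}, ::std::int8_t{}));
static_assert(::cuda::std::is_same_v<deduced_accum_t, int>, "I8 + I8 promotes to int");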
