Skip to content

Commit 04ee487

Browse files
Comments and reviewer feedback
1 parent 98753f3 commit 04ee487

File tree

3 files changed

+17
-13
lines changed

3 files changed

+17
-13
lines changed

c/parallel/src/reduce.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -260,11 +260,11 @@ using device_reduce_policy = {6};
260260
auto at = accum_type::other;
261261
if (accum_t.type == CCCL_FLOAT32)
262262
{
263-
at = accum_type::_float;
263+
at = accum_type::float32;
264264
}
265265
if (accum_t.type == CCCL_FLOAT64)
266266
{
267-
at = accum_type::_double;
267+
at = accum_type::double32;
268268
}
269269

270270
auto ot = op_type::unknown;

cub/cub/device/dispatch/dispatch_reduce.cuh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -579,12 +579,13 @@ namespace detail::reduce
579579
struct no_override
580580
{};
581581

582+
// select the accumulator type using an overload set, so __accumulator_t and invoke_result_t are not instantiated when
583+
// an overriding accumulator type is present. This is needed by CCCL.C.
582584
template <typename InputIteratorT, typename InitT, typename ReductionOpT, typename TransformOpT>
583585
_CCCL_API auto select_accum_t(no_override*)
584586
-> ::cuda::std::__accumulator_t<ReductionOpT,
585587
::cuda::std::invoke_result_t<TransformOpT, ::cuda::std::iter_value_t<InputIteratorT>>,
586588
InitT>;
587-
588589
template <typename InputIteratorT,
589590
typename InitT,
590591
typename ReductionOpT,

cub/cub/device/dispatch/tuning/tuning_reduce.cuh

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@
1919

2020
#include <cuda/std/optional>
2121

22+
#if !_CCCL_COMPILER(NVRTC)
23+
# include <ostream>
24+
#endif
25+
2226
CUB_NAMESPACE_BEGIN
2327

2428
struct agent_reduce_policy // equivalent of AgentReducePolicy
@@ -226,17 +230,17 @@ struct sm100_tuning<double, OffsetT, op_type::plus, offset_size::_4, accum_size:
226230

227231
enum class accum_type
228232
{
229-
_float,
230-
_double,
233+
float32,
234+
double32,
231235
other,
232236
};
233237

234238
template <typename AccumT>
235239
_CCCL_HOST_DEVICE constexpr accum_type classify_accum_type()
236240
{
237-
return ::cuda::std::is_same_v<AccumT, float> ? accum_type::_float
241+
return ::cuda::std::is_same_v<AccumT, float> ? accum_type::float32
238242
: ::cuda::std::is_same_v<AccumT, double>
239-
? accum_type::_double
243+
? accum_type::double32
240244
: accum_type::other;
241245
}
242246

@@ -252,11 +256,11 @@ _CCCL_API constexpr auto get_sm100_tuning(accum_type at, op_type ot, offset_size
252256
{
253257
if (ot == op_type::plus)
254258
{
255-
if (at == accum_type::_float && os == offset_size::_4 && as == accum_size::_4)
259+
if (at == accum_type::float32 && os == offset_size::_4 && as == accum_size::_4)
256260
{
257261
return sm100_tuning_values{16, 512, 2};
258262
}
259-
if (at == accum_type::_double && os == offset_size::_4 && as == accum_size::_8)
263+
if (at == accum_type::double32 && os == offset_size::_4 && as == accum_size::_8)
260264
{
261265
return sm100_tuning_values{16, 640, 1};
262266
}
@@ -379,7 +383,7 @@ struct arch_policies // equivalent to the policy_hub, holds policies for a bunch
379383
// IDEA(bgruber): instead of the constexpr function, we could also provide a map<int, reduce_arch_policy> and move the
380384
// selection mechanism elsewhere
381385

382-
_CCCL_API constexpr auto operator()(int arch) const -> reduce_arch_policy
386+
[[nodiscard]] _CCCL_API constexpr auto operator()(int arch) const -> reduce_arch_policy
383387
{
384388
if (arch >= 1000)
385389
{
@@ -431,8 +435,7 @@ struct arch_policies // equivalent to the policy_hub, holds policies for a bunch
431435
constexpr int items_per_thread = 20;
432436
constexpr int items_per_vec_load = 4;
433437

434-
auto [scaled_items,
435-
scaled_threads] = scale_mem_bound(threads_per_block, items_per_thread, accum_size);
438+
auto [scaled_items, scaled_threads] = scale_mem_bound(threads_per_block, items_per_thread, accum_size);
436439
const auto rp =
437440
agent_reduce_policy{scaled_threads, scaled_items, items_per_vec_load, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG};
438441

@@ -450,7 +453,7 @@ struct arch_policies // equivalent to the policy_hub, holds policies for a bunch
450453
template <typename AccumT, typename OffsetT, typename ReductionOpT>
451454
struct arch_policies_from_types
452455
{
453-
_CCCL_API constexpr auto operator()(int arch) const -> reduce_arch_policy
456+
[[nodiscard]] _CCCL_API constexpr auto operator()(int arch) const -> reduce_arch_policy
454457
{
455458
constexpr auto policies = arch_policies{
456459
classify_accum_type<AccumT>(), classify_op<ReductionOpT>(), classify_offset_size<OffsetT>(), int{sizeof(AccumT)}};

0 commit comments

Comments
 (0)