Some (internal) cuda::memcpy_async examples began to hang after #5996.
Example: https://godbolt.org/z/W5KvMh36e
// Copies 16 doubles from `ary` into a block-shared staging buffer with
// cuda::memcpy_async, using a block-scoped cuda::barrier as the
// synchronization object.
//
// Sequence:
//   1. Thread 0 initialises the barrier with an expected arrival count of 32.
//   2. __syncthreads() ensures every thread sees the initialised barrier
//      before using it.
//   3. Thread 0 alone issues the copy (16-byte aligned, 128 bytes total).
//   4. Every thread calls arrive_and_wait() on the barrier.
//
// NOTE(review): the arrival count is hard-coded to 32, so this presumably
// expects a launch with exactly 32 threads per block -- confirm against the
// launch site, which is not visible here.
__global__ void kernel_pipeline(double* __restrict__ ary) {
  constexpr int kElemCount = 16;         // number of doubles staged
  constexpr int kExpectedArrivals = 32;  // arrivals needed to complete a phase

  __shared__ alignas(16) double staging[kElemCount];
  __shared__ cuda::barrier<cuda::thread_scope_block> block_bar;

  const bool is_first_thread = (threadIdx.x == 0);

  // A single thread sets up the barrier; the others must not touch it until
  // the __syncthreads() below.
  if (is_first_thread) {
    init(&block_bar, kExpectedArrivals);
  }
  __syncthreads();

  // Only thread 0 submits the copy; sizeof(staging) == 16 * sizeof(double).
  if (is_first_thread) {
    const auto copy_bytes = cuda::aligned_size_t<16>(sizeof(staging));
    cuda::memcpy_async(staging, ary, copy_bytes, block_bar);
  }

  // All threads (including the one that issued the copy) arrive and wait.
  block_bar.arrive_and_wait();
}