Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 50 additions & 8 deletions src/colvar_gpu_calc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,13 @@ int colvarmodule_gpu_calc::cvc_calc_total_force(
}
}
if (!g.nodes.empty()) {
error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, &params));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
g.graph_exec_initialized = true;
}
}
Expand Down Expand Up @@ -213,7 +219,13 @@ int colvarmodule_gpu_calc::atom_group_read_data_gpu(
if (error_code != COLVARS_OK) return error_code;
g.nodes.push_back({*ag, child_graph_node, require_cpu_buffers});
}
error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, &params));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
if (error_code != COLVARS_OK) return error_code;
g.graph_exec_initialized = true;
if (cvmodule->debug()) {
Expand Down Expand Up @@ -286,7 +298,13 @@ int colvarmodule_gpu_calc::cvc_calc_value(
}
}
if (!g.nodes.empty()) {
error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, &params));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
g.graph_exec_initialized = true;
}
}
Expand Down Expand Up @@ -373,7 +391,13 @@ int colvarmodule_gpu_calc::cvc_calc_gradients(
}
}
if (!g.nodes.empty()) {
error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, &params));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
g.graph_exec_initialized = true;
}
}
Expand Down Expand Up @@ -484,7 +508,13 @@ int colvarmodule_gpu_calc::atom_group_calc_fit_gradients(
if (error_code != COLVARS_OK) return error_code;
g.nodes.push_back({*ag, child_graph_node, require_cpu_buffers});
}
error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, &params));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
if (error_code != COLVARS_OK) return error_code;
g.graph_exec_initialized = true;
if (cvmodule->debug()) {
Expand Down Expand Up @@ -564,7 +594,13 @@ int colvarmodule_gpu_calc::cvc_calc_Jacobian_derivative(
}
}
if (!g.nodes.empty()) {
error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, &params));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
g.graph_exec_initialized = true;
}
}
Expand Down Expand Up @@ -743,6 +779,7 @@ int colvarmodule_gpu_calc::apply_forces(const std::vector<colvar*>& colvars, col
if (error_code != COLVARS_OK) return error_code;\
} while (0);
colvarproxy* p = cvmodule->proxy;
cudaStream_t stream = p->get_default_stream();
if (!apply_forces_compute.graph_exec_initialized) {
forced_atom_groups.clear();
// Find all unique atom groups requiring forces
Expand Down Expand Up @@ -810,7 +847,13 @@ int colvarmodule_gpu_calc::apply_forces(const std::vector<colvar*>& colvars, col
&child_graph_node, apply_forces_compute.graph, NULL, 0, ag_graph)));
checkColvarsError(checkGPUError(cudaGraphDestroy(ag_graph)));
}
checkColvarsError(checkGPUError(cudaGraphInstantiate(&apply_forces_compute.graph_exec, apply_forces_compute.graph)));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
checkColvarsError(checkGPUError(cudaGraphInstantiateWithParams(&apply_forces_compute.graph_exec, apply_forces_compute.graph, &params)));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
apply_forces_compute.graph_exec_initialized = true;
// Debug graph
if (cvmodule->debug()) {
Expand All @@ -832,7 +875,6 @@ int colvarmodule_gpu_calc::apply_forces(const std::vector<colvar*>& colvars, col
}
}
}
cudaStream_t stream = p->get_default_stream();
#if defined (COLVARS_NVTX_PROFILING)
apply_forces_prof.start();
#endif
Expand Down
93 changes: 92 additions & 1 deletion src/colvar_gpu_support.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,24 @@
#ifdef COLVARS_NVTX_PROFILING
#include <nvtx3/nvToolsExt.h>
#endif
#define COLVARS_SYNC_WARP __syncwarp()
#endif // defined(COLVARS_CUDA)

#if defined(COLVARS_HIP)
#include <hip/hip_runtime.h>
#if defined(__HIP_PLATFORM_AMD__)
#if HIP_VERSION_MAJOR >= 7
#define COLVARS_SYNC_WARP __syncwarp()
#else
#define COLVARS_SYNC_WARP __threadfence_block()
#endif
#elif defined(__HIP_PLATFORM_NVIDIA__)
#define COLVARS_SYNC_WARP __syncwarp()
#else
#error "Unknown HIP platform"
#endif
#endif // defined(COLVARS_HIP)

#if defined(COLVARS_HIP)
#ifndef cudaError_t
#define cudaError_t hipError_t
Expand All @@ -28,6 +44,10 @@
#define cudaFreeHost hipFreeHost
#endif // cudaFreeHost

#ifndef cudaFreeAsync
#define cudaFreeAsync hipFreeAsync
#endif // cudaFreeAsync

#ifndef cudaGetErrorString
#define cudaGetErrorString hipGetErrorString
#endif // cudaGetErrorString
Expand Down Expand Up @@ -72,6 +92,22 @@
#define cudaGraphInstantiate hipGraphInstantiate
#endif // cudaGraphInstantiate

#ifndef cudaGraphInstantiateWithParams
#define cudaGraphInstantiateWithParams hipGraphInstantiateWithParams
#endif // cudaGraphInstantiateWithParams

#ifndef cudaGraphInstantiateParams
#define cudaGraphInstantiateParams hipGraphInstantiateParams
#endif // cudaGraphInstantiateParams

#ifndef cudaGraphInstantiateFlagUpload
#define cudaGraphInstantiateFlagUpload hipGraphInstantiateFlagUpload
#endif // cudaGraphInstantiateFlagUpload

#ifndef cudaGraphInstantiateSuccess
#define cudaGraphInstantiateSuccess hipGraphInstantiateSuccess
#endif // cudaGraphInstantiateSuccess

#ifndef cudaGraphLaunch
#define cudaGraphLaunch hipGraphLaunch
#endif // cudaGraphLaunch
Expand All @@ -84,10 +120,34 @@
#define cudaGraph_t hipGraph_t
#endif // cudaGraph_t

#ifndef cudaGraphDebugDotPrint
#define cudaGraphDebugDotPrint hipGraphDebugDotPrint
#endif // cudaGraphDebugDotPrint

#ifndef cudaGraphDebugDotFlags
#define cudaGraphDebugDotFlags hipGraphDebugDotFlags
#endif // cudaGraphDebugDotFlags

#ifndef cudaGraphDebugDotFlagsVerbose
#define cudaGraphDebugDotFlagsVerbose hipGraphDebugDotFlagsVerbose
#endif // cudaGraphDebugDotFlagsVerbose

#ifndef cudaHostAllocMapped
#define cudaHostAllocMapped hipHostAllocMapped
#endif // cudaHostAllocMapped

#ifndef cudaHostAllocDefault
#define cudaHostAllocDefault hipHostAllocDefault
#endif // cudaHostAllocDefault

#ifndef cudaHostAlloc
#define cudaHostAlloc hipHostAlloc
#endif // cudaHostAlloc

#ifndef cudaLaunchKernel
#define cudaLaunchKernel hipLaunchKernel
#endif // cudaLaunchKernel

#ifndef cudaKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#endif // cudaKernelNodeParams
Expand All @@ -96,10 +156,18 @@
#define cudaMalloc hipMalloc
#endif // cudaMalloc

#ifndef cudaMallocAsync
#define cudaMallocAsync hipMallocAsync
#endif // cudaMallocAsync

#ifndef cudaMallocHost
#define cudaMallocHost hipMallocHost
#endif // cudaMallocHost

#ifndef cudaGraphAddMemcpyNode1D
#define cudaGraphAddMemcpyNode1D hipGraphAddMemcpyNode1D
#endif

#ifndef cudaMemcpy
#define cudaMemcpy hipMemcpy
#endif // cudaMemcpy
Expand Down Expand Up @@ -132,6 +200,10 @@
#define cudaMemset hipMemset
#endif // cudaMemset

#ifndef cudaMemsetParams
#define cudaMemsetParams hipMemsetParams
#endif // cudaMemsetParams

#ifndef cudaMemsetAsync
#define cudaMemsetAsync hipMemsetAsync
#endif // cudaMemsetAsync
Expand Down Expand Up @@ -188,6 +260,25 @@ static unsigned int default_reduce_max_num_blocks = 64;
#define COLVARS_DEVICE
#endif

// HIP does not have cuda::std::array since libhipcxx is not a part of the ROCm distribution,
// so reinvent the wheel...
#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
template <typename T, unsigned long N>
class array1d {
public:
T m_data[N];
using value_type = T;
using size_type = decltype(N);
using reference = value_type&;
using const_reference = const value_type&;
using pointer = T*;
using const_pointer = const T*;
COLVARS_HOST_DEVICE constexpr size_type size() const {return N;}
COLVARS_HOST_DEVICE reference operator[](size_type pos) {return m_data[pos];}
COLVARS_HOST_DEVICE const_reference operator[](size_type pos) const {return m_data[pos];}
};
#endif

// TODO: What about SYCL?
#if ( defined(COLVARS_CUDA) || defined(COLVARS_HIP) )
/**
Expand Down Expand Up @@ -219,7 +310,7 @@ class CudaHostAllocator {
return ptr;
}
void deallocate(T* ptr, size_t n) noexcept {
cudaFreeHost(ptr);
(void)cudaFreeHost(ptr);
}
template<typename U, typename... Args>
void construct(U* p, Args&&... args) {
Expand Down
12 changes: 9 additions & 3 deletions src/colvaratoms_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -886,8 +886,8 @@ int colvaratoms_gpu::read_positions_gpu_debug(
cpu_atoms->fitting_group->atoms_pos.data(),
3 * cpu_atoms->fitting_group->num_atoms, stream);
}
error_code |= checkGPUError(cudaStreamSynchronize(stream));
}
error_code |= checkGPUError(cudaStreamSynchronize(stream));
return error_code;
}

Expand All @@ -904,8 +904,14 @@ int colvaratoms_gpu::calc_required_properties_gpu_debug(
error_code |= add_update_cpu_buffers_nodes(
cpu_atoms, debug_graphs.graph_calc_required_properties, nodes_map);
}
error_code |= checkGPUError(cudaGraphInstantiate(
&debug_graphs.graph_exec_calc_required_properties, debug_graphs.graph_calc_required_properties));
cudaGraphInstantiateParams params{0};
params.flags = cudaGraphInstantiateFlagUpload;
params.uploadStream = stream;
error_code |= checkGPUError(cudaGraphInstantiateWithParams(
&debug_graphs.graph_exec_calc_required_properties, debug_graphs.graph_calc_required_properties, &params));
if (params.result_out != cudaGraphInstantiateSuccess) {
error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR);
}
debug_graphs.initialized = true;
}
error_code |= checkGPUError(cudaGraphLaunch(
Expand Down
2 changes: 1 addition & 1 deletion src/colvarcomp_distances.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ int colvar::rmsd::init(std::string const &conf)
num_ref_pos = ref_pos.size();
ref_pos_soa = cvm::atom_group::pos_aos_to_soa(ref_pos);
if (has_gpu_implementation()) {
#if defined (COLVARS_CUDA) || defined (COLVARS_GPU)
#if defined (COLVARS_CUDA) || defined (COLVARS_HIP)
colvarproxy* p = cvmodule->proxy;
error_code |= p->reallocate_device(&d_ref_pos_soa, 3 * num_ref_pos);
error_code |= p->copy_HtoD(ref_pos_soa.data(), d_ref_pos_soa, 3 * num_ref_pos);
Expand Down
2 changes: 1 addition & 1 deletion src/colvarproxy_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using namespace colvars_gpu;
#if defined (COLVARS_CUDA) || defined (COLVARS_HIP) || defined (COLVARS_SYCL)
int colvarproxy_gpu::allocate_host_T(void **pp, const size_t len, const size_t sizeofT) {
int error_code = COLVARS_OK;
error_code |= checkGPUError(cudaMallocHost(pp, sizeofT*len));
error_code |= checkGPUError(cudaHostAlloc(pp, sizeofT*len, cudaHostAllocDefault));
return error_code;
}

Expand Down
Loading