From 5f3bf339adab1e4dba0be49f92987448b6ff28b6 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Tue, 25 Nov 2025 13:49:53 -0600 Subject: [PATCH 01/14] build: add HIP support --- src/cuda/colvaratoms_kernel.cu | 6 +- src/cuda/colvartypes_kernel.cu | 4 +- tests/functional_gpu/CMakeLists.txt | 77 +++++++++++++++---- .../functional_gpu/run_colvars_test_cuda.cpp | 4 +- 4 files changed, 71 insertions(+), 20 deletions(-) diff --git a/src/cuda/colvaratoms_kernel.cu b/src/cuda/colvaratoms_kernel.cu index 89ee4a5f6..1b14baf83 100644 --- a/src/cuda/colvaratoms_kernel.cu +++ b/src/cuda/colvaratoms_kernel.cu @@ -7,7 +7,11 @@ #include #endif -// TODO: HIP CUB +#if defined (COLVARS_HIP) +#include +// Require libhipcxx +#include +#endif namespace colvars_gpu { #if defined(COLVARS_CUDA) || defined(COLVARS_HIP) diff --git a/src/cuda/colvartypes_kernel.cu b/src/cuda/colvartypes_kernel.cu index d1a03cb39..d016acc19 100644 --- a/src/cuda/colvartypes_kernel.cu +++ b/src/cuda/colvartypes_kernel.cu @@ -6,7 +6,9 @@ #include #endif -// TODO: HIP CUB +#if defined (COLVARS_HIP) +#include +#endif namespace colvars_gpu { #if defined(COLVARS_CUDA) || defined(COLVARS_HIP) diff --git a/tests/functional_gpu/CMakeLists.txt b/tests/functional_gpu/CMakeLists.txt index 8ed038adb..5c0bef884 100644 --- a/tests/functional_gpu/CMakeLists.txt +++ b/tests/functional_gpu/CMakeLists.txt @@ -2,26 +2,54 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) include(CheckIncludeFile) project(run_colvars_test_cuda LANGUAGES CXX) +# Supported GPU types +set(gpu_type_avail CUDA HIP) + option(BUILD_TESTS "Build tests" ON) option(COLVARS_LEPTON "Build Colvars with Lepton" ON) +set(GPU_TYPE CUDA CACHE STRING "GPU Type") +set_property(CACHE GPU_TYPE PROPERTY STRINGS ${gpu_type_avail}) -if(NOT EXISTS ${COLVARS_SOURCE_DIR}) - set(COLVARS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.." CACHE STRING "Colvars source code directory") +if(GPU_TYPE STREQUAL "CUDA") + message(STATUS "Build the test interface with CUDA") +else() + message(STATUS "Build the test interface with HIP") endif() -if(NOT DEFINED CMAKE_CUDA_STANDARD) - set(CMAKE_CUDA_STANDARD 17) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) -endif() +if(GPU_TYPE STREQUAL "CUDA") + if(NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + endif() + + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES native) + endif() + + find_package(CUDAToolkit) + include(CheckLanguage) + check_language(CUDA) + enable_language(CUDA) +else() + if(NOT DEFINED CMAKE_HIP_STANDARD) + set(CMAKE_HIP_STANDARD 17) + set(CMAKE_HIP_STANDARD_REQUIRED ON) + endif() + + if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES native) + endif() -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES native) + find_package(hip) + include(CheckLanguage) + check_language(HIP) + enable_language(HIP) + find_package(libhipcxx) endif() -find_package(CUDAToolkit) -include(CheckLanguage) -check_language(CUDA) -enable_language(CUDA) +if(NOT EXISTS ${COLVARS_SOURCE_DIR}) + set(COLVARS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.." CACHE STRING "Colvars source code directory") +endif() set(COLVARS_CUDA_DIR ${COLVARS_SOURCE_DIR}/src/cuda) @@ -63,7 +91,12 @@ if(COLVARS_LEPTON) target_compile_options(lepton PRIVATE $<$:-Wno-tautological-undefined-compare -Wno-unknown-warning-option>) endif() -add_definitions(-DCOLVARS_CUDA) +if(GPU_TYPE STREQUAL "CUDA") + add_definitions(-DCOLVARS_CUDA) +else() + add_definitions(-DCOLVARS_HIP) +endif() + if(COLVARS_LEPTON) add_definitions(-DLEPTON) endif() @@ -81,10 +114,22 @@ target_include_directories(run_colvars_test_cuda PUBLIC "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" PUBLIC $<$:${LEPTON_DIR}/include>) -if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 10) - target_link_libraries(run_colvars_test_cuda PUBLIC CUDA::cudart PUBLIC CUDA::nvtx3) +if(GPU_TYPE STREQUAL "CUDA") + set_property(TARGET run_colvars_test_cuda PROPERTY LANGUAGE CUDA) + if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 10) + target_link_libraries(run_colvars_test_cuda PRIVATE CUDA::cudart PRIVATE CUDA::nvtx3) + else() + target_link_libraries(run_colvars_test_cuda PRIVATE CUDA::cudart PRIVATE CUDA::nvToolsExt) + endif() else() - target_link_libraries(run_colvars_test_cuda PUBLIC CUDA::cudart PUBLIC CUDA::nvToolsExt) + target_link_libraries(run_colvars_test_cuda PRIVATE hip::device hip::host PRIVATE libhipcxx::libhipcxx) + set_property(TARGET run_colvars_test_cuda PROPERTY LANGUAGE HIP) + set_source_files_properties(${COLVARS_SOURCES} PROPERTIES LANGUAGE CXX) + set_source_files_properties(${LEPTON_SOURCES} PROPERTIES LANGUAGE CXX) + set_source_files_properties(CLI11.hpp PROPERTIES LANGUAGE CXX) + set_source_files_properties(run_colvars_test_cuda.cpp PROPERTIES LANGUAGE CXX) + set_source_files_properties(${COLVARS_CUDA_HEADER} PROPERTIES LANGUAGE HIP) + set_source_files_properties(${COLVARS_CUDA_SOURCE} PROPERTIES LANGUAGE HIP) endif() if(COLVARS_LEPTON) diff --git a/tests/functional_gpu/run_colvars_test_cuda.cpp b/tests/functional_gpu/run_colvars_test_cuda.cpp index 6e3f3defe..a5b56468b 100644 --- a/tests/functional_gpu/run_colvars_test_cuda.cpp +++ b/tests/functional_gpu/run_colvars_test_cuda.cpp @@ -12,7 +12,7 @@ #define COLVARPROXY_VERSION COLVARS_VERSION -#if defined (COLVARS_CUDA) +#if defined (COLVARS_CUDA) || defined (COLVARS_HIP) class colvarproxy_stub_gpu : public colvarproxy { public: colvarproxy_stub_gpu(); @@ -363,5 +363,5 @@ int main(int argc, char *argv[]) { #else std::cout << "This program requires CUDA to test." << std::endl; return 1; -#endif // defined (COLVARS_CUDA) +#endif // defined (COLVARS_CUDA) || defined (COLVARS_HIP) } From 4ba98f685d5075fa11cf9978b3ef1676fa2d89b6 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Thu, 2 Apr 2026 12:42:10 -0500 Subject: [PATCH 02/14] refactor: remove the dependency to cuda::std::array --- src/colvar_gpu_support.h | 19 +++++++++++++++++++ src/cuda/colvaratoms_kernel.cu | 5 +---- tests/functional_gpu/CMakeLists.txt | 2 +- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/colvar_gpu_support.h b/src/colvar_gpu_support.h index e8064e426..4c10adb8a 100644 --- a/src/colvar_gpu_support.h +++ b/src/colvar_gpu_support.h @@ -188,6 +188,25 @@ static unsigned int default_reduce_max_num_blocks = 64; #define COLVARS_DEVICE #endif +// HIP does not have cuda::std::array since libhipcxx is not a part of the ROCm distribution, +// so reinvent the wheel... +#if defined(COLVARS_CUDA) || defined(COLVARS_HIP) +template +class array1d { +public: + T m_data[N]; + using value_type = T; + using size_type = decltype(N); + using reference = value_type&; + using const_reference = const value_type&; + using pointer = T*; + using const_pointer = const T*; + COLVARS_HOST_DEVICE constexpr size_type size() const {return N;} + COLVARS_HOST_DEVICE reference operator[](size_type pos) {return m_data[pos];} + COLVARS_HOST_DEVICE const_reference operator[](size_type pos) const {return m_data[pos];} +}; +#endif + // TODO: What about SYCL? #if ( defined(COLVARS_CUDA) || defined(COLVARS_HIP) ) /** diff --git a/src/cuda/colvaratoms_kernel.cu b/src/cuda/colvaratoms_kernel.cu index 1b14baf83..535b8f3df 100644 --- a/src/cuda/colvaratoms_kernel.cu +++ b/src/cuda/colvaratoms_kernel.cu @@ -4,13 +4,10 @@ #if defined(COLVARS_CUDA) #include -#include #endif #if defined (COLVARS_HIP) #include -// Require libhipcxx -#include #endif namespace colvars_gpu { @@ -635,7 +632,7 @@ __global__ void calc_fit_forces_impl_loop1_kernel( } if (threadIdx.x == 0) { if (B_ag_rotate) { - cuda::std::array partial_dxdq; + colvars_gpu::array1d partial_dxdq; partial_dxdq = q->derivative_element_wise_product_sum(C); atomicAdd(&(sum_dxdq[0]), partial_dxdq[0]); diff --git a/tests/functional_gpu/CMakeLists.txt b/tests/functional_gpu/CMakeLists.txt index 5c0bef884..e1a67ecc6 100644 --- a/tests/functional_gpu/CMakeLists.txt +++ b/tests/functional_gpu/CMakeLists.txt @@ -122,7 +122,7 @@ if(GPU_TYPE STREQUAL "CUDA") target_link_libraries(run_colvars_test_cuda PRIVATE CUDA::cudart PRIVATE CUDA::nvToolsExt) endif() else() - target_link_libraries(run_colvars_test_cuda PRIVATE hip::device hip::host PRIVATE libhipcxx::libhipcxx) + target_link_libraries(run_colvars_test_cuda PRIVATE hip::device hip::host) set_property(TARGET run_colvars_test_cuda PROPERTY LANGUAGE HIP) set_source_files_properties(${COLVARS_SOURCES} PROPERTIES LANGUAGE CXX) set_source_files_properties(${LEPTON_SOURCES} PROPERTIES LANGUAGE CXX) From e8a81a27409b8504cc5ff4d15ea4e9196238a4f2 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 6 Apr 2026 16:46:43 -0500 Subject: [PATCH 03/14] refactor: use cudaGraphInstantiateWithParams It looks like cudaGraphInstantiateWithParams is more compatible with the signature of hipGraphInstantiateWithParams. --- src/colvar_gpu_calc.cpp | 58 ++++++++++++++++++++++++++++++++++------ src/colvar_gpu_support.h | 16 +++++++++++ src/colvaratoms_gpu.cpp | 10 +++++-- 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/src/colvar_gpu_calc.cpp b/src/colvar_gpu_calc.cpp index 376c7221a..7cf621376 100644 --- a/src/colvar_gpu_calc.cpp +++ b/src/colvar_gpu_calc.cpp @@ -64,7 +64,13 @@ int colvarmodule_gpu_calc::cvc_calc_total_force( } } if (!g.nodes.empty()) { - error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph)); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, ¶ms)); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } g.graph_exec_initialized = true; } } @@ -213,7 +219,13 @@ int colvarmodule_gpu_calc::atom_group_read_data_gpu( if (error_code != COLVARS_OK) return error_code; g.nodes.push_back({*ag, child_graph_node, require_cpu_buffers}); } - error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph)); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, ¶ms)); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } if (error_code != COLVARS_OK) return error_code; g.graph_exec_initialized = true; if (cvmodule->debug()) { @@ -286,7 +298,13 @@ int colvarmodule_gpu_calc::cvc_calc_value( } } if (!g.nodes.empty()) { - error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph)); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, ¶ms)); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } g.graph_exec_initialized = true; } } @@ -373,7 +391,13 @@ int colvarmodule_gpu_calc::cvc_calc_gradients( } } if (!g.nodes.empty()) { - error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph)); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, ¶ms)); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } g.graph_exec_initialized = true; } } @@ -484,7 +508,13 @@ int colvarmodule_gpu_calc::atom_group_calc_fit_gradients( if (error_code != COLVARS_OK) return error_code; g.nodes.push_back({*ag, child_graph_node, require_cpu_buffers}); } - error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph)); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, ¶ms)); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } if (error_code != COLVARS_OK) return error_code; g.graph_exec_initialized = true; if (cvmodule->debug()) { @@ -564,7 +594,13 @@ int colvarmodule_gpu_calc::cvc_calc_Jacobian_derivative( } } if (!g.nodes.empty()) { - error_code |= checkGPUError(cudaGraphInstantiate(&g.graph_exec, g.graph)); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + error_code |= checkGPUError(cudaGraphInstantiateWithParams(&g.graph_exec, g.graph, ¶ms)); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } g.graph_exec_initialized = true; } } @@ -743,6 +779,7 @@ int colvarmodule_gpu_calc::apply_forces(const std::vector& colvars, col if (error_code != COLVARS_OK) return error_code;\ } while (0); colvarproxy* p = cvmodule->proxy; + cudaStream_t stream = p->get_default_stream(); if (!apply_forces_compute.graph_exec_initialized) { forced_atom_groups.clear(); // Find all unique atom groups requiring forces @@ -810,7 +847,13 @@ int colvarmodule_gpu_calc::apply_forces(const std::vector& colvars, col &child_graph_node, apply_forces_compute.graph, NULL, 0, ag_graph))); checkColvarsError(checkGPUError(cudaGraphDestroy(ag_graph))); } - checkColvarsError(checkGPUError(cudaGraphInstantiate(&apply_forces_compute.graph_exec, apply_forces_compute.graph))); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + checkColvarsError(checkGPUError(cudaGraphInstantiateWithParams(&apply_forces_compute.graph_exec, apply_forces_compute.graph, ¶ms))); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } apply_forces_compute.graph_exec_initialized = true; // Debug graph if (cvmodule->debug()) { @@ -832,7 +875,6 @@ int colvarmodule_gpu_calc::apply_forces(const std::vector& colvars, col } } } - cudaStream_t stream = p->get_default_stream(); #if defined (COLVARS_NVTX_PROFILING) apply_forces_prof.start(); #endif diff --git a/src/colvar_gpu_support.h b/src/colvar_gpu_support.h index 4c10adb8a..c03749792 100644 --- a/src/colvar_gpu_support.h +++ b/src/colvar_gpu_support.h @@ -72,6 +72,22 @@ #define cudaGraphInstantiate hipGraphInstantiate #endif // cudaGraphInstantiate +#ifndef cudaGraphInstantiateWithParams +#define cudaGraphInstantiateWithParams hipGraphInstantiateWithParams +#endif // cudaGraphInstantiateWithParams + +#ifndef cudaGraphInstantiateParams +#define cudaGraphInstantiateParams hipGraphInstantiateParams +#endif // cudaGraphInstantiateParams + +#ifndef cudaGraphInstantiateFlagUpload +#define cudaGraphInstantiateFlagUpload hipGraphInstantiateFlagUpload +#endif // cudaGraphInstantiateFlagUpload + +#ifndef cudaGraphInstantiateSuccess +#define cudaGraphInstantiateSuccess hipGraphInstantiateSuccess +#endif // cudaGraphInstantiateSuccess + #ifndef cudaGraphLaunch #define cudaGraphLaunch hipGraphLaunch #endif // cudaGraphLaunch diff --git a/src/colvaratoms_gpu.cpp b/src/colvaratoms_gpu.cpp index a041d7e07..5da850a86 100644 --- a/src/colvaratoms_gpu.cpp +++ b/src/colvaratoms_gpu.cpp @@ -904,8 +904,14 @@ int colvaratoms_gpu::calc_required_properties_gpu_debug( error_code |= add_update_cpu_buffers_nodes( cpu_atoms, debug_graphs.graph_calc_required_properties, nodes_map); } - error_code |= checkGPUError(cudaGraphInstantiate( - &debug_graphs.graph_exec_calc_required_properties, debug_graphs.graph_calc_required_properties)); + cudaGraphInstantiateParams params{0}; + params.flags = cudaGraphInstantiateFlagUpload; + params.uploadStream = stream; + error_code |= checkGPUError(cudaGraphInstantiateWithParams( + &debug_graphs.graph_exec_calc_required_properties, debug_graphs.graph_calc_required_properties, ¶ms)); + if (params.result_out != cudaGraphInstantiateSuccess) { + error_code |= cvmodule->error("Failed to instantiate CUDA graph!", COLVARS_ERROR); + } debug_graphs.initialized = true; } error_code |= checkGPUError(cudaGraphLaunch( From bb5d328342742e2bd7bac24f614bc88902a5e691 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Thu, 9 Apr 2026 16:21:38 -0500 Subject: [PATCH 04/14] fix: unify the CUDA and HIP code branches --- src/cuda/colvaratoms_kernel.cu | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/cuda/colvaratoms_kernel.cu b/src/cuda/colvaratoms_kernel.cu index 535b8f3df..95b737c17 100644 --- a/src/cuda/colvaratoms_kernel.cu +++ b/src/cuda/colvaratoms_kernel.cu @@ -650,30 +650,27 @@ __global__ void calc_fit_forces_impl_loop1_kernel( __syncthreads(); if (isLastBlockDone) { // Compute dxdC in a single warp -#if defined (COLVARS_CUDA) - const unsigned int warpID = threadIdx.x / 32; -#elif defined (COLVARS_HIP) - const unsigned int warpID = threadIdx.x / 64; -#endif + const unsigned int warpID = threadIdx.x / warpSize; if (warpID == 0) { const unsigned int tid = threadIdx.x; + constexpr const int valid_items = 4; cvm::rmatrix dxdq_dqdC; dxdq_dqdC.reset(); - if (tid < 4) { + if (tid < valid_items) { dxdq_dqdC += rot_deriv->project_force_to_C_from_dxdqi(tid, sum_dxdq[tid]); } __syncwarp(); - using WarpReduce = cub::WarpReduce; + using WarpReduce = cub::WarpReduce; __shared__ typename WarpReduce::TempStorage warp_temp_storage; - dxdq_dqdC.xx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xx); __syncwarp(); - dxdq_dqdC.xy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xy); __syncwarp(); - dxdq_dqdC.xz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xz); __syncwarp(); - dxdq_dqdC.yx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yx); __syncwarp(); - dxdq_dqdC.yy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yy); __syncwarp(); - dxdq_dqdC.yz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yz); __syncwarp(); - dxdq_dqdC.zx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zx); __syncwarp(); - dxdq_dqdC.zy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zy); __syncwarp(); - dxdq_dqdC.zz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zz); __syncwarp(); + dxdq_dqdC.xx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xx, valid_items); __syncwarp(); + dxdq_dqdC.xy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xy, valid_items); __syncwarp(); + dxdq_dqdC.xz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xz, valid_items); __syncwarp(); + dxdq_dqdC.yx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yx, valid_items); __syncwarp(); + dxdq_dqdC.yy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yy, valid_items); __syncwarp(); + dxdq_dqdC.yz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yz, valid_items); __syncwarp(); + dxdq_dqdC.zx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zx, valid_items); __syncwarp(); + dxdq_dqdC.zy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zy, valid_items); __syncwarp(); + dxdq_dqdC.zz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zz, valid_items); __syncwarp(); if (tid == 0) { dxdC->xx = dxdq_dqdC.xx; dxdC->xy = dxdq_dqdC.xy; From 860cc2f4c8a6615a048f10a8899fbef0d2132b01 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Thu, 9 Apr 2026 16:26:07 -0500 Subject: [PATCH 05/14] fix: add hipcub for HIP --- src/cuda/colvaratoms_kernel.cu | 1 + src/cuda/colvarcomp_distance_kernel.cu | 5 +++++ src/cuda/colvartypes_kernel.cu | 1 + 3 files changed, 7 insertions(+) diff --git a/src/cuda/colvaratoms_kernel.cu b/src/cuda/colvaratoms_kernel.cu index 95b737c17..bbc4e63dc 100644 --- a/src/cuda/colvaratoms_kernel.cu +++ b/src/cuda/colvaratoms_kernel.cu @@ -8,6 +8,7 @@ #if defined (COLVARS_HIP) #include +#define cub hipcub #endif namespace colvars_gpu { diff --git a/src/cuda/colvarcomp_distance_kernel.cu b/src/cuda/colvarcomp_distance_kernel.cu index 0cd26c209..5717ecfad 100644 --- a/src/cuda/colvarcomp_distance_kernel.cu +++ b/src/cuda/colvarcomp_distance_kernel.cu @@ -7,6 +7,11 @@ #include #endif +#if defined (COLVARS_HIP) +#include +#define cub hipcub +#endif + namespace colvars_gpu { #if defined(COLVARS_CUDA) || defined(COVLARS_HIP) diff --git a/src/cuda/colvartypes_kernel.cu b/src/cuda/colvartypes_kernel.cu index d016acc19..5a69ede9e 100644 --- a/src/cuda/colvartypes_kernel.cu +++ b/src/cuda/colvartypes_kernel.cu @@ -8,6 +8,7 @@ #if defined (COLVARS_HIP) #include +#define cub hipcub #endif namespace colvars_gpu { From 329190fbcd2c41e8218acf4121353c92497b7083 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 13 Apr 2026 10:08:22 -0500 Subject: [PATCH 06/14] build: fix the HIP compilation --- src/colvar_gpu_support.h | 40 +++++++++++++++++++ src/cuda/colvaratoms_kernel.cu | 2 + src/cuda/colvarcomp_distance_kernel.cu | 4 +- src/cuda/colvarcomp_distance_kernel.h | 4 +- tests/functional_gpu/CMakeLists.txt | 2 +- .../functional_gpu/run_colvars_test_cuda.cpp | 2 +- 6 files changed, 48 insertions(+), 6 deletions(-) diff --git a/src/colvar_gpu_support.h b/src/colvar_gpu_support.h index c03749792..32d1875f3 100644 --- a/src/colvar_gpu_support.h +++ b/src/colvar_gpu_support.h @@ -15,6 +15,10 @@ #endif #endif // defined(COLVARS_CUDA) +#if defined(COLVARS_HIP) +#include +#endif // defined(COLVARS_HIP) + #if defined(COLVARS_HIP) #ifndef cudaError_t #define cudaError_t hipError_t @@ -28,6 +32,10 @@ #define cudaFreeHost hipFreeHost #endif // cudaFreeHost +#ifndef cudaFreeAsync +#define cudaFreeAsync hipFreeAsync +#endif // cudaFreeAsync + #ifndef cudaGetErrorString #define cudaGetErrorString hipGetErrorString #endif // cudaGetErrorString @@ -100,10 +108,30 @@ #define cudaGraph_t hipGraph_t #endif // cudaGraph_t +#ifndef cudaGraphDebugDotPrint +#define cudaGraphDebugDotPrint hipGraphDebugDotPrint +#endif // cudaGraphDebugDotPrint + +#ifndef cudaGraphDebugDotFlags +#define cudaGraphDebugDotFlags hipGraphDebugDotFlags +#endif // cudaGraphDebugDotFlags + +#ifndef cudaGraphDebugDotFlagsVerbose +#define cudaGraphDebugDotFlagsVerbose hipGraphDebugDotFlagsVerbose +#endif // cudaGraphDebugDotFlagsVerbose + #ifndef cudaHostAllocMapped #define cudaHostAllocMapped hipHostAllocMapped #endif // cudaHostAllocMapped +#ifndef cudaHostAlloc +#define cudaHostAlloc hipHostAlloc +#endif // cudaHostAlloc + +#ifndef cudaLaunchKernel +#define cudaLaunchKernel hipLaunchKernel +#endif // cudaLaunchKernel + #ifndef cudaKernelNodeParams #define cudaKernelNodeParams hipKernelNodeParams #endif // cudaKernelNodeParams @@ -112,10 +140,18 @@ #define cudaMalloc hipMalloc #endif // cudaMalloc +#ifndef cudaMallocAsync +#define cudaMallocAsync hipMallocAsync +#endif // cudaMallocAsync + #ifndef cudaMallocHost #define cudaMallocHost hipMallocHost #endif // cudaMallocHost +#ifndef cudaGraphAddMemcpyNode1D +#define cudaGraphAddMemcpyNode1D hipGraphAddMemcpyNode1D +#endif + #ifndef cudaMemcpy #define cudaMemcpy hipMemcpy #endif // cudaMemcpy @@ -148,6 +184,10 @@ #define cudaMemset hipMemset #endif // cudaMemset +#ifndef cudaMemsetParams +#define cudaMemsetParams hipMemsetParams +#endif // cudaMemsetParams + #ifndef cudaMemsetAsync #define cudaMemsetAsync hipMemsetAsync #endif // cudaMemsetAsync diff --git a/src/cuda/colvaratoms_kernel.cu b/src/cuda/colvaratoms_kernel.cu index bbc4e63dc..c444da228 100644 --- a/src/cuda/colvaratoms_kernel.cu +++ b/src/cuda/colvaratoms_kernel.cu @@ -4,10 +4,12 @@ #if defined(COLVARS_CUDA) #include +#include #endif #if defined (COLVARS_HIP) #include +#include #define cub hipcub #endif diff --git a/src/cuda/colvarcomp_distance_kernel.cu b/src/cuda/colvarcomp_distance_kernel.cu index 5717ecfad..b61ab069a 100644 --- a/src/cuda/colvarcomp_distance_kernel.cu +++ b/src/cuda/colvarcomp_distance_kernel.cu @@ -13,7 +13,7 @@ #endif namespace colvars_gpu { -#if defined(COLVARS_CUDA) || defined(COVLARS_HIP) +#if defined(COLVARS_CUDA) || defined(COLVARS_HIP) template __global__ void calc_value_rmsd_kernel( @@ -466,5 +466,5 @@ int calc_Jacobian_derivative_rmsd( dependencies.size(), &kernelNodeParams)); return error_code; } -#endif // defined(COLVARS_CUDA) || defined(COVLARS_HIP) +#endif // defined(COLVARS_CUDA) || defined(COLVARS_HIP) } diff --git a/src/cuda/colvarcomp_distance_kernel.h b/src/cuda/colvarcomp_distance_kernel.h index eeeafec21..18d91b651 100644 --- a/src/cuda/colvarcomp_distance_kernel.h +++ b/src/cuda/colvarcomp_distance_kernel.h @@ -4,7 +4,7 @@ #include "colvarmodule.h" #include "colvar_gpu_support.h" -#if defined(COLVARS_CUDA) || defined(COVLARS_HIP) +#if defined(COLVARS_CUDA) || defined(COLVARS_HIP) namespace colvars_gpu { @@ -68,5 +68,5 @@ int calc_Jacobian_derivative_rmsd( const std::vector& dependencies); } -#endif // defined(COLVARS_CUDA) || defined(COVLARS_HIP) +#endif // defined(COLVARS_CUDA) || defined(COLVARS_HIP) #endif // COLVARCOMP_DISTANCE_KERNEL_H diff --git a/tests/functional_gpu/CMakeLists.txt b/tests/functional_gpu/CMakeLists.txt index e1a67ecc6..120881895 100644 --- a/tests/functional_gpu/CMakeLists.txt +++ b/tests/functional_gpu/CMakeLists.txt @@ -44,7 +44,7 @@ else() include(CheckLanguage) check_language(HIP) enable_language(HIP) - find_package(libhipcxx) + # find_package(libhipcxx) endif() if(NOT EXISTS ${COLVARS_SOURCE_DIR}) diff --git a/tests/functional_gpu/run_colvars_test_cuda.cpp b/tests/functional_gpu/run_colvars_test_cuda.cpp index a5b56468b..d7eaf72a3 100644 --- a/tests/functional_gpu/run_colvars_test_cuda.cpp +++ b/tests/functional_gpu/run_colvars_test_cuda.cpp @@ -274,7 +274,7 @@ int colvarproxy_stub_gpu::read_frame_xyz(const char *filename, const bool write_ #endif int main(int argc, char *argv[]) { -#if defined (COLVARS_CUDA) +#if defined (COLVARS_CUDA) || defined (COLVARS_HIP) CLI::App app{"Colvars stub interface for testing"}; argv = app.ensure_utf8(argv); std::string configuration_file; From ef28b414d18c33f1ed2efbe15da349c74f03b333 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 13 Apr 2026 13:23:15 -0500 Subject: [PATCH 07/14] fix: use the correct macro --- src/colvarcomp_distances.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/colvarcomp_distances.cpp b/src/colvarcomp_distances.cpp index f2ddd6863..fb6e55321 100644 --- a/src/colvarcomp_distances.cpp +++ b/src/colvarcomp_distances.cpp @@ -970,7 +970,7 @@ int colvar::rmsd::init(std::string const &conf) num_ref_pos = ref_pos.size(); ref_pos_soa = cvm::atom_group::pos_aos_to_soa(ref_pos); if (has_gpu_implementation()) { -#if defined (COLVARS_CUDA) || defined (COLVARS_GPU) +#if defined (COLVARS_CUDA) || defined (COLVARS_HIP) colvarproxy* p = cvmodule->proxy; error_code |= p->reallocate_device(&d_ref_pos_soa, 3 * num_ref_pos); error_code |= p->copy_HtoD(ref_pos_soa.data(), d_ref_pos_soa, 3 * num_ref_pos); From 9f48ec8eb76640f1013af16c5fb2aca56787377b Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 13 Apr 2026 13:47:45 -0500 Subject: [PATCH 08/14] fix: fix the segfault of q --- src/cuda/colvaratoms_kernel.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cuda/colvaratoms_kernel.cu b/src/cuda/colvaratoms_kernel.cu index c444da228..cfa30746e 100644 --- a/src/cuda/colvaratoms_kernel.cu +++ b/src/cuda/colvaratoms_kernel.cu @@ -445,7 +445,10 @@ __global__ void apply_colvar_force_to_proxy_kernel( unsigned int num_atoms) { const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; const cvm::real force = (*force_ptr); - const cvm::rmatrix rot_inv = q->conjugate().rotation_matrix(); + cvm::rmatrix rot_inv; + if (ag_rotate) { + rot_inv = q->conjugate().rotation_matrix(); + } if (i < num_atoms) { const unsigned int proxy_index = atoms_proxy_index[i]; cvm::real fx, fy, fz; From c96f394d5fdeed52a8053e9570b3d2d0a27f47ec Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 13 Apr 2026 14:10:37 -0500 Subject: [PATCH 09/14] fix: synchronize the stream regardless of copying CPU It looks like the HIP code path relies on this to work correctly, but I don't know why. --- src/colvaratoms_gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/colvaratoms_gpu.cpp b/src/colvaratoms_gpu.cpp index 5da850a86..d51a59652 100644 --- a/src/colvaratoms_gpu.cpp +++ b/src/colvaratoms_gpu.cpp @@ -886,8 +886,8 @@ int colvaratoms_gpu::read_positions_gpu_debug( cpu_atoms->fitting_group->atoms_pos.data(), 3 * cpu_atoms->fitting_group->num_atoms, stream); } - error_code |= checkGPUError(cudaStreamSynchronize(stream)); } + error_code |= checkGPUError(cudaStreamSynchronize(stream)); return error_code; } From 71d7dc962cdb6f053dd5b32ddfb384ef8500f3cc Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 13 Apr 2026 14:59:55 -0500 Subject: [PATCH 10/14] chore: mute the HIP compilation warnings --- src/colvar_gpu_support.h | 6 +++++- src/colvarproxy_gpu.cpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/colvar_gpu_support.h b/src/colvar_gpu_support.h index 32d1875f3..9faeb8f3e 100644 --- a/src/colvar_gpu_support.h +++ b/src/colvar_gpu_support.h @@ -124,6 +124,10 @@ #define cudaHostAllocMapped hipHostAllocMapped #endif // cudaHostAllocMapped +#ifndef cudaHostAllocDefault +#define cudaHostAllocDefault hipHostAllocDefault +#endif // cudaHostAllocDefault + #ifndef cudaHostAlloc #define cudaHostAlloc hipHostAlloc #endif // cudaHostAlloc @@ -294,7 +298,7 @@ class CudaHostAllocator { return ptr; } void deallocate(T* ptr, size_t n) noexcept { - cudaFreeHost(ptr); + (void)cudaFreeHost(ptr); } template void construct(U* p, Args&&... args) { diff --git a/src/colvarproxy_gpu.cpp b/src/colvarproxy_gpu.cpp index 14a429c6b..3717c735c 100644 --- a/src/colvarproxy_gpu.cpp +++ b/src/colvarproxy_gpu.cpp @@ -6,7 +6,7 @@ using namespace colvars_gpu; #if defined (COLVARS_CUDA) || defined (COLVARS_HIP) || defined (COLVARS_SYCL) int colvarproxy_gpu::allocate_host_T(void **pp, const size_t len, const size_t sizeofT) { int error_code = COLVARS_OK; - error_code |= checkGPUError(cudaMallocHost(pp, sizeofT*len)); + error_code |= checkGPUError(cudaHostAlloc(pp, sizeofT*len, cudaHostAllocDefault)); return error_code; } From 3af46353a63dc44049802baddb241dc23e1b2ee4 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Thu, 16 Apr 2026 10:13:23 -0500 Subject: [PATCH 11/14] refactor: try making the code compatible with HIP < 7.0 --- src/colvar_gpu_support.h | 2 ++ src/cuda/colvaratoms_kernel.cu | 20 ++++++++++---------- src/cuda/colvartypes_kernel.cu | 18 +++++++++--------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/colvar_gpu_support.h b/src/colvar_gpu_support.h index 9faeb8f3e..73790af5a 100644 --- a/src/colvar_gpu_support.h +++ b/src/colvar_gpu_support.h @@ -13,10 +13,12 @@ #ifdef COLVARS_NVTX_PROFILING #include #endif +#define COLVARS_SYNC_WARP __syncwarp() #endif // defined(COLVARS_CUDA) #if defined(COLVARS_HIP) #include +#define COLVARS_SYNC_WARP #endif // defined(COLVARS_HIP) #if defined(COLVARS_HIP) diff --git a/src/cuda/colvaratoms_kernel.cu b/src/cuda/colvaratoms_kernel.cu index cfa30746e..950433139 100644 --- a/src/cuda/colvaratoms_kernel.cu +++ b/src/cuda/colvaratoms_kernel.cu @@ -665,18 +665,18 @@ __global__ void calc_fit_forces_impl_loop1_kernel( if (tid < valid_items) { dxdq_dqdC += rot_deriv->project_force_to_C_from_dxdqi(tid, sum_dxdq[tid]); } - __syncwarp(); + COLVARS_SYNC_WARP; using WarpReduce = cub::WarpReduce; __shared__ typename WarpReduce::TempStorage warp_temp_storage; - dxdq_dqdC.xx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xx, valid_items); __syncwarp(); - dxdq_dqdC.xy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xy, valid_items); __syncwarp(); - dxdq_dqdC.xz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xz, valid_items); __syncwarp(); - dxdq_dqdC.yx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yx, valid_items); __syncwarp(); - dxdq_dqdC.yy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yy, valid_items); __syncwarp(); - dxdq_dqdC.yz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yz, valid_items); __syncwarp(); - dxdq_dqdC.zx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zx, valid_items); __syncwarp(); - dxdq_dqdC.zy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zy, valid_items); __syncwarp(); - dxdq_dqdC.zz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zz, valid_items); __syncwarp(); + dxdq_dqdC.xx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xx, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.xy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xy, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.xz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.xz, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.yx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yx, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.yy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yy, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.yz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.yz, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.zx = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zx, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.zy = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zy, valid_items); COLVARS_SYNC_WARP; + dxdq_dqdC.zz = WarpReduce(warp_temp_storage).Sum(dxdq_dqdC.zz, valid_items); COLVARS_SYNC_WARP; if (tid == 0) { dxdC->xx = dxdq_dqdC.xx; dxdC->xy = dxdq_dqdC.xy; diff --git a/src/cuda/colvartypes_kernel.cu b/src/cuda/colvartypes_kernel.cu index 5a69ede9e..424a77acb 100644 --- a/src/cuda/colvartypes_kernel.cu +++ b/src/cuda/colvartypes_kernel.cu @@ -278,17 +278,17 @@ __global__ void jacobi_4x4_kernel( const double a_qq = A[q*4+q]; compute_c_s(a_pq, a_pp, a_qq, c, s, c2, s2, cs); } - __syncwarp(); + COLVARS_SYNC_WARP; if (idx == 0 && rotate) { apply_jacobi<0, 1>(A, c, s, c2, s2, cs); multiply_jacobi<0, 1>(V, c, s); } - __syncwarp(); + COLVARS_SYNC_WARP; if (idx == 1 && rotate) { apply_jacobi<2, 3>(A, c, s, c2, s2, cs); multiply_jacobi<2, 3>(V, c, s); } - __syncwarp(); + COLVARS_SYNC_WARP; rotate = false; p = p_ids[idx+2]; q = q_ids[idx+2]; @@ -299,17 +299,17 @@ __global__ void jacobi_4x4_kernel( const double a_qq = A[q*4+q]; compute_c_s(a_pq, a_pp, a_qq, c, s, c2, s2, cs); } - __syncwarp(); + COLVARS_SYNC_WARP; if (idx == 0 && rotate) { apply_jacobi<0, 2>(A, c, s, c2, s2, cs); multiply_jacobi<0, 2>(V, c, s); } - __syncwarp(); + COLVARS_SYNC_WARP; if (idx == 1 && rotate) { apply_jacobi<1, 3>(A, c, s, c2, s2, cs); multiply_jacobi<1, 3>(V, c, s); } - __syncwarp(); + COLVARS_SYNC_WARP; rotate = false; p = p_ids[idx+4]; q = q_ids[idx+4]; @@ -320,17 +320,17 @@ __global__ void jacobi_4x4_kernel( const double a_qq = A[q*4+q]; compute_c_s(a_pq, a_pp, a_qq, c, s, c2, s2, cs); } - __syncwarp(); + COLVARS_SYNC_WARP; if (idx == 0 && rotate) { apply_jacobi<0, 3>(A, c, s, c2, s2, cs); multiply_jacobi<0, 3>(V, c, s); } - __syncwarp(); + COLVARS_SYNC_WARP; if (idx == 1 && rotate) { apply_jacobi<1, 2>(A, c, s, c2, s2, cs); multiply_jacobi<1, 2>(V, c, s); } - __syncwarp(); + COLVARS_SYNC_WARP; off_diag_sum = fabs(A[0*4+1]) + fabs(A[0*4+2]) + fabs(A[0*4+3]) + fabs(A[1*4+2]) + fabs(A[1*4+3]) + From 82504e3622cd83d319eb7ce0ccc72cf87c8a47ab Mon Sep 17 00:00:00 2001 From: HanatoK Date: Thu, 16 Apr 2026 10:46:29 -0500 Subject: [PATCH 12/14] fix: use __threadfence() --- src/colvar_gpu_support.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/colvar_gpu_support.h b/src/colvar_gpu_support.h index 73790af5a..fd4de0e17 100644 --- a/src/colvar_gpu_support.h +++ b/src/colvar_gpu_support.h @@ -18,7 +18,7 @@ #if defined(COLVARS_HIP) #include -#define COLVARS_SYNC_WARP +#define COLVARS_SYNC_WARP __threadfence() #endif // defined(COLVARS_HIP) #if defined(COLVARS_HIP) From 49ab87aac07b977a280131fe9d2db0425df10b78 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 20 Apr 2026 09:38:28 -0500 Subject: [PATCH 13/14] build: refine the HIP macro Only use __syncwarp on HIP 7 and above. --- src/colvar_gpu_support.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/colvar_gpu_support.h b/src/colvar_gpu_support.h index fd4de0e17..1c476e17f 100644 --- a/src/colvar_gpu_support.h +++ b/src/colvar_gpu_support.h @@ -18,7 +18,17 @@ #if defined(COLVARS_HIP) #include -#define COLVARS_SYNC_WARP __threadfence() +#if defined(__HIP_PLATFORM_AMD__) + #if HIP_VERSION_MAJOR >= 7 + #define COLVARS_SYNC_WARP __syncwarp() + #else + #define COLVARS_SYNC_WARP __threadfence_block() + #endif +#elif defined(__HIP_PLATFORM_NVIDIA__) + #define COLVARS_SYNC_WARP __syncwarp() +#else + #error "Unknown HIP platform" +#endif #endif // defined(COLVARS_HIP) #if defined(COLVARS_HIP) From 5019e2e25f7055853cb76e60f168e48d9d4a1f93 Mon Sep 17 00:00:00 2001 From: HanatoK Date: Mon, 20 Apr 2026 09:39:14 -0500 Subject: [PATCH 14/14] cleanup: use constexpr for ID arrays in GPU Jacobi diagonlization --- src/cuda/colvartypes_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda/colvartypes_kernel.cu b/src/cuda/colvartypes_kernel.cu index 424a77acb..5b3e0224f 100644 --- a/src/cuda/colvartypes_kernel.cu +++ b/src/cuda/colvartypes_kernel.cu @@ -259,8 +259,8 @@ __global__ void jacobi_4x4_kernel( A[15] = A_in[15]; } __syncthreads(); - const int p_ids[] = {0, 2, 0, 1, 0, 1}; - const int q_ids[] = {1, 3, 2, 3, 3, 2}; + constexpr int p_ids[] = {0, 2, 0, 1, 0, 1}; + constexpr int q_ids[] = {1, 3, 2, 3, 3, 2}; double off_diag_sum = fabs(A[0*4+1]) + fabs(A[0*4+2]) + fabs(A[0*4+3]) + fabs(A[1*4+2]) + fabs(A[1*4+3]) +