Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
7aa727d
Add gpu kernel unit testing infra
pfultz2 Aug 1, 2025
c20742f
Format
pfultz2 Aug 1, 2025
b0d7a46
Add algorithm tests
pfultz2 Aug 4, 2025
ea12168
Format
pfultz2 Aug 4, 2025
ab3e73a
Apply fixits
pfultz2 Aug 4, 2025
99250c3
Format
pfultz2 Aug 4, 2025
bfd58be
Exclude test.hpp header
pfultz2 Aug 4, 2025
35fcec7
Report failures and stop on expect but not on check
pfultz2 Aug 25, 2025
e824891
Format
pfultz2 Aug 25, 2025
0fd319c
Add parallel compilation
pfultz2 Aug 29, 2025
be87e05
Add array tests
pfultz2 Aug 29, 2025
25b9bc0
Format
pfultz2 Aug 29, 2025
83729a7
Add shape tests
pfultz2 Aug 29, 2025
d943b73
Merge branch 'develop' into gpu-kernel-unit-tests
pfultz2 Aug 29, 2025
8995591
Update standard shape calculation
pfultz2 Aug 30, 2025
c9c886c
Format
pfultz2 Aug 30, 2025
f23916e
Merge branch 'develop' into gpu-kernel-unit-tests
pfultz2 Sep 4, 2025
57d61b5
Fix null
pfultz2 Sep 9, 2025
fdbad61
Fix tidy warnings
pfultz2 Sep 9, 2025
7884aaa
format
pfultz2 Sep 9, 2025
fe9d7d0
Merge
pfultz2 Oct 9, 2025
f89a512
Format
pfultz2 Oct 9, 2025
7a2584d
Add interface keyword
pfultz2 Oct 10, 2025
40ac7ff
Fix tidy issues
pfultz2 Nov 4, 2025
0f55819
Format
pfultz2 Nov 4, 2025
33103c8
Fix not lint line
pfultz2 Nov 4, 2025
8f4c407
Format
pfultz2 Nov 4, 2025
83c0df7
Fix cppcheck
pfultz2 Nov 4, 2025
5fa3b10
Merge branch 'develop' into gpu-kernel-unit-tests
pfultz2 Nov 4, 2025
f3e41c0
Revert non gpu changes
pfultz2 Nov 4, 2025
8ba30d8
Format
pfultz2 Nov 4, 2025
ade219d
Revert "Format"
pfultz2 Nov 4, 2025
82afa17
Remove basic tests
pfultz2 Nov 4, 2025
c3e5c2d
Add missing license
pfultz2 Nov 4, 2025
f9755c2
Format
pfultz2 Nov 4, 2025
d011eb5
Format
pfultz2 Nov 4, 2025
9b720e8
Fix errors with default constructor shared memory
pfultz2 Nov 4, 2025
64b797d
Format
pfultz2 Nov 4, 2025
2e2a0ea
Fix assertion
pfultz2 Nov 4, 2025
705777d
Add license
pfultz2 Nov 5, 2025
340d534
Format
pfultz2 Nov 5, 2025
53d1fe4
Format
pfultz2 Nov 5, 2025
06dc59f
Merge branch 'develop' into gpu-kernel-unit-tests
pfultz2 Nov 18, 2025
9a9aab1
Fix hiprtc error
pfultz2 Nov 18, 2025
99333f9
Add missing array header for windows
pfultz2 Nov 19, 2025
03e9075
Add missing array header for windows
pfultz2 Nov 19, 2025
7eb3bbe
Add more array headers
pfultz2 Nov 19, 2025
0fe0d44
Add array header
pfultz2 Nov 19, 2025
0951892
Add array header for windows
pfultz2 Nov 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ rocm_enable_cppcheck(
# Disable because of too many FPs
arithOperationsOnVoidPointer
definePrefix:*test/include/test.hpp
definePrefix:*src/targets/gpu/kernels/include/migraphx/kernels/test.hpp
UseNamedLogicOperator:*src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
ctuOneDefinitionRuleViolation:*test/*
useSmartPointer:*src/api/api.cpp
useSmartPointer:*make_shared_array.hpp
Expand Down
23 changes: 13 additions & 10 deletions src/targets/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,23 +108,26 @@ target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CUR
target_compile_options(migraphx_device PRIVATE -Wno-ignored-attributes)
migraphx_generate_export_header(migraphx_device DIRECTORY migraphx/gpu/device)

add_library(kernel_file_check EXCLUDE_FROM_ALL)
add_library(compile_migraphx_gpu_kernels INTERFACE)
target_compile_definitions(compile_migraphx_gpu_kernels INTERFACE -DMIGRAPHX_NLOCAL=256)
target_compile_definitions(compile_migraphx_gpu_kernels INTERFACE -DMIGRAPHX_WAVEFRONTSIZE=64)
target_include_directories(compile_migraphx_gpu_kernels INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
target_link_libraries(compile_migraphx_gpu_kernels INTERFACE compile_for_gpu)
if(MIGRAPHX_USE_COMPOSABLEKERNEL)
target_link_libraries(compile_migraphx_gpu_kernels INTERFACE composable_kernel::jit_library)
endif()

add_library(migraphx_gpu_kernel_file_check EXCLUDE_FROM_ALL)

foreach(KERNEL_FILE ${KERNEL_FILES})
get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
target_sources(migraphx_gpu_kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
endforeach()

target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_NLOCAL=256)
target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_WAVEFRONTSIZE=64)
target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
target_link_libraries(kernel_file_check compile_for_gpu)
if(MIGRAPHX_USE_COMPOSABLEKERNEL)
target_link_libraries(kernel_file_check composable_kernel::jit_library)
endif()
target_link_libraries(migraphx_gpu_kernel_file_check compile_migraphx_gpu_kernels)

rocm_clang_tidy_check(kernel_file_check)
rocm_clang_tidy_check(migraphx_gpu_kernel_file_check)

file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)

Expand Down
27 changes: 17 additions & 10 deletions src/targets/gpu/compile_hip_code_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,11 @@ std::size_t compute_block_size(const context& ctx, std::size_t n, std::size_t ma
return std::min(std::max(min_block_size, block_size), max_block_size);
}

operation
compile_hip_code_object(context& ctx, const std::string& content, hip_compile_options options)
std::vector<char>
compile_hip_raw(context& ctx, const std::string& content, hip_compile_options options)
{
assert(options.global > 0);
assert(options.local > 0);
assert(not options.inputs.empty());
assert(options.inputs.size() == options.virtual_inputs.size() or
options.virtual_inputs.empty());
std::vector<src_file> srcs = options.additional_src_files;
static auto kernels{::migraphx_kernels()};
std::transform(
Expand All @@ -185,9 +182,6 @@ compile_hip_code_object(context& ctx, const std::string& content, hip_compile_op
std::back_inserter(srcs),
[](const std::pair<std::string_view, std::string_view>& elem) { return src_file{elem}; });
srcs.emplace_back("main.cpp", content);
auto args_hpp =
generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
srcs.emplace_back("args.hpp", args_hpp);

if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
options.emplace_param("-fno-offload-uniform-block");
Expand All @@ -202,10 +196,23 @@ compile_hip_code_object(context& ctx, const std::string& content, hip_compile_op
options.params.insert(options.params.end(), warnings.begin(), warnings.end());
options.emplace_param("-ftemplate-backtrace-limit=0");
options.emplace_param("-Werror");
auto cos = compile_hip_src(srcs, options.params, get_device_name());
auto cos = compile_hip_src(srcs, options.params, ctx.get_current_device().get_device_name());
if(cos.size() != 1)
MIGRAPHX_THROW("No code object");
return code_object_op{value::binary{cos.front()},
return cos.front();
}

operation
compile_hip_code_object(context& ctx, const std::string& content, hip_compile_options options)
{
assert(not options.inputs.empty());
assert(options.inputs.size() == options.virtual_inputs.size() or
options.virtual_inputs.empty());
auto args_hpp =
generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
options.additional_src_files.emplace_back("args.hpp", args_hpp);

return code_object_op{value::binary{compile_hip_raw(ctx, content, options)},
options.kernel_name,
options.global,
options.local,
Expand Down
4 changes: 2 additions & 2 deletions src/targets/gpu/hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,11 @@ static std::vector<T> read_from_gpu(const void* x, std::size_t sz)
return result;
}

static std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false)
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host)
{
gpu_sync();
auto result = allocate_gpu(sz, host);
assert(is_device_ptr(result.get()));
assert(host or is_device_ptr(result.get()));
assert(not is_device_ptr(x));
auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
if(status != hipSuccess)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ struct hip_compile_options
MIGRAPHX_GPU_EXPORT std::function<std::size_t(std::size_t local)>
compute_global_for(const context& ctx, std::size_t n, std::size_t over = 1);

MIGRAPHX_GPU_EXPORT std::vector<char>
compile_hip_raw(context& ctx, const std::string& content, hip_compile_options options);

MIGRAPHX_GPU_EXPORT operation compile_hip_code_object(context& ctx,
const std::string& content,
hip_compile_options options);
Expand Down
9 changes: 9 additions & 0 deletions src/targets/gpu/include/migraphx/gpu/hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ MIGRAPHX_GPU_EXPORT argument get_preallocation(context& ctx, const std::string&

MIGRAPHX_GPU_EXPORT void gpu_fill(context& ctx, const argument& dst, int value = 0);

MIGRAPHX_GPU_EXPORT std::shared_ptr<void>
write_to_gpu(const void* x, std::size_t sz, bool host = false);

template <class T>
std::shared_ptr<T> write_to_gpu(const T& x, bool host = false)
{
return std::static_pointer_cast<T>(write_to_gpu(&x, sizeof(T), host));
}

struct hip_allocate
{
shape s;
Expand Down
2 changes: 2 additions & 0 deletions src/targets/gpu/include/migraphx/gpu/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ struct MIGRAPHX_GPU_EXPORT kernel
{
}

bool empty() const;

void launch(hipStream_t stream,
std::size_t global,
std::size_t local,
Expand Down
2 changes: 2 additions & 0 deletions src/targets/gpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ kernel::kernel(const char* image, const std::string& name) : impl(std::make_shar
MIGRAPHX_THROW("Failed to get function: " + name + ": " + hip_error(status));
}

bool kernel::empty() const { return impl == nullptr; }

static void launch_kernel(hipFunction_t fun,
hipStream_t stream,
std::size_t global,
Expand Down
17 changes: 17 additions & 0 deletions src/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ struct greater
}
};

template <class Iterator, class T>
constexpr void fill(Iterator first, Iterator last, const T& value)
{
for(; first != last; ++first)
*first = value;
}

template <class InputIt, class T, class BinaryOperation>
constexpr T accumulate(InputIt first, InputIt last, T init, BinaryOperation op)
{
Expand Down Expand Up @@ -96,6 +103,16 @@ constexpr OutputIt copy_if(InputIt first, InputIt last, OutputIt d_first, UnaryP
return d_first;
}

template <class Iterator, class OutputIterator, class UnaryOp>
constexpr OutputIterator
transform(Iterator first1, Iterator last1, OutputIterator out, UnaryOp unary_op)
{
for(; first1 != last1; ++out, ++first1)
*out = unary_op(*first1);

return out;
}

template <class Iterator, class Compare>
constexpr Iterator is_sorted_until(Iterator first, Iterator last, Compare comp)
{
Expand Down
19 changes: 9 additions & 10 deletions src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -28,6 +28,7 @@
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/algorithm.hpp>
#include <migraphx/kernels/debug.hpp>

namespace migraphx {
Expand Down Expand Up @@ -122,21 +123,20 @@ template <class T, index_int N>
struct array
{
using value_type = T;
T d[N];
T d[N] = {{}};

constexpr array() = default;

template <class... Ts,
MIGRAPHX_REQUIRES(sizeof...(Ts) == N and (is_convertible<Ts, T>{} and ...))>
constexpr array(Ts... xs) : d{xs...}
constexpr array(Ts... xs) : d{static_cast<value_type>(xs)...}
Copy link

Copilot AI Sep 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The explicit cast to value_type in the variadic constructor could cause silent truncation or precision loss. Consider using a concept or SFINAE to ensure safe conversions, or document the potential for data loss in conversions.

Copilot uses AI. Check for mistakes.
{
}

template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{} and (N > 1))>
constexpr explicit array(U x)
{
for(index_int i = 0; i < N; i++)
d[i] = x;
fill(this->begin(), this->end(), x);
}

constexpr T& operator[](index_int i)
Expand Down Expand Up @@ -195,8 +195,7 @@ struct array
constexpr auto apply(F f) const
{
array<decltype(f(d[0])), N> result;
for(index_int i = 0; i < N; i++)
result[i] = f(d[i]);
transform(this->begin(), this->end(), result.begin(), f);
return result;
}

Expand All @@ -214,9 +213,9 @@ struct array
MIGRAPHX_DEVICE_ARRAY_OP(*=, *)
MIGRAPHX_DEVICE_ARRAY_OP(/=, /)
MIGRAPHX_DEVICE_ARRAY_OP(%=, %)
MIGRAPHX_DEVICE_ARRAY_OP(&=, &)
MIGRAPHX_DEVICE_ARRAY_OP(|=, |)
MIGRAPHX_DEVICE_ARRAY_OP(^=, ^)
MIGRAPHX_DEVICE_ARRAY_OP(&=, &) // NOLINT(hicpp-signed-bitwise)
MIGRAPHX_DEVICE_ARRAY_OP(|=, |) // NOLINT(hicpp-signed-bitwise)
MIGRAPHX_DEVICE_ARRAY_OP(^=, ^) // NOLINT(hicpp-signed-bitwise)

friend constexpr bool operator==(const array& x, const array& y)
{
Expand Down
5 changes: 3 additions & 2 deletions src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/scatter_reduction_modes.hpp>
#include <migraphx/kernels/tuple.hpp>
#include <migraphx/kernels/uninitialized_buffer.hpp>
#include <migraphx/kernels/pp.hpp>

namespace migraphx {
Expand Down Expand Up @@ -199,7 +200,7 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
#endif
constexpr index_int lanes_per_thread = MIGRAPHX_WAVEFRONTSIZE;
using type = decltype(index::invoke_loop(f, 0, _c<0>));
__shared__ type buffer[idx.max_nlocal() / lanes_per_thread];
__shared__ uninitialized_buffer<type, decltype(idx.max_nlocal()){} / lanes_per_thread> buffer;
auto x = type(init);
idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
dpp_reduce(x, op);
Expand All @@ -224,7 +225,7 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
{
MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal());
using type = decltype(index::invoke_loop(f, 0, _c<0>));
__shared__ type buffer[idx.max_nlocal()];
__shared__ uninitialized_buffer<type, decltype(idx.max_nlocal()){}> buffer;
auto x = type(init);
idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
buffer[idx.local] = x;
Expand Down
62 changes: 53 additions & 9 deletions src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,60 @@ struct shape : equality_comparable<shape<Lens, Strides>>
}
constexpr auto skips() const
{
return return_c([] {
auto lstrides = Strides{};
return none_of(lstrides.begin(), lstrides.end(), [](auto x) { return x == 1; });
});
if constexpr(decltype(this->elements()){} == 1)
{
return false_type{};
}
else
{
return return_c([] {
auto lstrides = Strides{};
return none_of(lstrides.begin(), lstrides.end(), [](auto x) { return x == 1; });
});
}
}

constexpr auto standard() const { return packed() and not transposed(); }
constexpr auto standard() const
{
if constexpr(decltype(this->elements()){} == 1)
{
return true_type{};
}
else
{
return return_c([] {
constexpr auto n = decltype(this->elements()){};
struct state
{
bool ok = true;
index_int expected = decltype(n){};
};
auto reduce = [](state acc, array<index_int, 2> x) -> state {
index_int len = x[0];
index_int stride = x[1];
if(not acc.ok)
return acc;
if(len == 1)
return acc;
if(acc.expected % len != 0)
return {false};
acc.expected /= len;
if(stride != acc.expected)
return {false};
return acc;
};
auto ldims = Lens{};
auto lstrides = Strides{};
return inner_product(ldims.begin(),
ldims.end(),
lstrides.begin(),
state{},
reduce,
MIGRAPHX_LIFT(make_array))
.ok;
});
}
}

constexpr index_int index(index_array x) const { return x.dot(strides); }

Expand All @@ -85,10 +132,7 @@ struct shape : equality_comparable<shape<Lens, Strides>>
MIGRAPHX_ASSERT(i >= elements() or i == compute_index(i));
return i;
}
else
{
return compute_index(i);
}
return compute_index(i);
}

constexpr index_int compute_index(index_int i) const
Expand Down
Loading
Loading