diff --git a/experimental/builder/include/ck_tile/builder/testing/conv/args.hpp b/experimental/builder/include/ck_tile/builder/testing/conv/args.hpp
index eba6771964b..7568ce9fc20 100644
--- a/experimental/builder/include/ck_tile/builder/testing/conv/args.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/conv/args.hpp
@@ -3,12 +3,16 @@
 
 #pragma once
 
+#include <array>
+#include <optional>
+#include <stdexcept>
+
 #include "ck_tile/builder/conv_signature_concepts.hpp"
-#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
 #include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
 #include "ck_tile/builder/testing/testing.hpp"
 #include "ck_tile/builder/testing/filter_extent.hpp"
 #include "ck_tile/builder/testing/tensor_descriptor.hpp"
+#include "ck_tile/builder/testing/validation.hpp"
 #include "ck_tile/host/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -49,6 +53,99 @@ struct ConvTensorLengths
     FilterExtent<SPATIAL_DIM> filter = {}; // X, Y, Z
 };
 
+namespace detail {
+
+/// @brief Calculate memory strides for a tensor with custom dimension ordering.
+///
+/// Given tensor dimensions and a memory layout order, compute the stride
+/// (memory jump size) needed to move by 1 in each dimension.
+///
+/// @param lengths Tensor dimensions (e.g., {3, 4} for 3 rows × 4 columns)
+/// @param outer_to_inner Dimension ordering from outermost to innermost in memory
+/// @return Strides for each dimension (e.g., {4, 1} for row-major 3×4 tensor)
+///
+/// Example: For a 3×4 tensor stored row-major (outer_to_inner = {0, 1}):
+///   - Moving 1 row down requires jumping 4 positions → stride[0] = 4
+///   - Moving 1 column right requires jumping 1 position → stride[1] = 1
+template <size_t RANK>
+Extent<RANK> make_packed_strides_for_order(const Extent<RANK>& lengths,
+                                           const std::array<size_t, RANK>& outer_to_inner)
+{
+    Extent<RANK> strides = {};
+
+    size_t stride = 1; // Innermost dimension always has stride 1
+    for(size_t i = RANK; i > 0; --i)
+    {
+        const auto dim = outer_to_inner[i - 1]; // Get dimension at this position
+        strides[dim]   = stride;                // Assign current stride
+        stride *= lengths[dim];                 // Update stride for next (outer) dimension
+    }
+
+    return strides;
+}
+
+template <int SPATIAL_DIM>
+std::array<long_index_t, SPATIAL_DIM> to_spatial_array(const FilterExtent<SPATIAL_DIM>& extent)
+{
+    if constexpr(SPATIAL_DIM == 1)
+    {
+        return {static_cast<long_index_t>(extent.width)};
+    }
+    else if constexpr(SPATIAL_DIM == 2)
+    {
+        // CK Builder uses spatial ordering {H, W} for 2D.
+        return {static_cast<long_index_t>(extent.height), static_cast<long_index_t>(extent.width)};
+    }
+    else
+    {
+        // CK Builder uses spatial ordering {D, H, W} for 3D.
+        return {static_cast<long_index_t>(extent.depth),
+                static_cast<long_index_t>(extent.height),
+                static_cast<long_index_t>(extent.width)};
+    }
+}
+
+template <int SPATIAL_DIM>
+std::array<long_index_t, SPATIAL_DIM>
+compute_output_spatial(const std::array<long_index_t, SPATIAL_DIM>& input_spatial,
+                       const std::array<long_index_t, SPATIAL_DIM>& filter_spatial,
+                       const std::array<long_index_t, SPATIAL_DIM>& conv_strides,
+                       const std::array<long_index_t, SPATIAL_DIM>& conv_dilations,
+                       const std::array<long_index_t, SPATIAL_DIM>& left_pads,
+                       const std::array<long_index_t, SPATIAL_DIM>& right_pads)
+{
+    std::array<long_index_t, SPATIAL_DIM> output_spatial = {};
+
+    for(int i = 0; i < SPATIAL_DIM; ++i)
+    {
+        const auto in  = input_spatial[i];
+        const auto fil = filter_spatial[i];
+        const auto s   = conv_strides[i];
+        const auto d   = conv_dilations[i];
+        const auto pl  = left_pads[i];
+        const auto pr  = right_pads[i];
+
+        // effective_filter = dilation*(filter-1) + 1
+        const auto effective_filter = d * (fil - 1) + 1;
+        const auto numerator        = in + pl + pr - effective_filter;
+
+        if(s <= 0)
+        {
+            throw std::runtime_error("invalid convolution stride (must be > 0)");
+        }
+        if(numerator < 0)
+        {
+            throw std::runtime_error("invalid convolution parameters (negative output spatial)");
+        }
+
+        output_spatial[i] = numerator / s + 1;
+    }
+
+    return output_spatial;
+}
+
+} // namespace detail
+
 /// @brief `Args` specialization for forward convolution.
 ///
 /// @tparam SIGNATURE Forward convolution signature.
@@ -74,15 +171,20 @@ struct Args<SIGNATURE>
     // TODO: We shouldn't need to call into an internal namespace here.
     using Ops = factory::internal::ConvElementwiseOps<SIGNATURE>;
 
+    // NOTE: ConvTensorLayouts removed - not used in this file and causes compilation error
     // TODO: We shouldn't need to call into an internal namespace here.
-    using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE>;
+    // using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE>;
 
     ConvTensorLengths<SPATIAL_DIM> lengths;
 
-    // TODO: Tensor strides. This needs a new structure as well as some
-    // reworking of the make_*_descriptor() functions, as the current
-    // implementation (based on ConvParam in old CK / CK Tile) does not
-    // support strides at all.
+    // Optional explicit tensor-memory strides (in elements), for custom/non-packed tensors.
+    // When not set, packed strides are derived automatically from the selected TensorLayout.
+    // NOTE: These have explicit default initializers to avoid
+    // -Wmissing-designated-field-initializers when `Args` is aggregate-initialized
+    // using designated initializers in tests.
+    std::optional<typename InputDescriptor::Extent> input_strides   = std::nullopt;
+    std::optional<typename WeightDescriptor::Extent> weight_strides = std::nullopt;
+    std::optional<typename OutputDescriptor::Extent> output_strides = std::nullopt;
 
     FilterExtent<SPATIAL_DIM> filter_strides;
     FilterExtent<SPATIAL_DIM> filter_dilation;
@@ -95,23 +197,112 @@ struct Args<SIGNATURE>
 
     int k_batch = 1;
 
+    /// @brief Compute output spatial dimensions from convolution parameters.
+    ///
+    /// @returns FilterExtent with computed output height, width, (and depth for 3D)
+    FilterExtent<SPATIAL_DIM> compute_output_spatial() const
+    {
+        const auto input_spatial_arr  = detail::to_spatial_array<SPATIAL_DIM>(this->lengths.image);
+        const auto filter_spatial_arr = detail::to_spatial_array<SPATIAL_DIM>(this->lengths.filter);
+        const auto conv_strides_arr   = detail::to_spatial_array<SPATIAL_DIM>(this->filter_strides);
+        const auto conv_dilations_arr =
+            detail::to_spatial_array<SPATIAL_DIM>(this->filter_dilation);
+        const auto left_pads_arr  = detail::to_spatial_array<SPATIAL_DIM>(this->input_left_pad);
+        const auto right_pads_arr = detail::to_spatial_array<SPATIAL_DIM>(this->input_right_pad);
+
+        const auto output_spatial_arr =
+            detail::compute_output_spatial<SPATIAL_DIM>(input_spatial_arr,
+                                                        filter_spatial_arr,
+                                                        conv_strides_arr,
+                                                        conv_dilations_arr,
+                                                        left_pads_arr,
+                                                        right_pads_arr);
+
+        return filter_extent_from_vector<SPATIAL_DIM>(
+            std::vector<std::size_t>(output_spatial_arr.begin(), output_spatial_arr.end()));
+    }
+
     /// This function returns the `TensorDescriptor` corresponding to
     /// the input-tensor of the convolution problem. This can then
     /// be used to, for example, allocate memory.
     InputDescriptor make_input_descriptor() const
     {
-        // TODO: We're using old CK functionality to compute the right
-        // values here, mainly because CK tile does not support the
-        // right tensor layouts here. We should probably change that
-        // because CK currently prints an annoying message about it,
-        // plus that would let us get rid of the `to_ck_conv_param()`
-        // function.
-        const auto param = to_ck_conv_param();
-        const auto desc  = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
-             typename Layouts::InLayout>(param);
         using Extent = typename InputDescriptor::Extent;
-        return InputDescriptor(Extent::from_vector(desc.GetLengths()),
-                               Extent::from_vector(desc.GetStrides()));
+        Extent lens  = {};
+
+        lens[0] = this->lengths.groups;
+        lens[1] = this->lengths.batch_size;
+        lens[2] = this->lengths.input_channels;
+        if constexpr(SPATIAL_DIM == 1)
+        {
+            lens[3] = this->lengths.image.width;
+        }
+        else if constexpr(SPATIAL_DIM == 2)
+        {
+            lens[3] = this->lengths.image.height;
+            lens[4] = this->lengths.image.width;
+        }
+        else
+        {
+            lens[3] = this->lengths.image.depth;
+            lens[4] = this->lengths.image.height;
+            lens[5] = this->lengths.image.width;
+        }
+
+        const auto make_default_strides = [&] {
+            constexpr auto layout = SIGNATURE.input.config.layout;
+
+            if constexpr(SPATIAL_DIM == 1)
+            {
+                if constexpr(layout == TensorLayout::GNCW)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
+                else if constexpr(layout == TensorLayout::GNWC ||
+                                  layout == TensorLayout::G_NW_C_strided)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
+                else if constexpr(layout == TensorLayout::NWGC)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
+                else if constexpr(layout == TensorLayout::NGCW)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 1D input layout for descriptor initialization.");
+            }
+            else if constexpr(SPATIAL_DIM == 2)
+            {
+                if constexpr(layout == TensorLayout::GNCHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
+                else if constexpr(layout == TensorLayout::GNHWC ||
+                                  layout == TensorLayout::G_NHW_C_strided)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
+                else if constexpr(layout == TensorLayout::NHWGC)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
+                else if constexpr(layout == TensorLayout::NGCHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 2D input layout for descriptor initialization.");
+            }
+            else
+            {
+                if constexpr(layout == TensorLayout::GNCDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
+                else if constexpr(layout == TensorLayout::GNDHWC ||
+                                  layout == TensorLayout::G_NDHW_C_strided)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
+                else if constexpr(layout == TensorLayout::NDHWGC)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
+                else if constexpr(layout == TensorLayout::NGCDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 3D input layout for descriptor initialization.");
+            }
+        };
+
+        const Extent strides =
+            this->input_strides.has_value() ? *this->input_strides : make_default_strides();
+
+        return InputDescriptor(lens, strides);
     }
 
     /// This function returns the `TensorDescriptor` corresponding to
@@ -119,13 +310,76 @@ struct Args<SIGNATURE>
     /// be used to, for example, allocate memory.
     WeightDescriptor make_weight_descriptor() const
     {
-        // See note in implementation of `make_input_descriptor`.
-        const auto param = to_ck_conv_param();
-        const auto desc  = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
-             typename Layouts::WeiLayout>(param);
         using Extent = typename WeightDescriptor::Extent;
-        return WeightDescriptor(Extent::from_vector(desc.GetLengths()),
-                                Extent::from_vector(desc.GetStrides()));
+        Extent lens  = {};
+
+        lens[0] = this->lengths.groups;
+        lens[1] = this->lengths.output_channels;
+        lens[2] = this->lengths.input_channels;
+        if constexpr(SPATIAL_DIM == 1)
+        {
+            lens[3] = this->lengths.filter.width;
+        }
+        else if constexpr(SPATIAL_DIM == 2)
+        {
+            lens[3] = this->lengths.filter.height;
+            lens[4] = this->lengths.filter.width;
+        }
+        else
+        {
+            lens[3] = this->lengths.filter.depth;
+            lens[4] = this->lengths.filter.height;
+            lens[5] = this->lengths.filter.width;
+        }
+
+        const auto make_default_strides = [&] {
+            constexpr auto layout = SIGNATURE.weight.config.layout;
+
+            if constexpr(SPATIAL_DIM == 1)
+            {
+                if constexpr(layout == TensorLayout::GKCX)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
+                else if constexpr(layout == TensorLayout::GKXC ||
+                                  layout == TensorLayout::G_K_X_C_strided)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
+                else if constexpr(layout == TensorLayout::KXGC)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 1D weight layout for descriptor initialization.");
+            }
+            else if constexpr(SPATIAL_DIM == 2)
+            {
+                if constexpr(layout == TensorLayout::GKCYX)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
+                else if constexpr(layout == TensorLayout::GKYXC ||
+                                  layout == TensorLayout::G_K_YX_C_strided)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
+                else if constexpr(layout == TensorLayout::KYXGC)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 2D weight layout for descriptor initialization.");
+            }
+            else
+            {
+                if constexpr(layout == TensorLayout::GKCZYX)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
+                else if constexpr(layout == TensorLayout::GKZYXC ||
+                                  layout == TensorLayout::G_K_ZYX_C_strided)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
+                else if constexpr(layout == TensorLayout::KZYXGC)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 3D weight layout for descriptor initialization.");
+            }
+        };
+
+        const Extent strides =
+            this->weight_strides.has_value() ? *this->weight_strides : make_default_strides();
+
+        return WeightDescriptor(lens, strides);
     }
 
     /// This function returns the `TensorDescriptor` corresponding to
@@ -133,43 +387,84 @@ struct Args<SIGNATURE>
     /// be used to, for example, allocate memory.
     OutputDescriptor make_output_descriptor() const
     {
-        // See note in implementation of `make_input_descriptor`.
-        const auto param = to_ck_conv_param();
-        const auto desc  = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
-             typename Layouts::OutLayout>(param);
         using Extent = typename OutputDescriptor::Extent;
-        return OutputDescriptor(Extent::from_vector(desc.GetLengths()),
-                                Extent::from_vector(desc.GetStrides()));
-    }
+        Extent lens  = {};
+
+        const auto output_spatial = compute_output_spatial();
+
+        lens[0] = this->lengths.groups;
+        lens[1] = this->lengths.batch_size;
+        lens[2] = this->lengths.output_channels;
+        if constexpr(SPATIAL_DIM == 1)
+        {
+            lens[3] = output_spatial.width;
+        }
+        else if constexpr(SPATIAL_DIM == 2)
+        {
+            lens[3] = output_spatial.height;
+            lens[4] = output_spatial.width;
+        }
+        else
+        {
+            lens[3] = output_spatial.depth;
+            lens[4] = output_spatial.height;
+            lens[5] = output_spatial.width;
+        }
+
+        const auto make_default_strides = [&] {
+            constexpr auto layout = SIGNATURE.output.config.layout;
 
-    /// Convert the Args structure into a CK conv_param structure. This
-    /// function is mainly used to be able to use the existing
-    /// CK-functionality to obtain tensor descriptors.
-    ck::utils::conv::ConvParam to_ck_conv_param() const
-    {
-        const auto to_vector = [](const auto& extent) {
             if constexpr(SPATIAL_DIM == 1)
-                return std::vector<ck::index_t>{ck::index_t(extent.width)};
+            {
+                if constexpr(layout == TensorLayout::GNKW)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
+                else if constexpr(layout == TensorLayout::GNWK ||
+                                  layout == TensorLayout::G_NW_K_strided)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
+                else if constexpr(layout == TensorLayout::NWGK)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
+                else if constexpr(layout == TensorLayout::NGKW)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 1D output layout for descriptor initialization.");
+            }
             else if constexpr(SPATIAL_DIM == 2)
-                return std::vector<ck::index_t>{ck::index_t(extent.height),
-                                                ck::index_t(extent.width)};
+            {
+                if constexpr(layout == TensorLayout::GNKHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
+                else if constexpr(layout == TensorLayout::GNHWK ||
+                                  layout == TensorLayout::G_NHW_K_strided)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
+                else if constexpr(layout == TensorLayout::NHWGK)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
+                else if constexpr(layout == TensorLayout::NGKHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 2D output layout for descriptor initialization.");
+            }
             else
-                return std::vector<ck::index_t>{ck::index_t(extent.depth),
-                                                ck::index_t(extent.height),
-                                                ck::index_t(extent.width)};
+            {
+                if constexpr(layout == TensorLayout::GNKDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
+                else if constexpr(layout == TensorLayout::GNDHWK ||
+                                  layout == TensorLayout::G_NDHW_K_strided)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
+                else if constexpr(layout == TensorLayout::NDHWGK)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
+                else if constexpr(layout == TensorLayout::NGKDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 3D output layout for descriptor initialization.");
+            }
         };
 
-        return ck::utils::conv::ConvParam(SPATIAL_DIM,
-                                          this->lengths.groups,
-                                          this->lengths.batch_size,
-                                          this->lengths.output_channels,
-                                          this->lengths.input_channels,
-                                          to_vector(this->lengths.filter),
-                                          to_vector(this->lengths.image),
-                                          to_vector(this->filter_strides),
-                                          to_vector(this->filter_dilation),
-                                          to_vector(this->input_left_pad),
-                                          to_vector(this->input_right_pad));
+        const Extent strides =
+            this->output_strides.has_value() ? *this->output_strides : make_default_strides();
+
+        return OutputDescriptor(lens, strides);
     }
 
     /// Convert the Args structure into a CK Tile conv_param structure.
diff --git a/experimental/builder/include/ck_tile/builder/testing/conv/fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv/fwd_ck.hpp
index 5eca79508c1..05302d9e3d3 100644
--- a/experimental/builder/include/ck_tile/builder/testing/conv/fwd_ck.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/conv/fwd_ck.hpp
@@ -103,24 +103,12 @@ template <auto SIGNATURE>
 {
     constexpr auto spatial_dim = SIGNATURE.spatial_dim;
 
-    const auto copy = [](const auto& src, auto& dst) {
-        std::copy(src.begin(), src.end(), dst.begin());
-    };
-
     const auto to_ck_lengths = [&](const auto& src) {
         std::array<ck::index_t, spatial_dim + 3> result;
-        copy(src, result);
-        return result;
-    };
-
-    const auto to_ck_extent = [&](const auto& extent) {
-        std::array<ck::index_t, spatial_dim> result;
-        copy(extent, result);
+        std::copy(src.begin(), src.end(), result.begin());
         return result;
     };
 
-    const auto param = args.to_ck_conv_param();
-
     const auto input_desc  = args.make_input_descriptor();
     const auto weight_desc = args.make_weight_descriptor();
     const auto output_desc = args.make_output_descriptor();
@@ -140,10 +128,10 @@ template <auto SIGNATURE>
                                      {},
                                      to_ck_lengths(output_desc.get_lengths()),
                                      to_ck_lengths(output_desc.get_strides()),
-                                     to_ck_extent(param.conv_filter_strides_),
-                                     to_ck_extent(param.conv_filter_dilations_),
-                                     to_ck_extent(param.input_left_pads_),
-                                     to_ck_extent(param.input_right_pads_),
+                                     args.filter_strides.to_array<ck::index_t>(),
+                                     args.filter_dilation.to_array<ck::index_t>(),
+                                     args.input_left_pad.to_array<ck::index_t>(),
+                                     args.input_right_pad.to_array<ck::index_t>(),
                                      args.a_elementwise_op,
                                      args.b_elementwise_op,
                                      args.cde_elementwise_op);
diff --git a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp
index 2fc1f390127..89eea6142f4 100644
--- a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp
@@ -17,6 +17,12 @@ template <>
 struct FilterExtent<1>
 {
     size_t width = 1;
+
+    template <typename T>
+    std::array<T, 1> to_array() const
+    {
+        return {static_cast<T>(width)};
+    }
 };
 
 template <>
@@ -24,6 +30,12 @@ struct FilterExtent<2>
 {
     size_t width  = 1;
     size_t height = 1;
+
+    template <typename T>
+    std::array<T, 2> to_array() const
+    {
+        return {static_cast<T>(height), static_cast<T>(width)};
+    }
 };
 
 template <>
@@ -32,6 +44,12 @@ struct FilterExtent<3>
     size_t width  = 1;
     size_t height = 1;
     size_t depth  = 1;
+
+    template <typename T>
+    std::array<T, 3> to_array() const
+    {
+        return {static_cast<T>(depth), static_cast<T>(height), static_cast<T>(width)};
+    }
 };
 
 template <int SPATIAL_DIM>
diff --git a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
index 2c35fb50761..038aa8e5a6d 100644
--- a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
+++ b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
@@ -3,33 +3,46 @@
 
 #include "utils/ckb_conv_tile_test_configs.hpp"
 #include "utils/ckb_conv_test_utils.hpp"
+#include "ck_tile/builder/testing/conv/ck_tile.hpp"
+#include "ck_tile/builder/testing/conv/reference.hpp"
+#include "ck_tile/host/device_prop.hpp"
+#include "testing_utils.hpp"
 
-namespace {
+namespace ckb = ck_tile::builder;
+namespace ckt = ck_tile::builder::test;
+namespace cku = ck_tile::builder::test_utils;
 
-using namespace ck_tile::builder::test_utils;
+using ck_tile::test::MatchesReference;
 
-TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP16_NHWGC)
+constexpr auto SIGNATURE =
+    ckt::ConvSignature{.spatial_dim            = 2,
+                       .direction              = ckb::ConvDirection::FORWARD,
+                       .data_type              = ckb::DataType::FP16,
+                       .accumulation_data_type = ckb::DataType::FP32,
+                       .input                  = {.config = {.layout = ckb::TensorLayout::NHWGC}},
+                       .weight                 = {.config = {.layout = ckb::TensorLayout::GKYXC}},
+                       .output                 = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
+
+constexpr auto TILE_ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{}
+                                    .with_tile_specializations(ckb::TileConvSpecialization::DEFAULT)
+                                    .with_tile_thread_block(cku::TileThreadBlock_64x64x64)
+                                    .with_tile_block_gemm(cku::TileBlockGemmDesc_16x16_v3_intrawave)
+                                    .with_tile_transfer(cku::TileTransfer_4x4x4)
+                                    .with_tile_optimizations(ckt::TileOptimizations{
+                                        .num_groups_to_merge = 1,
+                                        .split_image         = false,
+                                        .explicit_gemm       = false,
+                                    });
+
+using Builder   = ckb::ConvBuilder<SIGNATURE, TILE_ALGORITHM>;
+using TileConv  = Builder::Instance;
+using Reference = ckb::ConvBuilder<SIGNATURE, ckt::ConvAlgorithm_Reference{}>::Instance;
+
+TEST(Fwd2DFp16_TileV3_NHWGC, Create)
 {
-    constexpr ConvSignature FwdConvSignature{.spatial_dim            = 2,
-                                             .direction              = ConvDirection::FORWARD,
-                                             .data_type              = DataType::FP16,
-                                             .accumulation_data_type = DataType::FP32,
-                                             .input  = {.config = {.layout = TensorLayout::NHWGC}},
-                                             .weight = {.config = {.layout = TensorLayout::GKYXC}},
-                                             .output = {.config = {.layout = TensorLayout::NHWGK}}};
-
-    constexpr auto FwdConvAlgorithm =
-        ConvAlgorithm_Tile_GroupedConvolutionKernel{}
-            .with_tile_specializations(TileConvSpecialization::DEFAULT)
-            .with_tile_thread_block(TileThreadBlock_64x64x64)
-            .with_tile_block_gemm(TileBlockGemmDesc_16x16_v3_intrawave)
-            .with_tile_transfer(TileTransfer_4x4x4)
-            .with_tile_optimizations(TileOptimizations{
-                .num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false});
-
-    using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
-    run_ck_tile_test<Builder>({
-        "grouped_convolution_forward",
+    const auto expected_type_string = "grouped_convolution_forward";
+    cku::run_ck_tile_test<Builder>({
+        expected_type_string,
         "fp16",
         "NHWGC_GKYXC_NHWGK",
         "64x64x64",
@@ -48,4 +61,51 @@ TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP1
     });
 }
 
-} // namespace
+TEST(Fwd2DFp16_TileV3_NHWGC, EndToEnd)
+{
+    if(!ck_tile::get_device_name().starts_with("gfx9"))
+    {
+        GTEST_SKIP() << "unsupported architecture";
+    }
+
+    ckt::Args<SIGNATURE> args = {
+        .lengths =
+            {
+                .batch_size      = 16,
+                .groups          = 1,
+                .input_channels  = 32,
+                .output_channels = 48,
+                .image =
+                    {
+                        .width  = 56,
+                        .height = 64,
+                    },
+                .filter =
+                    {
+                        .width  = 3,
+                        .height = 5,
+                    },
+            },
+        .filter_strides     = {.width = 1, .height = 1},
+        .filter_dilation    = {.width = 1, .height = 1},
+        .input_left_pad     = {.width = 0, .height = 0},
+        .input_right_pad    = {.width = 0, .height = 0},
+        .a_elementwise_op   = {},
+        .b_elementwise_op   = {},
+        .cde_elementwise_op = {},
+    };
+
+    auto inputs    = ckt::alloc_inputs(args);
+    auto outputs   = ckt::alloc_outputs(args);
+    auto reference = ckt::alloc_outputs(args);
+
+    ckt::init_inputs(args, inputs.get());
+
+    auto tile_conv = TileConv{};
+    ckt::run(tile_conv, args, inputs.get(), outputs.get());
+
+    auto ref_conv = Reference{};
+    ckt::run(ref_conv, args, inputs.get(), reference.get());
+
+    EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get()));
+}