From 78e95cd989984f97309e688402b1314e0ef5dbb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 29 Oct 2025 20:18:12 +0100 Subject: [PATCH 01/15] mm --- src/ATen/native/xpu/#Repro.cpp# | 141 ++++++++++++++++++++++++ src/ATen/native/xpu/Repro.cpp | 133 ++++++++++++++++++++++ src/ATen/native/xpu/Repro.cpp~ | 128 +++++++++++++++++++++ src/ATen/native/xpu/repro.sycl | 177 ++++++++++++++++++++++++++++++ yaml/native/native_functions.yaml | 6 + 5 files changed, 585 insertions(+) create mode 100644 src/ATen/native/xpu/#Repro.cpp# create mode 100644 src/ATen/native/xpu/Repro.cpp create mode 100644 src/ATen/native/xpu/Repro.cpp~ create mode 100644 src/ATen/native/xpu/repro.sycl diff --git a/src/ATen/native/xpu/#Repro.cpp# b/src/ATen/native/xpu/#Repro.cpp# new file mode 100644 index 0000000000..fdc17d1ee7 --- /dev/null +++ b/src/ATen/native/xpu/#Repro.cpp# @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +//#include +#include +#include + +// namespace qr_issue { + +namespace at { +namespace native { + + +std::tuple local_qr( + const at::Tensor& a, + std::optional mode_) { + // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << + // std::endl; + + std::string mode = std::string(mode_.value_or("reduced")); + + TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); + at::Tensor a_contig = a.contiguous(); + at::Tensor result_r = at::clone(a_contig); + // at::Tensor result_r = at::empty_like(a_contig); + at::Tensor result_c = at::empty_like(a_contig); + at::Tensor result = at::empty_like(a_contig); + + auto dimensions = a.sizes(); + + std::cout << "dim " << dimensions << std::endl; + + std::cout << result_r << std::endl; + result_r = result_r.transpose(-2, -1).contiguous(); + std::cout << result_r << std::endl; + int numel = a_contig.numel(); + int range = a_contig.dim(); + int64_t n = a_contig.sizes().at(range - 2); + int64_t m = a_contig.sizes().at(range - 1); + int64_t mn = int64_t(m * n); + int64_t b = numel / mn; + + int out_q_columns = m > n ? n : m; + if (n > m && mode == "complete") { + out_q_columns = n; + } + + std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns + << std::endl; + // at::Tensor result_q = result_r.clone(); + // dimensions[1]=out_q_columns; + std::vector v(dimensions.begin(), dimensions.end()); + v[range - 1] = v[range - 2]; + v[range - 2] = out_q_columns; + auto ndimensions = at::IntArrayRef(v); + at::Tensor result_q = at::empty(ndimensions); + + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); + int64_t mn1 = + oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); + int64_t mn2 = + oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); + mn2 = mn1 > mn2 ? mn1 : mn2; + float* sbuffer = sycl::malloc_device(mn2, queue); + float* tau_buf = sycl::malloc_device(out_q_columns, queue); + float* r_buf = result_r.data_ptr(); + float* q_buf = result_q.data_ptr(); + + // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == + // "complete") << std::endl; + + for (int batch_item = 0; batch_item < b; batch_item++) { + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); + + if (mode != "r") { + // copy relevant part of R matrix to Q matrix + int copy_columns = out_q_columns > m ? m : out_q_columns; + queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + + oneapi::mkl::lapack::orgqr( + queue, + n, + out_q_columns, + out_q_columns, + q_buf, + n, + tau_buf, + sbuffer, + mn2); + std::cout << "done" << std::endl; + + sycl::free(sbuffer, queue); + std::cout << "done2" << std::endl; + } + + r_buf += mn; + q_buf += n * out_q_columns; + + } // batch + + if (mode == "r") { + result_q = at::empty({0, 0}); + } + + if ((mode == "reduced" || mode == "r") && n > m) { + result_r = + result_r + .index( + {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) + .contiguous(); + } + + // result_q.transpose(0,1); + return std::make_tuple( + result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); + //} +} + // Defines the operators + // TORCH_LIBRARY(qr_issue, m) { + // m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); + // } + + // ================================================== + // Register SYCL Implementations to Torch Library + // ================================================== + // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { + // m.impl("local_qr", &_qr_xpu); + // } + + // namespace qr_issue +}} + + +//cd torch-xpu-ops +//git checkout -B dev_wiktor +//git add . +//git commit -m "jakas nazwa" +//git push + diff --git a/src/ATen/native/xpu/Repro.cpp b/src/ATen/native/xpu/Repro.cpp new file mode 100644 index 0000000000..5653b76295 --- /dev/null +++ b/src/ATen/native/xpu/Repro.cpp @@ -0,0 +1,133 @@ +#include +#include +#include +#include +#include +//#include +#include +#include + +// namespace qr_issue { + +namespace at { +namespace native { + +std::tuple local_qr( + const at::Tensor& a, + std::optional mode_) { + // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << + // std::endl; + + std::string mode = std::string(mode_.value_or("reduced")); + + TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); + at::Tensor a_contig = a.contiguous(); + at::Tensor result_r = at::clone(a_contig); + // at::Tensor result_r = at::empty_like(a_contig); + at::Tensor result_c = at::empty_like(a_contig); + at::Tensor result = at::empty_like(a_contig); + + auto dimensions = a.sizes(); + + std::cout << "dim " << dimensions << std::endl; + + std::cout << result_r << std::endl; + result_r = result_r.transpose(-2, -1).contiguous(); + std::cout << result_r << std::endl; + int numel = a_contig.numel(); + int range = a_contig.dim(); + int64_t n = a_contig.sizes().at(range - 2); + int64_t m = a_contig.sizes().at(range - 1); + int64_t mn = int64_t(m * n); + int64_t b = numel / mn; + + int out_q_columns = m > n ? n : m; + if (n > m && mode == "complete") { + out_q_columns = n; + } + + std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns + << std::endl; + // at::Tensor result_q = result_r.clone(); + // dimensions[1]=out_q_columns; + std::vector v(dimensions.begin(), dimensions.end()); + v[range - 1] = v[range - 2]; + v[range - 2] = out_q_columns; + auto ndimensions = at::IntArrayRef(v); + at::Tensor result_q = at::empty(ndimensions); + + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); + int64_t mn1 = + oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); + int64_t mn2 = + oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); + mn2 = mn1 > mn2 ? mn1 : mn2; + float* sbuffer = sycl::malloc_device(mn2, queue); + float* tau_buf = sycl::malloc_device(out_q_columns, queue); + float* r_buf = result_r.data_ptr(); + float* q_buf = result_q.data_ptr(); + + // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == + // "complete") << std::endl; + + for (int batch_item = 0; batch_item < b; batch_item++) { + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); + + if (mode != "r") { + // copy relevant part of R matrix to Q matrix + int copy_columns = out_q_columns > m ? m : out_q_columns; + queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + + oneapi::mkl::lapack::orgqr( + queue, + n, + out_q_columns, + out_q_columns, + q_buf, + n, + tau_buf, + sbuffer, + mn2); + std::cout << "done" << std::endl; + + sycl::free(sbuffer, queue); + std::cout << "done2" << std::endl; + } + + r_buf += mn; + q_buf += n * out_q_columns; + + } // batch + + if (mode == "r") { + result_q = at::empty({0, 0}); + } + + if ((mode == "reduced" || mode == "r") && n > m) { + result_r = + result_r + .index( + {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) + .contiguous(); + } + + // result_q.transpose(0,1); + return std::make_tuple( + result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); + //} +} +// Defines the operators +// TORCH_LIBRARY(qr_issue, m) { +// m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); +// } + +// ================================================== +// Register SYCL Implementations to Torch Library +// ================================================== +// TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { +// m.impl("local_qr", &_qr_xpu); +// } + +// namespace qr_issue +} // namespace native +} // namespace at diff --git a/src/ATen/native/xpu/Repro.cpp~ b/src/ATen/native/xpu/Repro.cpp~ new file mode 100644 index 0000000000..a714102800 --- /dev/null +++ b/src/ATen/native/xpu/Repro.cpp~ @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include +//#include +#include +#include + +// namespace qr_issue { + +std::tuple local_qr( + const at::Tensor& a, + std::optional mode_) { + // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << + // std::endl; + + std::string mode = std::string(mode_.value_or("reduced")); + + TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); + at::Tensor a_contig = a.contiguous(); + at::Tensor result_r = at::clone(a_contig); + // at::Tensor result_r = at::empty_like(a_contig); + at::Tensor result_c = at::empty_like(a_contig); + at::Tensor result = at::empty_like(a_contig); + + auto dimensions = a.sizes(); + + std::cout << "dim " << dimensions << std::endl; + + std::cout << result_r << std::endl; + result_r = result_r.transpose(-2, -1).contiguous(); + std::cout << result_r << std::endl; + int numel = a_contig.numel(); + int range = a_contig.dim(); + int64_t n = a_contig.sizes().at(range - 2); + int64_t m = a_contig.sizes().at(range - 1); + int64_t mn = int64_t(m * n); + int64_t b = numel / mn; + + int out_q_columns = m > n ? n : m; + if (n > m && mode == "complete") { + out_q_columns = n; + } + + std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns + << std::endl; + // at::Tensor result_q = result_r.clone(); + // dimensions[1]=out_q_columns; + std::vector v(dimensions.begin(), dimensions.end()); + v[range - 1] = v[range - 2]; + v[range - 2] = out_q_columns; + auto ndimensions = at::IntArrayRef(v); + at::Tensor result_q = at::empty(ndimensions); + + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); + int64_t mn1 = + oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); + int64_t mn2 = + oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); + mn2 = mn1 > mn2 ? mn1 : mn2; + float* sbuffer = sycl::malloc_device(mn2, queue); + float* tau_buf = sycl::malloc_device(out_q_columns, queue); + float* r_buf = result_r.data_ptr(); + float* q_buf = result_q.data_ptr(); + + // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == + // "complete") << std::endl; + + for (int batch_item = 0; batch_item < b; batch_item++) { + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); + + if (mode != "r") { + // copy relevant part of R matrix to Q matrix + int copy_columns = out_q_columns > m ? m : out_q_columns; + queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + + oneapi::mkl::lapack::orgqr( + queue, + n, + out_q_columns, + out_q_columns, + q_buf, + n, + tau_buf, + sbuffer, + mn2); + std::cout << "done" << std::endl; + + sycl::free(sbuffer, queue); + std::cout << "done2" << std::endl; + } + + r_buf += mn; + q_buf += n * out_q_columns; + + } // batch + + if (mode == "r") { + result_q = at::empty({0, 0}); + } + + if ((mode == "reduced" || mode == "r") && n > m) { + result_r = + result_r + .index( + {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) + .contiguous(); + } + + // result_q.transpose(0,1); + return std::make_tuple( + result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); + //} + + // Defines the operators + // TORCH_LIBRARY(qr_issue, m) { + // m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); + // } + + // ================================================== + // Register SYCL Implementations to Torch Library + // ================================================== + // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { + // m.impl("local_qr", &_qr_xpu); + // } + +} // namespace qr_issue diff --git a/src/ATen/native/xpu/repro.sycl b/src/ATen/native/xpu/repro.sycl new file mode 100644 index 0000000000..63f3247740 --- /dev/null +++ b/src/ATen/native/xpu/repro.sycl @@ -0,0 +1,177 @@ +#include +#include +#include +#include +#include +//#include +#include +#include + +namespace qr_issue { + + std::tuple _qr_xpu(const at::Tensor& a,std::optional mode_) { + + + //std::cout << "Call mode is " << " " << mode_.value_or("compressed") << std::endl; + + std::string mode = std::string(mode_.value_or("reduced")); + + TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); + at::Tensor a_contig = a.contiguous(); + at::Tensor result_r = at::clone(a_contig); + // at::Tensor result_r = at::empty_like(a_contig); + at::Tensor result_c = at::empty_like(a_contig); + at::Tensor result = at::empty_like(a_contig); + + auto dimensions = a.sizes(); + + std::cout << "dim " << dimensions << std::endl; + + + + std::cout << result_r << std::endl; + result_r=result_r.transpose(-2,-1).contiguous(); + std::cout << result_r << std::endl; + int numel = a_contig.numel(); + int range = a_contig.dim(); + int64_t n = a_contig.sizes().at(range-2); + int64_t m = a_contig.sizes().at(range-1); + int64_t mn = int64_t(m*n); + int64_t b = numel/mn; + + + + + int out_q_columns = m > n ? n : m; + if (n>m && mode=="complete") { + out_q_columns=n; + } + + std::cout << "dim2 " << n << " " << m << " " << b << " " << + out_q_columns << std::endl; + //at::Tensor result_q = result_r.clone(); + //dimensions[1]=out_q_columns; + std::vector v(dimensions.begin(),dimensions.end()); + v[range-1]=v[range-2]; + v[range-2]=out_q_columns; + auto ndimensions = at::IntArrayRef(v); + at::Tensor result_q=at::empty(ndimensions); + + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); + int64_t mn1 = oneapi::mkl::lapack::geqrf_scratchpad_size(queue,n,m,n); + int64_t mn2 = oneapi::mkl::lapack::orgqr_scratchpad_size(queue,n,m,m,n); + mn2 = mn1>mn2 ? mn1 : mn2; + float* sbuffer = sycl::malloc_device(mn2,queue); + float* tau_buf = sycl::malloc_device(out_q_columns,queue); + float* r_buf = result_r.data_ptr(); + float* q_buf = result_q.data_ptr(); + + //std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == "complete") << std::endl; + + + for (int batch_item=0; batch_itemm ? m : out_q_columns; + queue.memcpy(q_buf,r_buf,n*copy_columns*sizeof(float)).wait(); + + + oneapi::mkl::lapack::orgqr(queue,n,out_q_columns,out_q_columns,q_buf,n,tau_buf,sbuffer,mn2); + std::cout << "done" << std::endl; + + sycl::free(sbuffer,queue); + std::cout << "done2" << std::endl; + + } + + r_buf+=mn; + q_buf+=n*out_q_columns; + + } // batch + + if (mode =="r") { + result_q=at::empty({0,0}); + } + + if ((mode=="reduced" || mode =="r" ) && n>m) { + result_r = result_r.index({"...",at::indexing::Slice(0,n),at::indexing::Slice(0,m)}).contiguous(); + } + + //result_q.transpose(0,1); + return std::make_tuple(result_q.transpose(-2,-1),result_r.transpose(-2,-1).triu_()); + + } + + + +// Defines the operators +TORCH_LIBRARY(qr_issue, m) { + m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); +} + +// ================================================== +// Register SYCL Implementations to Torch Library +// ================================================== +TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { + m.impl("local_qr",&_qr_xpu); + } + +} // namespace sycl_extension + + + +//#include + +// original code, which compiles, but failed to link. + +//#include +//#include +//#include +//#include + +//#include + +/* #include */ +/* #include */ + +/* using namespace sycl; */ + +/* int main() */ +/* { */ +/* queue q; */ + + + + +/* int numel = 25; */ +/* int64_t n = 5; */ +/* int64_t m = 5; */ +/* int64_t mn=m*n; */ + +/* std::cout << " dziala " << n << " " << m << " " << mn << std::endl; */ +/* // at::Tensor tau = at::empty({1,n}); */ + +/* int64_t mn2 = oneapi::mkl::lapack::geqrf_scratchpad_size(q,n,m,n); */ +/* int64_t mn3 = oneapi::mkl::lapack::ormqr_scratchpad_size(q,oneapi::mkl::side::left,oneapi::mkl::transpose::trans,n,m,n,n,n); */ + +/* sycl::buffer sbuffer(mn2); */ +/* sycl::buffer res_q_ptr(numel); */ +/* sycl::buffer tau_ptr(numel); */ +/* sycl::buffer c_ptr(numel); */ + + +/* std::cout << " dziala " << n << " " << m << " " << mn << " " << mn2 << " " << mn3 << std::endl; */ +/* oneapi::mkl::lapack::geqrf(q,n,m,res_q_ptr,n,tau_ptr,sbuffer,mn2); */ +/* sycl::buffer sbuffer2(mn3); */ +/* oneapi::mkl::lapack::ormqr(q,oneapi::mkl::side::right,oneapi::mkl::transpose::nontrans,n,m,n,res_q_ptr,n,tau_ptr,c_ptr,n,sbuffer2,mn3); */ +/* std::cout << "done" << std::endl; */ + +/* return 0; */ +/* } */ diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index a3281791de..f11752f456 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -9443,6 +9443,12 @@ - func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor python_module: linalg +- func: local_qr(Tensor a, str? mode_=None) -> (Tensor Q, Tensor R) + python_module: linalg + structured: False + dispatch: + XPU: local_qr_xpu + - func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info) python_module: linalg structured_delegate: linalg_inv_ex.inverse From 60b868bda9c1dc4bccf3a9034fbe85caa271f451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 29 Oct 2025 20:29:56 +0100 Subject: [PATCH 02/15] remove duplicates --- src/ATen/native/xpu/#Repro.cpp# | 141 ------------------------- src/ATen/native/xpu/Repro.cpp~ | 128 ----------------------- src/ATen/native/xpu/repro.sycl | 177 -------------------------------- 3 files changed, 446 deletions(-) delete mode 100644 src/ATen/native/xpu/#Repro.cpp# delete mode 100644 src/ATen/native/xpu/Repro.cpp~ delete mode 100644 src/ATen/native/xpu/repro.sycl diff --git a/src/ATen/native/xpu/#Repro.cpp# b/src/ATen/native/xpu/#Repro.cpp# deleted file mode 100644 index fdc17d1ee7..0000000000 --- a/src/ATen/native/xpu/#Repro.cpp# +++ /dev/null @@ -1,141 +0,0 @@ -#include -#include -#include -#include -#include -//#include -#include -#include - -// namespace qr_issue { - -namespace at { -namespace native { - - -std::tuple local_qr( - const at::Tensor& a, - std::optional mode_) { - // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << - // std::endl; - - std::string mode = std::string(mode_.value_or("reduced")); - - TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); - at::Tensor a_contig = a.contiguous(); - at::Tensor result_r = at::clone(a_contig); - // at::Tensor result_r = at::empty_like(a_contig); - at::Tensor result_c = at::empty_like(a_contig); - at::Tensor result = at::empty_like(a_contig); - - auto dimensions = a.sizes(); - - std::cout << "dim " << dimensions << std::endl; - - std::cout << result_r << std::endl; - result_r = result_r.transpose(-2, -1).contiguous(); - std::cout << result_r << std::endl; - int numel = a_contig.numel(); - int range = a_contig.dim(); - int64_t n = a_contig.sizes().at(range - 2); - int64_t m = a_contig.sizes().at(range - 1); - int64_t mn = int64_t(m * n); - int64_t b = numel / mn; - - int out_q_columns = m > n ? n : m; - if (n > m && mode == "complete") { - out_q_columns = n; - } - - std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns - << std::endl; - // at::Tensor result_q = result_r.clone(); - // dimensions[1]=out_q_columns; - std::vector v(dimensions.begin(), dimensions.end()); - v[range - 1] = v[range - 2]; - v[range - 2] = out_q_columns; - auto ndimensions = at::IntArrayRef(v); - at::Tensor result_q = at::empty(ndimensions); - - sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); - int64_t mn1 = - oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); - int64_t mn2 = - oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); - mn2 = mn1 > mn2 ? mn1 : mn2; - float* sbuffer = sycl::malloc_device(mn2, queue); - float* tau_buf = sycl::malloc_device(out_q_columns, queue); - float* r_buf = result_r.data_ptr(); - float* q_buf = result_q.data_ptr(); - - // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == - // "complete") << std::endl; - - for (int batch_item = 0; batch_item < b; batch_item++) { - oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); - - if (mode != "r") { - // copy relevant part of R matrix to Q matrix - int copy_columns = out_q_columns > m ? m : out_q_columns; - queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); - - oneapi::mkl::lapack::orgqr( - queue, - n, - out_q_columns, - out_q_columns, - q_buf, - n, - tau_buf, - sbuffer, - mn2); - std::cout << "done" << std::endl; - - sycl::free(sbuffer, queue); - std::cout << "done2" << std::endl; - } - - r_buf += mn; - q_buf += n * out_q_columns; - - } // batch - - if (mode == "r") { - result_q = at::empty({0, 0}); - } - - if ((mode == "reduced" || mode == "r") && n > m) { - result_r = - result_r - .index( - {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) - .contiguous(); - } - - // result_q.transpose(0,1); - return std::make_tuple( - result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); - //} -} - // Defines the operators - // TORCH_LIBRARY(qr_issue, m) { - // m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); - // } - - // ================================================== - // Register SYCL Implementations to Torch Library - // ================================================== - // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { - // m.impl("local_qr", &_qr_xpu); - // } - - // namespace qr_issue -}} - - -//cd torch-xpu-ops -//git checkout -B dev_wiktor -//git add . -//git commit -m "jakas nazwa" -//git push - diff --git a/src/ATen/native/xpu/Repro.cpp~ b/src/ATen/native/xpu/Repro.cpp~ deleted file mode 100644 index a714102800..0000000000 --- a/src/ATen/native/xpu/Repro.cpp~ +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include -#include -#include -//#include -#include -#include - -// namespace qr_issue { - -std::tuple local_qr( - const at::Tensor& a, - std::optional mode_) { - // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << - // std::endl; - - std::string mode = std::string(mode_.value_or("reduced")); - - TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); - at::Tensor a_contig = a.contiguous(); - at::Tensor result_r = at::clone(a_contig); - // at::Tensor result_r = at::empty_like(a_contig); - at::Tensor result_c = at::empty_like(a_contig); - at::Tensor result = at::empty_like(a_contig); - - auto dimensions = a.sizes(); - - std::cout << "dim " << dimensions << std::endl; - - std::cout << result_r << std::endl; - result_r = result_r.transpose(-2, -1).contiguous(); - std::cout << result_r << std::endl; - int numel = a_contig.numel(); - int range = a_contig.dim(); - int64_t n = a_contig.sizes().at(range - 2); - int64_t m = a_contig.sizes().at(range - 1); - int64_t mn = int64_t(m * n); - int64_t b = numel / mn; - - int out_q_columns = m > n ? n : m; - if (n > m && mode == "complete") { - out_q_columns = n; - } - - std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns - << std::endl; - // at::Tensor result_q = result_r.clone(); - // dimensions[1]=out_q_columns; - std::vector v(dimensions.begin(), dimensions.end()); - v[range - 1] = v[range - 2]; - v[range - 2] = out_q_columns; - auto ndimensions = at::IntArrayRef(v); - at::Tensor result_q = at::empty(ndimensions); - - sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); - int64_t mn1 = - oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); - int64_t mn2 = - oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); - mn2 = mn1 > mn2 ? mn1 : mn2; - float* sbuffer = sycl::malloc_device(mn2, queue); - float* tau_buf = sycl::malloc_device(out_q_columns, queue); - float* r_buf = result_r.data_ptr(); - float* q_buf = result_q.data_ptr(); - - // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == - // "complete") << std::endl; - - for (int batch_item = 0; batch_item < b; batch_item++) { - oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); - - if (mode != "r") { - // copy relevant part of R matrix to Q matrix - int copy_columns = out_q_columns > m ? m : out_q_columns; - queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); - - oneapi::mkl::lapack::orgqr( - queue, - n, - out_q_columns, - out_q_columns, - q_buf, - n, - tau_buf, - sbuffer, - mn2); - std::cout << "done" << std::endl; - - sycl::free(sbuffer, queue); - std::cout << "done2" << std::endl; - } - - r_buf += mn; - q_buf += n * out_q_columns; - - } // batch - - if (mode == "r") { - result_q = at::empty({0, 0}); - } - - if ((mode == "reduced" || mode == "r") && n > m) { - result_r = - result_r - .index( - {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) - .contiguous(); - } - - // result_q.transpose(0,1); - return std::make_tuple( - result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); - //} - - // Defines the operators - // TORCH_LIBRARY(qr_issue, m) { - // m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); - // } - - // ================================================== - // Register SYCL Implementations to Torch Library - // ================================================== - // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { - // m.impl("local_qr", &_qr_xpu); - // } - -} // namespace qr_issue diff --git a/src/ATen/native/xpu/repro.sycl b/src/ATen/native/xpu/repro.sycl deleted file mode 100644 index 63f3247740..0000000000 --- a/src/ATen/native/xpu/repro.sycl +++ /dev/null @@ -1,177 +0,0 @@ -#include -#include -#include -#include -#include -//#include -#include -#include - -namespace qr_issue { - - std::tuple _qr_xpu(const at::Tensor& a,std::optional mode_) { - - - //std::cout << "Call mode is " << " " << mode_.value_or("compressed") << std::endl; - - std::string mode = std::string(mode_.value_or("reduced")); - - TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); - at::Tensor a_contig = a.contiguous(); - at::Tensor result_r = at::clone(a_contig); - // at::Tensor result_r = at::empty_like(a_contig); - at::Tensor result_c = at::empty_like(a_contig); - at::Tensor result = at::empty_like(a_contig); - - auto dimensions = a.sizes(); - - std::cout << "dim " << dimensions << std::endl; - - - - std::cout << result_r << std::endl; - result_r=result_r.transpose(-2,-1).contiguous(); - std::cout << result_r << std::endl; - int numel = a_contig.numel(); - int range = a_contig.dim(); - int64_t n = a_contig.sizes().at(range-2); - int64_t m = a_contig.sizes().at(range-1); - int64_t mn = int64_t(m*n); - int64_t b = numel/mn; - - - - - int out_q_columns = m > n ? n : m; - if (n>m && mode=="complete") { - out_q_columns=n; - } - - std::cout << "dim2 " << n << " " << m << " " << b << " " << - out_q_columns << std::endl; - //at::Tensor result_q = result_r.clone(); - //dimensions[1]=out_q_columns; - std::vector v(dimensions.begin(),dimensions.end()); - v[range-1]=v[range-2]; - v[range-2]=out_q_columns; - auto ndimensions = at::IntArrayRef(v); - at::Tensor result_q=at::empty(ndimensions); - - sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); - int64_t mn1 = oneapi::mkl::lapack::geqrf_scratchpad_size(queue,n,m,n); - int64_t mn2 = oneapi::mkl::lapack::orgqr_scratchpad_size(queue,n,m,m,n); - mn2 = mn1>mn2 ? mn1 : mn2; - float* sbuffer = sycl::malloc_device(mn2,queue); - float* tau_buf = sycl::malloc_device(out_q_columns,queue); - float* r_buf = result_r.data_ptr(); - float* q_buf = result_q.data_ptr(); - - //std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == "complete") << std::endl; - - - for (int batch_item=0; batch_itemm ? m : out_q_columns; - queue.memcpy(q_buf,r_buf,n*copy_columns*sizeof(float)).wait(); - - - oneapi::mkl::lapack::orgqr(queue,n,out_q_columns,out_q_columns,q_buf,n,tau_buf,sbuffer,mn2); - std::cout << "done" << std::endl; - - sycl::free(sbuffer,queue); - std::cout << "done2" << std::endl; - - } - - r_buf+=mn; - q_buf+=n*out_q_columns; - - } // batch - - if (mode =="r") { - result_q=at::empty({0,0}); - } - - if ((mode=="reduced" || mode =="r" ) && n>m) { - result_r = result_r.index({"...",at::indexing::Slice(0,n),at::indexing::Slice(0,m)}).contiguous(); - } - - //result_q.transpose(0,1); - return std::make_tuple(result_q.transpose(-2,-1),result_r.transpose(-2,-1).triu_()); - - } - - - -// Defines the operators -TORCH_LIBRARY(qr_issue, m) { - m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); -} - -// ================================================== -// Register SYCL Implementations to Torch Library -// ================================================== -TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { - m.impl("local_qr",&_qr_xpu); - } - -} // namespace sycl_extension - - - -//#include - -// original code, which compiles, but failed to link. - -//#include -//#include -//#include -//#include - -//#include - -/* #include */ -/* #include */ - -/* using namespace sycl; */ - -/* int main() */ -/* { */ -/* queue q; */ - - - - -/* int numel = 25; */ -/* int64_t n = 5; */ -/* int64_t m = 5; */ -/* int64_t mn=m*n; */ - -/* std::cout << " dziala " << n << " " << m << " " << mn << std::endl; */ -/* // at::Tensor tau = at::empty({1,n}); */ - -/* int64_t mn2 = oneapi::mkl::lapack::geqrf_scratchpad_size(q,n,m,n); */ -/* int64_t mn3 = oneapi::mkl::lapack::ormqr_scratchpad_size(q,oneapi::mkl::side::left,oneapi::mkl::transpose::trans,n,m,n,n,n); */ - -/* sycl::buffer sbuffer(mn2); */ -/* sycl::buffer res_q_ptr(numel); */ -/* sycl::buffer tau_ptr(numel); */ -/* sycl::buffer c_ptr(numel); */ - - -/* std::cout << " dziala " << n << " " << m << " " << mn << " " << mn2 << " " << mn3 << std::endl; */ -/* oneapi::mkl::lapack::geqrf(q,n,m,res_q_ptr,n,tau_ptr,sbuffer,mn2); */ -/* sycl::buffer sbuffer2(mn3); */ -/* oneapi::mkl::lapack::ormqr(q,oneapi::mkl::side::right,oneapi::mkl::transpose::nontrans,n,m,n,res_q_ptr,n,tau_ptr,c_ptr,n,sbuffer2,mn3); */ -/* std::cout << "done" << std::endl; */ - -/* return 0; */ -/* } */ From ba893b09a0ee42d655e13b48faeda36c504136ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Fri, 31 Oct 2025 13:17:33 +0000 Subject: [PATCH 03/15] dev3 --- src/ATen/native/xpu/QR.cpp | 19 ++++ src/ATen/native/xpu/Repro.cpp | 133 -------------------------- src/ATen/native/xpu/sycl/QRKernel.cpp | 132 +++++++++++++++++++++++++ src/ATen/native/xpu/sycl/QRKernel.h | 13 +++ yaml/native/native_functions.yaml | 11 ++- 5 files changed, 172 insertions(+), 136 deletions(-) create mode 100644 src/ATen/native/xpu/QR.cpp delete mode 100644 src/ATen/native/xpu/Repro.cpp create mode 100644 src/ATen/native/xpu/sycl/QRKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/QRKernel.h diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp new file mode 100644 index 0000000000..9a2adeebc3 --- /dev/null +++ b/src/ATen/native/xpu/QR.cpp @@ -0,0 +1,19 @@ +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +TORCH_IMPL_FUNC(linalg_qr_xpu_out) +(const Tensor& A, + c10::string_view mode, + const Tensor& Q, + const Tensor& R) { + xpu::linalg_qr_kernel(A, mode, Q, R); +} + +} // namespace at::native diff --git a/src/ATen/native/xpu/Repro.cpp b/src/ATen/native/xpu/Repro.cpp deleted file mode 100644 index 5653b76295..0000000000 --- a/src/ATen/native/xpu/Repro.cpp +++ /dev/null @@ -1,133 +0,0 @@ -#include -#include -#include -#include -#include -//#include -#include -#include - -// namespace qr_issue { - -namespace at { -namespace native { - -std::tuple local_qr( - const at::Tensor& a, - std::optional mode_) { - // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << - // std::endl; - - std::string mode = std::string(mode_.value_or("reduced")); - - TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); - at::Tensor a_contig = a.contiguous(); - at::Tensor result_r = at::clone(a_contig); - // at::Tensor result_r = at::empty_like(a_contig); - at::Tensor result_c = at::empty_like(a_contig); - at::Tensor result = at::empty_like(a_contig); - - auto dimensions = a.sizes(); - - std::cout << "dim " << dimensions << std::endl; - - std::cout << result_r << std::endl; - result_r = result_r.transpose(-2, -1).contiguous(); - std::cout << result_r << std::endl; - int numel = a_contig.numel(); - int range = a_contig.dim(); - int64_t n = a_contig.sizes().at(range - 2); - int64_t m = a_contig.sizes().at(range - 1); - int64_t mn = int64_t(m * n); - int64_t b = numel / mn; - - int out_q_columns = m > n ? n : m; - if (n > m && mode == "complete") { - out_q_columns = n; - } - - std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns - << std::endl; - // at::Tensor result_q = result_r.clone(); - // dimensions[1]=out_q_columns; - std::vector v(dimensions.begin(), dimensions.end()); - v[range - 1] = v[range - 2]; - v[range - 2] = out_q_columns; - auto ndimensions = at::IntArrayRef(v); - at::Tensor result_q = at::empty(ndimensions); - - sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); - int64_t mn1 = - oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); - int64_t mn2 = - oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); - mn2 = mn1 > mn2 ? mn1 : mn2; - float* sbuffer = sycl::malloc_device(mn2, queue); - float* tau_buf = sycl::malloc_device(out_q_columns, queue); - float* r_buf = result_r.data_ptr(); - float* q_buf = result_q.data_ptr(); - - // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == - // "complete") << std::endl; - - for (int batch_item = 0; batch_item < b; batch_item++) { - oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); - - if (mode != "r") { - // copy relevant part of R matrix to Q matrix - int copy_columns = out_q_columns > m ? m : out_q_columns; - queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); - - oneapi::mkl::lapack::orgqr( - queue, - n, - out_q_columns, - out_q_columns, - q_buf, - n, - tau_buf, - sbuffer, - mn2); - std::cout << "done" << std::endl; - - sycl::free(sbuffer, queue); - std::cout << "done2" << std::endl; - } - - r_buf += mn; - q_buf += n * out_q_columns; - - } // batch - - if (mode == "r") { - result_q = at::empty({0, 0}); - } - - if ((mode == "reduced" || mode == "r") && n > m) { - result_r = - result_r - .index( - {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) - .contiguous(); - } - - // result_q.transpose(0,1); - return std::make_tuple( - result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); - //} -} -// Defines the operators -// TORCH_LIBRARY(qr_issue, m) { -// m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)"); -// } - -// ================================================== -// Register SYCL Implementations to Torch Library -// ================================================== -// TORCH_LIBRARY_IMPL(qr_issue, XPU, m) { -// m.impl("local_qr", &_qr_xpu); -// } - -// namespace qr_issue -} // namespace native -} // namespace at diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp new file mode 100644 index 0000000000..c578b35b3d --- /dev/null +++ b/src/ATen/native/xpu/sycl/QRKernel.cpp @@ -0,0 +1,132 @@ +#include +#include +// #include +#include +#include +//#include +#include +#include + +namespace at { +namespace native { +namespace xpu { + +void linalg_qr_kernel( + const at::Tensor& A, + c10::string_view mode, + const at::Tensor& Q, + const at::Tensor& R) { + std::cout << "Hello from kernel"; + } + +} // namespace xpu +} // namespace native +} // namespace at + +// std::tuple linalg_qr_kernel_draft( +// const at::Tensor& a, +// std::optional mode_) { +// return std::make_tuple(a, a); +// } + + // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << + // std::endl; + +// std::string mode = std::string(mode_.value_or("reduced")); + +// TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); +// at::Tensor a_contig = a.contiguous(); +// at::Tensor result_r = at::clone(a_contig); +// // at::Tensor result_r = at::empty_like(a_contig); +// at::Tensor result_c = at::empty_like(a_contig); +// at::Tensor result = at::empty_like(a_contig); + +// auto dimensions = a.sizes(); + +// std::cout << "dim " << dimensions << std::endl; + +// std::cout << result_r << std::endl; +// result_r = result_r.transpose(-2, -1).contiguous(); +// std::cout << result_r << std::endl; +// int numel = a_contig.numel(); +// int range = a_contig.dim(); +// int64_t n = a_contig.sizes().at(range - 2); +// int64_t m = a_contig.sizes().at(range - 1); +// int64_t mn = int64_t(m * n); +// int64_t b = numel / mn; + +// int out_q_columns = m > n ? n : m; +// if (n > m && mode == "complete") { +// out_q_columns = n; +// } + +// std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns +// << std::endl; +// // at::Tensor result_q = result_r.clone(); +// // dimensions[1]=out_q_columns; +// std::vector v(dimensions.begin(), dimensions.end()); +// v[range - 1] = v[range - 2]; +// v[range - 2] = out_q_columns; +// auto ndimensions = at::IntArrayRef(v); +// at::Tensor result_q = at::empty(ndimensions); + +// sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); +// int64_t mn1 = +// oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); +// int64_t mn2 = +// oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); +// mn2 = mn1 > mn2 ? mn1 : mn2; +// float* sbuffer = sycl::malloc_device(mn2, queue); +// float* tau_buf = sycl::malloc_device(out_q_columns, queue); +// float* r_buf = result_r.data_ptr(); +// float* q_buf = result_q.data_ptr(); + +// // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == +// // "complete") << std::endl; + +// for (int batch_item = 0; batch_item < b; batch_item++) { +// oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); + +// if (mode != "r") { +// // copy relevant part of R matrix to Q matrix +// int copy_columns = out_q_columns > m ? m : out_q_columns; +// queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + +// oneapi::mkl::lapack::orgqr( +// queue, +// n, +// out_q_columns, +// out_q_columns, +// q_buf, +// n, +// tau_buf, +// sbuffer, +// mn2); +// std::cout << "done" << std::endl; + +// sycl::free(sbuffer, queue); +// std::cout << "done2" << std::endl; +// } + +// r_buf += mn; +// q_buf += n * out_q_columns; + +// } // batch + +// if (mode == "r") { +// result_q = at::empty({0, 0}); +// } + +// if ((mode == "reduced" || mode == "r") && n > m) { +// result_r = +// result_r +// .index( +// {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) +// .contiguous(); +// } + +// // result_q.transpose(0,1); +// return std::make_tuple( +// result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); +// //} +// } diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h new file mode 100644 index 0000000000..9e6a81435a --- /dev/null +++ b/src/ATen/native/xpu/sycl/QRKernel.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +namespace at::native::xpu { + +TORCH_XPU_API void linalg_qr_kernel( + const Tensor& A, + c10::string_view mode, + const Tensor& Q, + const Tensor& R); + +} // namespace at::native::xpu diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index f11752f456..7221ebdbdc 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -9443,11 +9443,16 @@ - func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor python_module: linalg -- func: local_qr(Tensor a, str? mode_=None) -> (Tensor Q, Tensor R) +- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) python_module: linalg - structured: False + variants: function + structured_delegate: linalg_qr.out + +- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) + python_module: linalg + structured: True dispatch: - XPU: local_qr_xpu + XPU: linalg_qr_xpu_out - func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info) python_module: linalg From e0f5f6be868cf5092fdaf2e827635c115a34d456 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Fri, 31 Oct 2025 16:58:31 +0000 Subject: [PATCH 04/15] First callable poc --- src/ATen/native/xpu/QR.cpp | 2 +- src/ATen/native/xpu/sycl/QRKernel.cpp | 216 ++++++++++++-------------- src/ATen/native/xpu/sycl/QRKernel.h | 2 +- 3 files changed, 103 insertions(+), 117 deletions(-) diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp index 9a2adeebc3..53b42c2be9 100644 --- a/src/ATen/native/xpu/QR.cpp +++ b/src/ATen/native/xpu/QR.cpp @@ -10,7 +10,7 @@ namespace at::native { TORCH_IMPL_FUNC(linalg_qr_xpu_out) (const Tensor& A, - c10::string_view mode, + std::string_view mode, const Tensor& Q, const Tensor& R) { xpu::linalg_qr_kernel(A, mode, Q, R); diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp index c578b35b3d..ce26fe862f 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.cpp +++ b/src/ATen/native/xpu/sycl/QRKernel.cpp @@ -7,126 +7,112 @@ #include #include -namespace at { -namespace native { -namespace xpu { +namespace at::native::xpu { void linalg_qr_kernel( const at::Tensor& A, - c10::string_view mode, + std::string_view mode_, const at::Tensor& Q, const at::Tensor& R) { - std::cout << "Hello from kernel"; + std::cout << "Hello from kernel"; + std::cout << "Call mode is " << " " << mode_ << std::endl; + std::string mode = std::string(mode_); + + TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor"); + at::Tensor a_contig = A.contiguous(); + at::Tensor result_r = at::clone(a_contig); + // at::Tensor result_r = at::empty_like(a_contig); + at::Tensor result_c = at::empty_like(a_contig); + at::Tensor result = at::empty_like(a_contig); + + auto dimensions = A.sizes(); + + std::cout << "dim " << dimensions << std::endl; + + std::cout << result_r << std::endl; + result_r = result_r.transpose(-2, -1).contiguous(); + std::cout << result_r << std::endl; + int numel = a_contig.numel(); + int range = a_contig.dim(); + int64_t n = a_contig.sizes().at(range - 2); + int64_t m = a_contig.sizes().at(range - 1); + int64_t mn = int64_t(m * n); + int64_t b = numel / mn; + + int out_q_columns = m > n ? n : m; + if (n > m && mode == "complete") { + out_q_columns = n; } -} // namespace xpu -} // namespace native -} // namespace at - -// std::tuple linalg_qr_kernel_draft( -// const at::Tensor& a, -// std::optional mode_) { -// return std::make_tuple(a, a); -// } - - // std::cout << "Call mode is " << " " << mode_.value_or("compressed") << - // std::endl; - -// std::string mode = std::string(mode_.value_or("reduced")); - -// TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); -// at::Tensor a_contig = a.contiguous(); -// at::Tensor result_r = at::clone(a_contig); -// // at::Tensor result_r = at::empty_like(a_contig); -// at::Tensor result_c = at::empty_like(a_contig); -// at::Tensor result = at::empty_like(a_contig); - -// auto dimensions = a.sizes(); - -// std::cout << "dim " << dimensions << std::endl; - -// std::cout << result_r << std::endl; -// result_r = result_r.transpose(-2, -1).contiguous(); -// std::cout << result_r << std::endl; -// int numel = a_contig.numel(); -// int range = a_contig.dim(); -// int64_t n = a_contig.sizes().at(range - 2); -// int64_t m = a_contig.sizes().at(range - 1); -// int64_t mn = int64_t(m * n); -// int64_t b = numel / mn; - -// int out_q_columns = m > n ? n : m; -// if (n > m && mode == "complete") { -// out_q_columns = n; -// } - -// std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns -// << std::endl; -// // at::Tensor result_q = result_r.clone(); -// // dimensions[1]=out_q_columns; -// std::vector v(dimensions.begin(), dimensions.end()); -// v[range - 1] = v[range - 2]; -// v[range - 2] = out_q_columns; -// auto ndimensions = at::IntArrayRef(v); -// at::Tensor result_q = at::empty(ndimensions); - -// sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); -// int64_t mn1 = -// oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); -// int64_t mn2 = -// oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); -// mn2 = mn1 > mn2 ? mn1 : mn2; -// float* sbuffer = sycl::malloc_device(mn2, queue); -// float* tau_buf = sycl::malloc_device(out_q_columns, queue); -// float* r_buf = result_r.data_ptr(); -// float* q_buf = result_q.data_ptr(); - -// // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == -// // "complete") << std::endl; - -// for (int batch_item = 0; batch_item < b; batch_item++) { -// oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); - -// if (mode != "r") { -// // copy relevant part of R matrix to Q matrix -// int copy_columns = out_q_columns > m ? m : out_q_columns; -// queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); - -// oneapi::mkl::lapack::orgqr( -// queue, -// n, -// out_q_columns, -// out_q_columns, -// q_buf, -// n, -// tau_buf, -// sbuffer, -// mn2); -// std::cout << "done" << std::endl; - -// sycl::free(sbuffer, queue); -// std::cout << "done2" << std::endl; -// } - -// r_buf += mn; -// q_buf += n * out_q_columns; - -// } // batch - -// if (mode == "r") { -// result_q = at::empty({0, 0}); -// } - -// if ((mode == "reduced" || mode == "r") && n > m) { -// result_r = -// result_r -// .index( -// {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) -// .contiguous(); -// } - -// // result_q.transpose(0,1); -// return std::make_tuple( -// result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); -// //} + std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns + << std::endl; + // at::Tensor result_q = result_r.clone(); + // dimensions[1]=out_q_columns; + std::vector v(dimensions.begin(), dimensions.end()); + v[range - 1] = v[range - 2]; + v[range - 2] = out_q_columns; + auto ndimensions = at::IntArrayRef(v); + at::Tensor result_q = at::empty(ndimensions); + + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); + int64_t mn1 = + oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); + int64_t mn2 = + oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); + mn2 = mn1 > mn2 ? mn1 : mn2; + float* sbuffer = sycl::malloc_device(mn2, queue); + float* tau_buf = sycl::malloc_device(out_q_columns, queue); + float* r_buf = result_r.data_ptr(); + float* q_buf = result_q.data_ptr(); + + std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == + "complete") << std::endl; + + for (int batch_item = 0; batch_item < b; batch_item++) { + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); + + if (mode != "r") { + // copy relevant part of R matrix to Q matrix + int copy_columns = out_q_columns > m ? m : out_q_columns; + queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + + oneapi::mkl::lapack::orgqr( + queue, + n, + out_q_columns, + out_q_columns, + q_buf, + n, + tau_buf, + sbuffer, + mn2); + std::cout << "done" << std::endl; + + sycl::free(sbuffer, queue); + std::cout << "done2" << std::endl; + } + + r_buf += mn; + q_buf += n * out_q_columns; + + } // batch + + if (mode == "r") { + result_q = at::empty({0, 0}); + } + + if ((mode == "reduced" || mode == "r") && n > m) { + result_r = + result_r + .index( + {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) + .contiguous(); + } + + // result_q.transpose(0,1); + // return std::make_tuple( + // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); +} + +} // namespace at::native::xpu // } diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h index 9e6a81435a..6927439a72 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.h +++ b/src/ATen/native/xpu/sycl/QRKernel.h @@ -6,7 +6,7 @@ namespace at::native::xpu { TORCH_XPU_API void linalg_qr_kernel( const Tensor& A, - c10::string_view mode, + std::string_view mode, const Tensor& Q, const Tensor& R); From 025291ab8ed2454490020f2d810b45f3dc38d98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 3 Nov 2025 07:50:45 +0000 Subject: [PATCH 05/15] simple test --- test/xpu/test_linalg_qr.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 test/xpu/test_linalg_qr.py diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py new file mode 100644 index 0000000000..6422d08f54 --- /dev/null +++ b/test/xpu/test_linalg_qr.py @@ -0,0 +1,24 @@ +import torch +import pytest + + +@pytest.mark.parametrize("mode", ['reduced', 'complete', 'r']) +def test_linalg_qr(mode): + A = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) + A_xpu = A.to('xpu') + + Q, R = torch.linalg.qr(A) + Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode) + + + print("==== CPU ====") + print("A",A) + print("Q",Q) + print("R",R) + print("\n==== XPU ====") + print("A_xpu",A_xpu) + print("Q_xpu",Q_xpu) + print("R_xpu",R_xpu) + + assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5) + assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5) From 6e025e194a59d0ab1667d398851f536aed0c123d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 3 Nov 2025 09:33:11 +0000 Subject: [PATCH 06/15] cleanup --- src/ATen/native/xpu/QR.cpp | 11 +++-------- src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/QRKernel.cpp | 9 ++------- src/ATen/native/xpu/sycl/QRKernel.h | 6 +++--- test/xpu/test_linalg_qr.py | 2 +- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp index 53b42c2be9..37d8896b20 100644 --- a/src/ATen/native/xpu/QR.cpp +++ b/src/ATen/native/xpu/QR.cpp @@ -1,18 +1,13 @@ -#include -#include #include -#include -#include - #include namespace at::native { TORCH_IMPL_FUNC(linalg_qr_xpu_out) -(const Tensor& A, +(const at::Tensor& A, std::string_view mode, - const Tensor& Q, - const Tensor& R) { + const at::Tensor& Q, + const at::Tensor& R) { xpu::linalg_qr_kernel(A, mode, Q, R); } diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 053c806292..41e511c5e9 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -206,7 +206,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "linalg_lstsq.out", "linalg_lu.out", "linalg_matrix_exp", - "linalg_qr.out", "linalg_solve_triangular", "linalg_solve_triangular.out", "_linalg_svd.U", diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp index ce26fe862f..05ace1ceb9 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.cpp +++ b/src/ATen/native/xpu/sycl/QRKernel.cpp @@ -1,22 +1,17 @@ #include #include -// #include #include #include -//#include #include -#include namespace at::native::xpu { void linalg_qr_kernel( const at::Tensor& A, - std::string_view mode_, + std::string_view mode, const at::Tensor& Q, const at::Tensor& R) { - std::cout << "Hello from kernel"; - std::cout << "Call mode is " << " " << mode_ << std::endl; - std::string mode = std::string(mode_); + std::cout << "Call mode is " << " " << mode << std::endl; TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor"); at::Tensor a_contig = A.contiguous(); diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h index 6927439a72..93696feed7 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.h +++ b/src/ATen/native/xpu/sycl/QRKernel.h @@ -5,9 +5,9 @@ namespace at::native::xpu { TORCH_XPU_API void linalg_qr_kernel( - const Tensor& A, + const at::Tensor& A, std::string_view mode, - const Tensor& Q, - const Tensor& R); + const at::Tensor& Q, + const at::Tensor& R); } // namespace at::native::xpu diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py index 6422d08f54..6a23db8172 100644 --- a/test/xpu/test_linalg_qr.py +++ b/test/xpu/test_linalg_qr.py @@ -7,7 +7,7 @@ def test_linalg_qr(mode): A = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) A_xpu = A.to('xpu') - Q, R = torch.linalg.qr(A) + Q, R = torch.linalg.qr(A, mode=mode) Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode) From 5e593d8cdafd6e1c3165ba9703a3fb9669bfc405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 3 Nov 2025 10:07:26 +0000 Subject: [PATCH 07/15] format --- src/ATen/native/xpu/sycl/QRKernel.cpp | 94 +++++++++++++-------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp index 05ace1ceb9..cd94bc5f5f 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.cpp +++ b/src/ATen/native/xpu/sycl/QRKernel.cpp @@ -1,8 +1,8 @@ #include #include +#include #include #include -#include namespace at::native::xpu { @@ -60,54 +60,54 @@ void linalg_qr_kernel( float* r_buf = result_r.data_ptr(); float* q_buf = result_q.data_ptr(); - std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == - "complete") << std::endl; + std::cout << "entering " << n << " " << m << " " << mode << " " + << (mode == "complete") << std::endl; for (int batch_item = 0; batch_item < b; batch_item++) { - oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); - - if (mode != "r") { - // copy relevant part of R matrix to Q matrix - int copy_columns = out_q_columns > m ? m : out_q_columns; - queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); - - oneapi::mkl::lapack::orgqr( - queue, - n, - out_q_columns, - out_q_columns, - q_buf, - n, - tau_buf, - sbuffer, - mn2); - std::cout << "done" << std::endl; - - sycl::free(sbuffer, queue); - std::cout << "done2" << std::endl; - } - - r_buf += mn; - q_buf += n * out_q_columns; - - } // batch - - if (mode == "r") { - result_q = at::empty({0, 0}); - } - - if ((mode == "reduced" || mode == "r") && n > m) { - result_r = - result_r - .index( - {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) - .contiguous(); - } - - // result_q.transpose(0,1); - // return std::make_tuple( - // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); + + if (mode != "r") { + // copy relevant part of R matrix to Q matrix + int copy_columns = out_q_columns > m ? m : out_q_columns; + queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + + oneapi::mkl::lapack::orgqr( + queue, + n, + out_q_columns, + out_q_columns, + q_buf, + n, + tau_buf, + sbuffer, + mn2); + std::cout << "done" << std::endl; + + sycl::free(sbuffer, queue); + std::cout << "done2" << std::endl; + } + + r_buf += mn; + q_buf += n * out_q_columns; + + } // batch + + if (mode == "r") { + result_q = at::empty({0, 0}); + } + + if ((mode == "reduced" || mode == "r") && n > m) { + result_r = + result_r + .index( + {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) + .contiguous(); + } + + // result_q.transpose(0,1); + // return std::make_tuple( + // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); } - + } // namespace at::native::xpu // } From 2038538f9b65f745bfb6b3e19125e0c1d5896f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 3 Nov 2025 10:55:56 +0000 Subject: [PATCH 08/15] remove vars --- src/ATen/native/xpu/sycl/QRKernel.cpp | 10 ++------- test/xpu/test_linalg_qr.py | 30 ++++++++++++++------------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp index cd94bc5f5f..7dde1634bf 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.cpp +++ b/src/ATen/native/xpu/sycl/QRKernel.cpp @@ -12,13 +12,8 @@ void linalg_qr_kernel( const at::Tensor& Q, const at::Tensor& R) { std::cout << "Call mode is " << " " << mode << std::endl; - - TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor"); at::Tensor a_contig = A.contiguous(); at::Tensor result_r = at::clone(a_contig); - // at::Tensor result_r = at::empty_like(a_contig); - at::Tensor result_c = at::empty_like(a_contig); - at::Tensor result = at::empty_like(a_contig); auto dimensions = A.sizes(); @@ -105,9 +100,8 @@ void linalg_qr_kernel( } // result_q.transpose(0,1); - // return std::make_tuple( - // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_()); + Q.set_(result_q.transpose(-2, -1).to("xpu")); + R.set_(result_r.transpose(-2, -1).triu_().to("xpu")); } } // namespace at::native::xpu -// } diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py index 6a23db8172..83e94e9853 100644 --- a/test/xpu/test_linalg_qr.py +++ b/test/xpu/test_linalg_qr.py @@ -1,24 +1,26 @@ import torch import pytest - +@pytest.mark.parametrize("dtype", [torch.float32]) @pytest.mark.parametrize("mode", ['reduced', 'complete', 'r']) -def test_linalg_qr(mode): - A = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) +@pytest.mark.parametrize("shape", [(5, 3), (2, 3, 5), (2, 3, 4, 4), (2, 1, 1)]) +# @pytest.mark.parametrize("shape", [ +# # 2D matrices +# (5, 3), (3, 5), (8, 6), (6, 8), +# # 3D batched matrices +# (2, 3, 3), (3, 4, 4), (2, 5, 3), (2, 3, 5), +# # 4D batched matrices +# (2, 3, 4, 4), (1, 2, 5, 3), (2, 1, 3, 5), +# # Edge cases +# (1, 1), (10, 1), (1, 10), (2, 1, 1), (1, 1, 1) +# ]) +def test_linalg_qr(dtype, mode, shape): + A = torch.randn(shape, dtype=dtype) A_xpu = A.to('xpu') - + Q, R = torch.linalg.qr(A, mode=mode) Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode) - - print("==== CPU ====") - print("A",A) - print("Q",Q) - print("R",R) - print("\n==== XPU ====") - print("A_xpu",A_xpu) - print("Q_xpu",Q_xpu) - print("R_xpu",R_xpu) - assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5) assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5) + assert Q_xpu.device.type == 'xpu' and R_xpu.device.type == 'xpu' From 3c26a9e1e6cc24be4f85cef0f0c8b9388d539342 Mon Sep 17 00:00:00 2001 From: Michal Wiktor Date: Mon, 24 Nov 2025 12:06:55 +0000 Subject: [PATCH 09/15] New implementation of QR kernel based on mkl::geqrf and orgqr --- src/ATen/native/xpu/sycl/QRKernel.cpp | 76 +++++++++++++++------------ 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp index 7dde1634bf..461abc699f 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.cpp +++ b/src/ATen/native/xpu/sycl/QRKernel.cpp @@ -11,17 +11,17 @@ void linalg_qr_kernel( std::string_view mode, const at::Tensor& Q, const at::Tensor& R) { - std::cout << "Call mode is " << " " << mode << std::endl; + TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor"); + TORCH_CHECK(A.dtype() == at::kFloat, "a must be float"); + at::Tensor a_contig = A.contiguous(); at::Tensor result_r = at::clone(a_contig); + auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU); auto dimensions = A.sizes(); - std::cout << "dim " << dimensions << std::endl; - - std::cout << result_r << std::endl; result_r = result_r.transpose(-2, -1).contiguous(); - std::cout << result_r << std::endl; + int numel = a_contig.numel(); int range = a_contig.dim(); int64_t n = a_contig.sizes().at(range - 2); @@ -34,32 +34,41 @@ void linalg_qr_kernel( out_q_columns = n; } - std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns - << std::endl; - // at::Tensor result_q = result_r.clone(); - // dimensions[1]=out_q_columns; std::vector v(dimensions.begin(), dimensions.end()); - v[range - 1] = v[range - 2]; - v[range - 2] = out_q_columns; - auto ndimensions = at::IntArrayRef(v); - at::Tensor result_q = at::empty(ndimensions); + if (mode != "r") { + v[range - 1] = v[range - 2]; + v[range - 2] = out_q_columns; + } else { + v = std::vector({0, 0}); + } + auto q_dimensions = at::IntArrayRef(v); + std::cout << "Q SIZE " << q_dimensions << " " << dimensions << " " + << std::endl; + at::Tensor result_q = at::empty(q_dimensions, options); + + std::cout << "Q SIZE " << q_dimensions << " " << dimensions << std::endl; sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); - int64_t mn1 = + + int64_t bufsize1 = oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); - int64_t mn2 = + int64_t bufsize2 = oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); - mn2 = mn1 > mn2 ? mn1 : mn2; - float* sbuffer = sycl::malloc_device(mn2, queue); - float* tau_buf = sycl::malloc_device(out_q_columns, queue); + + int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1; + int64_t tau_len = m > n ? n : m; + float* sbuffer = sycl::malloc_device(bufsize, queue); + float* tau_buf = sycl::malloc_device(tau_len, queue); float* r_buf = result_r.data_ptr(); - float* q_buf = result_q.data_ptr(); - std::cout << "entering " << n << " " << m << " " << mode << " " - << (mode == "complete") << std::endl; + float* q_buf = NULL; + if (mode != "r") { + q_buf = result_q.data_ptr(); + } for (int batch_item = 0; batch_item < b; batch_item++) { - oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2); + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize) + .wait(); if (mode != "r") { // copy relevant part of R matrix to Q matrix @@ -70,26 +79,24 @@ void linalg_qr_kernel( queue, n, out_q_columns, - out_q_columns, + // out_q_columns, + tau_len, q_buf, n, tau_buf, sbuffer, - mn2); - std::cout << "done" << std::endl; + bufsize) + .wait(); - sycl::free(sbuffer, queue); - std::cout << "done2" << std::endl; + q_buf += n * out_q_columns; } r_buf += mn; - q_buf += n * out_q_columns; } // batch - if (mode == "r") { - result_q = at::empty({0, 0}); - } + sycl::free(sbuffer, queue); + sycl::free(tau_buf, queue); if ((mode == "reduced" || mode == "r") && n > m) { result_r = @@ -100,8 +107,11 @@ void linalg_qr_kernel( } // result_q.transpose(0,1); - Q.set_(result_q.transpose(-2, -1).to("xpu")); - R.set_(result_r.transpose(-2, -1).triu_().to("xpu")); + // Q.set_(result_q.transpose(-2, -1).to("xpu")); + // R.set_(result_r.transpose(-2, -1).triu_().to("xpu")); + Q.set_(result_q.transpose(-2, -1)); + R.set_(result_r.transpose(-2, -1).triu_()); + queue.wait(); } } // namespace at::native::xpu From 1531fa1e7df4ab54d26775c27a9ec87f49479cd1 Mon Sep 17 00:00:00 2001 From: Michal Wiktor Date: Mon, 24 Nov 2025 12:12:02 +0000 Subject: [PATCH 10/15] cleaned version of QR --- src/ATen/native/xpu/sycl/QRKernel.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp index 461abc699f..761744a8a8 100644 --- a/src/ATen/native/xpu/sycl/QRKernel.cpp +++ b/src/ATen/native/xpu/sycl/QRKernel.cpp @@ -11,8 +11,9 @@ void linalg_qr_kernel( std::string_view mode, const at::Tensor& Q, const at::Tensor& R) { - TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor"); - TORCH_CHECK(A.dtype() == at::kFloat, "a must be float"); + + //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor"); + //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float"); at::Tensor a_contig = A.contiguous(); at::Tensor result_r = at::clone(a_contig); @@ -42,11 +43,10 @@ void linalg_qr_kernel( v = std::vector({0, 0}); } auto q_dimensions = at::IntArrayRef(v); - std::cout << "Q SIZE " << q_dimensions << " " << dimensions << " " - << std::endl; + at::Tensor result_q = at::empty(q_dimensions, options); - std::cout << "Q SIZE " << q_dimensions << " " << dimensions << std::endl; + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); @@ -79,7 +79,6 @@ void linalg_qr_kernel( queue, n, out_q_columns, - // out_q_columns, tau_len, q_buf, n, @@ -106,9 +105,6 @@ void linalg_qr_kernel( .contiguous(); } - // result_q.transpose(0,1); - // Q.set_(result_q.transpose(-2, -1).to("xpu")); - // R.set_(result_r.transpose(-2, -1).triu_().to("xpu")); Q.set_(result_q.transpose(-2, -1)); R.set_(result_r.transpose(-2, -1).triu_()); queue.wait(); From 980732bdde4f83a5c2ef8ea10293e51c2c9b6814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 24 Nov 2025 12:33:06 +0000 Subject: [PATCH 11/15] Enable more dev tests --- test/xpu/test_linalg_qr.py | 45 +++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py index 83e94e9853..2aaffab7bd 100644 --- a/test/xpu/test_linalg_qr.py +++ b/test/xpu/test_linalg_qr.py @@ -1,26 +1,41 @@ -import torch import pytest +import torch + @pytest.mark.parametrize("dtype", [torch.float32]) -@pytest.mark.parametrize("mode", ['reduced', 'complete', 'r']) -@pytest.mark.parametrize("shape", [(5, 3), (2, 3, 5), (2, 3, 4, 4), (2, 1, 1)]) -# @pytest.mark.parametrize("shape", [ -# # 2D matrices -# (5, 3), (3, 5), (8, 6), (6, 8), -# # 3D batched matrices -# (2, 3, 3), (3, 4, 4), (2, 5, 3), (2, 3, 5), -# # 4D batched matrices -# (2, 3, 4, 4), (1, 2, 5, 3), (2, 1, 3, 5), -# # Edge cases -# (1, 1), (10, 1), (1, 10), (2, 1, 1), (1, 1, 1) -# ]) +@pytest.mark.parametrize("mode", ["reduced", "complete", "r"]) +@pytest.mark.parametrize( + "shape", + [ + # 2D matrices + (5, 3), + (3, 5), + (8, 6), + (6, 8), + # 3D batched matrices + (2, 3, 3), + (3, 4, 4), + (2, 5, 3), + (2, 3, 5), + # 4D batched matrices + (2, 3, 4, 4), + (1, 2, 5, 3), + (2, 1, 3, 5), + # Edge cases + (1, 1), + (10, 1), + (1, 10), + (2, 1, 1), + (1, 1, 1), + ], +) def test_linalg_qr(dtype, mode, shape): A = torch.randn(shape, dtype=dtype) - A_xpu = A.to('xpu') + A_xpu = A.to("xpu") Q, R = torch.linalg.qr(A, mode=mode) Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode) assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5) assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5) - assert Q_xpu.device.type == 'xpu' and R_xpu.device.type == 'xpu' + assert Q_xpu.device.type == "xpu" and R_xpu.device.type == "xpu" From e8b49e7a721412f277dbee0a88a53f8554c1b2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 26 Nov 2025 11:21:14 +0000 Subject: [PATCH 12/15] Move QR to mkl --- src/ATen/native/xpu/BatchLinearAlgebra.cpp | 18 +++ src/ATen/native/xpu/QR.cpp | 14 --- .../native/xpu/mkl/BatchLinearAlgebra.cpp | 104 ++++++++++++++++ src/ATen/native/xpu/mkl/BatchLinearAlgebra.h | 6 + src/ATen/native/xpu/sycl/QRKernel.cpp | 113 ------------------ src/ATen/native/xpu/sycl/QRKernel.h | 13 -- 6 files changed, 128 insertions(+), 140 deletions(-) delete mode 100644 src/ATen/native/xpu/QR.cpp delete mode 100644 src/ATen/native/xpu/sycl/QRKernel.cpp delete mode 100644 src/ATen/native/xpu/sycl/QRKernel.h diff --git a/src/ATen/native/xpu/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/BatchLinearAlgebra.cpp index 8036419c07..aa38a8c39c 100644 --- a/src/ATen/native/xpu/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/BatchLinearAlgebra.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #if defined(USE_ONEMKL_XPU) #include #endif // USE_ONEMKL_XPU @@ -64,4 +65,21 @@ void lu_factor_kernel_xpu( REGISTER_XPU_DISPATCH(lu_factor_stub, &lu_factor_kernel_xpu); +TORCH_IMPL_FUNC(linalg_qr_xpu_out)(const Tensor& A, + std::string_view mode, + const Tensor & Q, + const Tensor & R) { +#if defined(USE_ONEMKL_XPU) + xpu::linalg_qr_kernel(A, mode, Q, R); +#else + auto A_cpu = A.to(A.options().device(kCPU)); + auto Q_cpu = Q.to(Q.options().device(kCPU)); + auto R_cpu = R.to(R.options().device(kCPU)); + at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode); + Q.copy_(Q_cpu); + R.copy_(R_cpu); +#endif // USE_ONEMKL_XPU +} + + } // namespace at::native diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp deleted file mode 100644 index 37d8896b20..0000000000 --- a/src/ATen/native/xpu/QR.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include -#include - -namespace at::native { - -TORCH_IMPL_FUNC(linalg_qr_xpu_out) -(const at::Tensor& A, - std::string_view mode, - const at::Tensor& Q, - const at::Tensor& R) { - xpu::linalg_qr_kernel(A, mode, Q, R); -} - -} // namespace at::native diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp index 26e80fa4d0..12425d6ad6 100644 --- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp @@ -561,4 +561,108 @@ void lu_factor_mkl( pivots.copy_(pivots_); } +void linalg_qr_kernel( + const at::Tensor& A, + std::string_view mode, + const at::Tensor& Q, + const at::Tensor& R) { + + //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor"); + //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float"); + + at::Tensor a_contig = A.contiguous(); + at::Tensor result_r = at::clone(a_contig); + + auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU); + auto dimensions = A.sizes(); + + result_r = result_r.transpose(-2, -1).contiguous(); + + int numel = a_contig.numel(); + int range = a_contig.dim(); + int64_t n = a_contig.sizes().at(range - 2); + int64_t m = a_contig.sizes().at(range - 1); + int64_t mn = int64_t(m * n); + int64_t b = numel / mn; + + int out_q_columns = m > n ? n : m; + if (n > m && mode == "complete") { + out_q_columns = n; + } + + std::vector v(dimensions.begin(), dimensions.end()); + if (mode != "r") { + v[range - 1] = v[range - 2]; + v[range - 2] = out_q_columns; + } else { + v = std::vector({0, 0}); + } + auto q_dimensions = at::IntArrayRef(v); + + at::Tensor result_q = at::empty(q_dimensions, options); + + + + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); + + int64_t bufsize1 = + oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); + int64_t bufsize2 = + oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); + + int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1; + int64_t tau_len = m > n ? n : m; + float* sbuffer = sycl::malloc_device(bufsize, queue); + float* tau_buf = sycl::malloc_device(tau_len, queue); + float* r_buf = result_r.data_ptr(); + + float* q_buf = NULL; + if (mode != "r") { + q_buf = result_q.data_ptr(); + } + + for (int batch_item = 0; batch_item < b; batch_item++) { + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize) + .wait(); + + if (mode != "r") { + // copy relevant part of R matrix to Q matrix + int copy_columns = out_q_columns > m ? m : out_q_columns; + queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + + oneapi::mkl::lapack::orgqr( + queue, + n, + out_q_columns, + tau_len, + q_buf, + n, + tau_buf, + sbuffer, + bufsize) + .wait(); + + q_buf += n * out_q_columns; + } + + r_buf += mn; + + } // batch + + sycl::free(sbuffer, queue); + sycl::free(tau_buf, queue); + + if ((mode == "reduced" || mode == "r") && n > m) { + result_r = + result_r + .index( + {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) + .contiguous(); + } + + Q.set_(result_q.transpose(-2, -1)); + R.set_(result_r.transpose(-2, -1).triu_()); + queue.wait(); +} + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h index c1cc1da5c6..ef846c4d6b 100644 --- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h +++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h @@ -16,4 +16,10 @@ TORCH_XPU_API void lu_factor_mkl( const Tensor& info, bool pivot); +TORCH_XPU_API void linalg_qr_kernel( + const at::Tensor& A, + std::string_view mode, + const at::Tensor& Q, + const at::Tensor& R); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp deleted file mode 100644 index 761744a8a8..0000000000 --- a/src/ATen/native/xpu/sycl/QRKernel.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include -#include -#include -#include -#include - -namespace at::native::xpu { - -void linalg_qr_kernel( - const at::Tensor& A, - std::string_view mode, - const at::Tensor& Q, - const at::Tensor& R) { - - //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor"); - //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float"); - - at::Tensor a_contig = A.contiguous(); - at::Tensor result_r = at::clone(a_contig); - - auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU); - auto dimensions = A.sizes(); - - result_r = result_r.transpose(-2, -1).contiguous(); - - int numel = a_contig.numel(); - int range = a_contig.dim(); - int64_t n = a_contig.sizes().at(range - 2); - int64_t m = a_contig.sizes().at(range - 1); - int64_t mn = int64_t(m * n); - int64_t b = numel / mn; - - int out_q_columns = m > n ? n : m; - if (n > m && mode == "complete") { - out_q_columns = n; - } - - std::vector v(dimensions.begin(), dimensions.end()); - if (mode != "r") { - v[range - 1] = v[range - 2]; - v[range - 2] = out_q_columns; - } else { - v = std::vector({0, 0}); - } - auto q_dimensions = at::IntArrayRef(v); - - at::Tensor result_q = at::empty(q_dimensions, options); - - - - sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); - - int64_t bufsize1 = - oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); - int64_t bufsize2 = - oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); - - int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1; - int64_t tau_len = m > n ? n : m; - float* sbuffer = sycl::malloc_device(bufsize, queue); - float* tau_buf = sycl::malloc_device(tau_len, queue); - float* r_buf = result_r.data_ptr(); - - float* q_buf = NULL; - if (mode != "r") { - q_buf = result_q.data_ptr(); - } - - for (int batch_item = 0; batch_item < b; batch_item++) { - oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize) - .wait(); - - if (mode != "r") { - // copy relevant part of R matrix to Q matrix - int copy_columns = out_q_columns > m ? m : out_q_columns; - queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); - - oneapi::mkl::lapack::orgqr( - queue, - n, - out_q_columns, - tau_len, - q_buf, - n, - tau_buf, - sbuffer, - bufsize) - .wait(); - - q_buf += n * out_q_columns; - } - - r_buf += mn; - - } // batch - - sycl::free(sbuffer, queue); - sycl::free(tau_buf, queue); - - if ((mode == "reduced" || mode == "r") && n > m) { - result_r = - result_r - .index( - {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)}) - .contiguous(); - } - - Q.set_(result_q.transpose(-2, -1)); - R.set_(result_r.transpose(-2, -1).triu_()); - queue.wait(); -} - -} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h deleted file mode 100644 index 93696feed7..0000000000 --- a/src/ATen/native/xpu/sycl/QRKernel.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include - -namespace at::native::xpu { - -TORCH_XPU_API void linalg_qr_kernel( - const at::Tensor& A, - std::string_view mode, - const at::Tensor& Q, - const at::Tensor& R); - -} // namespace at::native::xpu From 419eb5731cdd1fb3df1b1b448c0cff5361d4c449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 26 Nov 2025 12:10:02 +0000 Subject: [PATCH 13/15] Port ut to test_linalg_xpu --- src/ATen/native/xpu/BatchLinearAlgebra.cpp | 2 +- test/xpu/test_linalg_qr.py | 41 ----------------- test/xpu/test_linalg_xpu.py | 53 ++++++++++++++++++++++ 3 files changed, 54 insertions(+), 42 deletions(-) delete mode 100644 test/xpu/test_linalg_qr.py diff --git a/src/ATen/native/xpu/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/BatchLinearAlgebra.cpp index aa38a8c39c..ea56200f12 100644 --- a/src/ATen/native/xpu/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/BatchLinearAlgebra.cpp @@ -75,7 +75,7 @@ TORCH_IMPL_FUNC(linalg_qr_xpu_out)(const Tensor& A, auto A_cpu = A.to(A.options().device(kCPU)); auto Q_cpu = Q.to(Q.options().device(kCPU)); auto R_cpu = R.to(R.options().device(kCPU)); - at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode); + at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode); Q.copy_(Q_cpu); R.copy_(R_cpu); #endif // USE_ONEMKL_XPU diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py deleted file mode 100644 index 2aaffab7bd..0000000000 --- a/test/xpu/test_linalg_qr.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest -import torch - - -@pytest.mark.parametrize("dtype", [torch.float32]) -@pytest.mark.parametrize("mode", ["reduced", "complete", "r"]) -@pytest.mark.parametrize( - "shape", - [ - # 2D matrices - (5, 3), - (3, 5), - (8, 6), - (6, 8), - # 3D batched matrices - (2, 3, 3), - (3, 4, 4), - (2, 5, 3), - (2, 3, 5), - # 4D batched matrices - (2, 3, 4, 4), - (1, 2, 5, 3), - (2, 1, 3, 5), - # Edge cases - (1, 1), - (10, 1), - (1, 10), - (2, 1, 1), - (1, 1, 1), - ], -) -def test_linalg_qr(dtype, mode, shape): - A = torch.randn(shape, dtype=dtype) - A_xpu = A.to("xpu") - - Q, R = torch.linalg.qr(A, mode=mode) - Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode) - - assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5) - assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5) - assert Q_xpu.device.type == "xpu" and R_xpu.device.type == "xpu" diff --git a/test/xpu/test_linalg_xpu.py b/test/xpu/test_linalg_xpu.py index c7b88ccc9e..aa1fb745e4 100644 --- a/test/xpu/test_linalg_xpu.py +++ b/test/xpu/test_linalg_xpu.py @@ -473,6 +473,57 @@ def __tunableop_ctx(self): pass +@parametrize("batch", [1, 3]) +@parametrize("m", [0, 1, 12]) +@parametrize("n", [0, 1, 17]) +@dtypes(torch.float32) +def qr_mode_r(self, device, dtype, batch, m, n): + if batch > 1: + A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu") + else: + A_cpu = torch.randn(m, n, dtype=dtype, device="cpu") + A_xpu = A_cpu.to(device) + + R_cpu = torch.linalg.qr(A_cpu, mode="r").R + R_xpu = torch.linalg.qr(A_xpu, mode="r").R + self.assertEqual(R_xpu, R_cpu, atol=1e-5, rtol=1e-5) + + # Verify that R is upper triangular + lower_triangle = torch.tril(R_xpu, diagonal=-1) + self.assertEqual(lower_triangle.sum(), 0.0, atol=0.0, rtol=0.0) + + +@parametrize("batch", [1, 3]) +@parametrize("m", [0, 1, 12]) +@parametrize("n", [0, 1, 17]) +@parametrize("mode", ["reduced", "complete"]) +@dtypes(torch.float32) +def qr_modes_reduced_complete(self, device, dtype, batch, m, n, mode): + if batch > 1: + A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu") + else: + A_cpu = torch.randn(m, n, dtype=dtype, device="cpu") + A_xpu = A_cpu.to(device) + + Q_cpu, R_cpu = torch.linalg.qr(A_cpu, mode=mode) + Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode) + + self.assertEqual(Q_xpu, Q_cpu, atol=1e-5, rtol=1e-5) + self.assertEqual(R_xpu, R_cpu, atol=1e-5, rtol=1e-5) + + # Verify Q is orthogonal: Q^T @ Q should be identity + QTQ_xpu = torch.matmul(Q_xpu.mT, Q_xpu) + k = min(m, n) if mode == "reduced" else m + identity = torch.eye(k, dtype=dtype, device=device) + if batch > 1: + identity = identity.expand(batch, k, k) + self.assertEqual(QTQ_xpu, identity, atol=1e-5, rtol=1e-5) + + # Verify that R is upper triangular + lower_triangle = torch.tril(R_xpu, diagonal=-1) + self.assertEqual(lower_triangle.sum(), 0.0, atol=0.0, rtol=0.0) + + with XPUPatchForImport(False): from test_linalg import TestLinalg @@ -493,6 +544,8 @@ def __tunableop_ctx(self): TestLinalg.test_ck_blas_library = ck_blas_library TestLinalg.test_addmm_relu_tunableop_rocm = addmm_relu_tunableop_rocm TestLinalg._tunableop_ctx = __tunableop_ctx +TestLinalg.test_qr_mode_r = qr_mode_r +TestLinalg.test_qr_modes_reduced_complete = qr_modes_reduced_complete TestLinalg._default_dtype_check_enabled = True instantiate_device_type_tests(TestLinalg, globals(), only_for=("xpu"), allow_xpu=True) From 7906ef245d8eee63026a1b800464b381f3dca623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 26 Nov 2025 13:33:11 +0000 Subject: [PATCH 14/15] Fix the non-MKL fallback path --- src/ATen/native/xpu/BatchLinearAlgebra.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/BatchLinearAlgebra.cpp index ea56200f12..e50a5cf4b4 100644 --- a/src/ATen/native/xpu/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/BatchLinearAlgebra.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #if defined(USE_ONEMKL_XPU) #include #endif // USE_ONEMKL_XPU @@ -72,10 +73,10 @@ TORCH_IMPL_FUNC(linalg_qr_xpu_out)(const Tensor& A, #if defined(USE_ONEMKL_XPU) xpu::linalg_qr_kernel(A, mode, Q, R); #else - auto A_cpu = A.to(A.options().device(kCPU)); - auto Q_cpu = Q.to(Q.options().device(kCPU)); - auto R_cpu = R.to(R.options().device(kCPU)); - at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode); + auto A_cpu = A.to(at::kCPU); + auto Q_cpu = at::empty_like(Q, at::kCPU); + auto R_cpu = at::empty_like(R, at::kCPU); + at::cpu::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode); Q.copy_(Q_cpu); R.copy_(R_cpu); #endif // USE_ONEMKL_XPU From c995189fc9e528cbb1c7505b46724e98258ab3ff Mon Sep 17 00:00:00 2001 From: Michal Wiktor Date: Mon, 1 Dec 2025 14:29:53 +0000 Subject: [PATCH 15/15] Production ready fused QR kernel --- .../native/xpu/mkl/BatchLinearAlgebra.cpp | 57 +++++++++++++------ test/xpu/test_linalg_xpu.py | 8 +-- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp index 12425d6ad6..3b75487429 100644 --- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp @@ -561,19 +561,19 @@ void lu_factor_mkl( pivots.copy_(pivots_); } -void linalg_qr_kernel( + +template +void linalg_qr_kernel_impl( const at::Tensor& A, std::string_view mode, const at::Tensor& Q, const at::Tensor& R) { - //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor"); - //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float"); at::Tensor a_contig = A.contiguous(); at::Tensor result_r = at::clone(a_contig); - auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU); + auto options = at::TensorOptions().dtype(A.dtype()).device(kXPU); auto dimensions = A.sizes(); result_r = result_r.transpose(-2, -1).contiguous(); @@ -583,7 +583,15 @@ void linalg_qr_kernel( int64_t n = a_contig.sizes().at(range - 2); int64_t m = a_contig.sizes().at(range - 1); int64_t mn = int64_t(m * n); - int64_t b = numel / mn; + int64_t b = numel ==0 ? 0 : numel / mn; + + + if (b==0 && mode=="complete" && n>0) { + b=1; + for (int dimension=0; dimension n ? n : m; if (n > m && mode == "complete") { @@ -595,7 +603,7 @@ void linalg_qr_kernel( v[range - 1] = v[range - 2]; v[range - 2] = out_q_columns; } else { - v = std::vector({0, 0}); + v = std::vector({0}); } auto q_dimensions = at::IntArrayRef(v); @@ -606,29 +614,32 @@ void linalg_qr_kernel( sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); int64_t bufsize1 = - oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n, m, n); + oneapi::mkl::lapack::geqrf_scratchpad_size(queue, n+1, m+1, n+1); int64_t bufsize2 = - oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n, m, m, n); + oneapi::mkl::lapack::orgqr_scratchpad_size(queue, n+1, m+1, m+1, n+1); int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1; int64_t tau_len = m > n ? n : m; - float* sbuffer = sycl::malloc_device(bufsize, queue); - float* tau_buf = sycl::malloc_device(tau_len, queue); - float* r_buf = result_r.data_ptr(); + scalar_t* sbuffer = sycl::malloc_device(bufsize, queue); + scalar_t* tau_buf = sycl::malloc_device(tau_len, queue); + scalar_t* r_buf = result_r.data_ptr(); - float* q_buf = NULL; + scalar_t* q_buf = nullptr; if (mode != "r") { - q_buf = result_q.data_ptr(); + q_buf = result_q.data_ptr(); } for (int batch_item = 0; batch_item < b; batch_item++) { - oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize) + + + if (mn!=0) // make QR if there is something to orthogonalize + oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize) .wait(); if (mode != "r") { // copy relevant part of R matrix to Q matrix int copy_columns = out_q_columns > m ? m : out_q_columns; - queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait(); + queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(scalar_t)).wait(); oneapi::mkl::lapack::orgqr( queue, @@ -660,9 +671,23 @@ void linalg_qr_kernel( .contiguous(); } - Q.set_(result_q.transpose(-2, -1)); + // normal case, non-zero dimensions + if (mode!="r") { + result_q.transpose_(-2, -1); + } + Q.set_(result_q); R.set_(result_r.transpose(-2, -1).triu_()); queue.wait(); } +void linalg_qr_kernel( + const at::Tensor& A, + std::string_view mode, + const at::Tensor& Q, + const at::Tensor& R) { + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "linalg_qr_xpu", [&] { + linalg_qr_kernel_impl(A, mode, Q, R); + }); +} } // namespace at::native::xpu + // diff --git a/test/xpu/test_linalg_xpu.py b/test/xpu/test_linalg_xpu.py index aa1fb745e4..edc70c25ce 100644 --- a/test/xpu/test_linalg_xpu.py +++ b/test/xpu/test_linalg_xpu.py @@ -474,9 +474,9 @@ def __tunableop_ctx(self): @parametrize("batch", [1, 3]) -@parametrize("m", [0, 1, 12]) -@parametrize("n", [0, 1, 17]) -@dtypes(torch.float32) +@parametrize("m", [1, 12]) +@parametrize("n", [1, 17]) +@dtypes(torch.float32, torch.float64) def qr_mode_r(self, device, dtype, batch, m, n): if batch > 1: A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu") @@ -497,7 +497,7 @@ def qr_mode_r(self, device, dtype, batch, m, n): @parametrize("m", [0, 1, 12]) @parametrize("n", [0, 1, 17]) @parametrize("mode", ["reduced", "complete"]) -@dtypes(torch.float32) +@dtypes(torch.float32, torch.float64) def qr_modes_reduced_complete(self, device, dtype, batch, m, n, mode): if batch > 1: A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu")