From 78e95cd989984f97309e688402b1314e0ef5dbb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Wed, 29 Oct 2025 20:18:12 +0100
Subject: [PATCH 01/15] mm

---
 src/ATen/native/xpu/#Repro.cpp#   | 141 ++++++++++++++++++++++++
 src/ATen/native/xpu/Repro.cpp     | 133 ++++++++++++++++++++++
 src/ATen/native/xpu/Repro.cpp~    | 128 +++++++++++++++++++++
 src/ATen/native/xpu/repro.sycl    | 177 ++++++++++++++++++++++++++++++
 yaml/native/native_functions.yaml |   6 +
 5 files changed, 585 insertions(+)
 create mode 100644 src/ATen/native/xpu/#Repro.cpp#
 create mode 100644 src/ATen/native/xpu/Repro.cpp
 create mode 100644 src/ATen/native/xpu/Repro.cpp~
 create mode 100644 src/ATen/native/xpu/repro.sycl
diff --git a/src/ATen/native/xpu/#Repro.cpp# b/src/ATen/native/xpu/#Repro.cpp#
new file mode 100644
index 0000000000..fdc17d1ee7
--- /dev/null
+++ b/src/ATen/native/xpu/#Repro.cpp#
@@ -0,0 +1,141 @@
+#include <ATen/Operators.h>
+#include <c10/xpu/XPUStream.h>
+#include <sycl/sycl.hpp>
+#include <torch/all.h>
+#include <torch/library.h>
+//#include <mkl_lapack.h>
+#include <oneapi/mkl/lapack.hpp>
+#include <cstdint>
+
+// namespace qr_issue {
+
+namespace at {
+namespace native {
+
+
+std::tuple<at::Tensor, at::Tensor> local_qr(
+    const at::Tensor& a,
+    std::optional<std::string_view> mode_) {
+  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
+  // std::endl;
+
+  std::string mode = std::string(mode_.value_or("reduced"));
+
+  TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
+  at::Tensor a_contig = a.contiguous();
+  at::Tensor result_r = at::clone(a_contig);
+  //  at::Tensor result_r = at::empty_like(a_contig);
+  at::Tensor result_c = at::empty_like(a_contig);
+  at::Tensor result = at::empty_like(a_contig);
+
+  auto dimensions = a.sizes();
+
+  std::cout << "dim " << dimensions << std::endl;
+
+  std::cout << result_r << std::endl;
+  result_r = result_r.transpose(-2, -1).contiguous();
+  std::cout << result_r << std::endl;
+  int numel = a_contig.numel();
+  int range = a_contig.dim();
+  int64_t n = a_contig.sizes().at(range - 2);
+  int64_t m = a_contig.sizes().at(range - 1);
+  int64_t mn = int64_t(m * n);
+  int64_t b = numel / mn;
+
+  int out_q_columns = m > n ? n : m;
+  if (n > m && mode == "complete") {
+    out_q_columns = n;
+  }
+
+  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
+            << std::endl;
+  // at::Tensor result_q = result_r.clone();
+  // dimensions[1]=out_q_columns;
+  std::vector v(dimensions.begin(), dimensions.end());
+  v[range - 1] = v[range - 2];
+  v[range - 2] = out_q_columns;
+  auto ndimensions = at::IntArrayRef(v);
+  at::Tensor result_q = at::empty(ndimensions);
+
+  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+  int64_t mn1 =
+      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
+  int64_t mn2 =
+      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
+  mn2 = mn1 > mn2 ? mn1 : mn2;
+  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
+  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
+  float* r_buf = result_r.data_ptr<float>();
+  float* q_buf = result_q.data_ptr<float>();
+
+  // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
+  // "complete") << std::endl;
+
+  for (int batch_item = 0; batch_item < b; batch_item++) {
+    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
+
+    if (mode != "r") {
+      // copy relevant part of R matrix to Q matrix
+      int copy_columns = out_q_columns > m ? m : out_q_columns;
+      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+
+      oneapi::mkl::lapack::orgqr(
+          queue,
+          n,
+          out_q_columns,
+          out_q_columns,
+          q_buf,
+          n,
+          tau_buf,
+          sbuffer,
+          mn2);
+      std::cout << "done" << std::endl;
+
+      sycl::free(sbuffer, queue);
+      std::cout << "done2" << std::endl;
+    }
+
+    r_buf += mn;
+    q_buf += n * out_q_columns;
+
+  } // batch
+
+  if (mode == "r") {
+    result_q = at::empty({0, 0});
+  }
+
+  if ((mode == "reduced" || mode == "r") && n > m) {
+    result_r =
+        result_r
+            .index(
+                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
+            .contiguous();
+  }
+
+  // result_q.transpose(0,1);
+  return std::make_tuple(
+      result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
+  //}
+}
+  // Defines the operators
+  // TORCH_LIBRARY(qr_issue, m) {
+  //   m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
+  // }
+
+  // ==================================================
+  // Register SYCL Implementations to Torch Library
+  // ==================================================
+  // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
+  //   m.impl("local_qr", &_qr_xpu);
+  // }
+
+ // namespace qr_issue
+}}
+
+
+//cd torch-xpu-ops
+//git checkout -B dev_wiktor
+//git add .
+//git commit -m "jakas nazwa"
+//git push
+
diff --git a/src/ATen/native/xpu/Repro.cpp b/src/ATen/native/xpu/Repro.cpp
new file mode 100644
index 0000000000..5653b76295
--- /dev/null
+++ b/src/ATen/native/xpu/Repro.cpp
@@ -0,0 +1,133 @@
+#include <ATen/Operators.h>
+#include <c10/xpu/XPUStream.h>
+#include <sycl/sycl.hpp>
+#include <torch/all.h>
+#include <torch/library.h>
+//#include <mkl_lapack.h>
+#include <oneapi/mkl/lapack.hpp>
+#include <cstdint>
+
+// namespace qr_issue {
+
+namespace at {
+namespace native {
+
+std::tuple<at::Tensor, at::Tensor> local_qr(
+    const at::Tensor& a,
+    std::optional<std::string_view> mode_) {
+  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
+  // std::endl;
+
+  std::string mode = std::string(mode_.value_or("reduced"));
+
+  TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
+  at::Tensor a_contig = a.contiguous();
+  at::Tensor result_r = at::clone(a_contig);
+  //  at::Tensor result_r = at::empty_like(a_contig);
+  at::Tensor result_c = at::empty_like(a_contig);
+  at::Tensor result = at::empty_like(a_contig);
+
+  auto dimensions = a.sizes();
+
+  std::cout << "dim " << dimensions << std::endl;
+
+  std::cout << result_r << std::endl;
+  result_r = result_r.transpose(-2, -1).contiguous();
+  std::cout << result_r << std::endl;
+  int numel = a_contig.numel();
+  int range = a_contig.dim();
+  int64_t n = a_contig.sizes().at(range - 2);
+  int64_t m = a_contig.sizes().at(range - 1);
+  int64_t mn = int64_t(m * n);
+  int64_t b = numel / mn;
+
+  int out_q_columns = m > n ? n : m;
+  if (n > m && mode == "complete") {
+    out_q_columns = n;
+  }
+
+  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
+            << std::endl;
+  // at::Tensor result_q = result_r.clone();
+  // dimensions[1]=out_q_columns;
+  std::vector v(dimensions.begin(), dimensions.end());
+  v[range - 1] = v[range - 2];
+  v[range - 2] = out_q_columns;
+  auto ndimensions = at::IntArrayRef(v);
+  at::Tensor result_q = at::empty(ndimensions);
+
+  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+  int64_t mn1 =
+      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
+  int64_t mn2 =
+      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
+  mn2 = mn1 > mn2 ? mn1 : mn2;
+  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
+  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
+  float* r_buf = result_r.data_ptr<float>();
+  float* q_buf = result_q.data_ptr<float>();
+
+  // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
+  // "complete") << std::endl;
+
+  for (int batch_item = 0; batch_item < b; batch_item++) {
+    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
+
+    if (mode != "r") {
+      // copy relevant part of R matrix to Q matrix
+      int copy_columns = out_q_columns > m ? m : out_q_columns;
+      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+
+      oneapi::mkl::lapack::orgqr(
+          queue,
+          n,
+          out_q_columns,
+          out_q_columns,
+          q_buf,
+          n,
+          tau_buf,
+          sbuffer,
+          mn2);
+      std::cout << "done" << std::endl;
+
+      sycl::free(sbuffer, queue);
+      std::cout << "done2" << std::endl;
+    }
+
+    r_buf += mn;
+    q_buf += n * out_q_columns;
+
+  } // batch
+
+  if (mode == "r") {
+    result_q = at::empty({0, 0});
+  }
+
+  if ((mode == "reduced" || mode == "r") && n > m) {
+    result_r =
+        result_r
+            .index(
+                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
+            .contiguous();
+  }
+
+  // result_q.transpose(0,1);
+  return std::make_tuple(
+      result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
+  //}
+}
+// Defines the operators
+// TORCH_LIBRARY(qr_issue, m) {
+//   m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
+// }
+
+// ==================================================
+// Register SYCL Implementations to Torch Library
+// ==================================================
+// TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
+//   m.impl("local_qr", &_qr_xpu);
+// }
+
+// namespace qr_issue
+} // namespace native
+} // namespace at
diff --git a/src/ATen/native/xpu/Repro.cpp~ b/src/ATen/native/xpu/Repro.cpp~
new file mode 100644
index 0000000000..a714102800
--- /dev/null
+++ b/src/ATen/native/xpu/Repro.cpp~
@@ -0,0 +1,128 @@
+#include <ATen/Operators.h>
+#include <c10/xpu/XPUStream.h>
+#include <sycl/sycl.hpp>
+#include <torch/all.h>
+#include <torch/library.h>
+//#include <mkl_lapack.h>
+#include <oneapi/mkl/lapack.hpp>
+#include <cstdint>
+
+// namespace qr_issue {
+
+std::tuple<at::Tensor, at::Tensor> local_qr(
+    const at::Tensor& a,
+    std::optional<std::string_view> mode_) {
+  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
+  // std::endl;
+
+  std::string mode = std::string(mode_.value_or("reduced"));
+
+  TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
+  at::Tensor a_contig = a.contiguous();
+  at::Tensor result_r = at::clone(a_contig);
+  //  at::Tensor result_r = at::empty_like(a_contig);
+  at::Tensor result_c = at::empty_like(a_contig);
+  at::Tensor result = at::empty_like(a_contig);
+
+  auto dimensions = a.sizes();
+
+  std::cout << "dim " << dimensions << std::endl;
+
+  std::cout << result_r << std::endl;
+  result_r = result_r.transpose(-2, -1).contiguous();
+  std::cout << result_r << std::endl;
+  int numel = a_contig.numel();
+  int range = a_contig.dim();
+  int64_t n = a_contig.sizes().at(range - 2);
+  int64_t m = a_contig.sizes().at(range - 1);
+  int64_t mn = int64_t(m * n);
+  int64_t b = numel / mn;
+
+  int out_q_columns = m > n ? n : m;
+  if (n > m && mode == "complete") {
+    out_q_columns = n;
+  }
+
+  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
+            << std::endl;
+  // at::Tensor result_q = result_r.clone();
+  // dimensions[1]=out_q_columns;
+  std::vector v(dimensions.begin(), dimensions.end());
+  v[range - 1] = v[range - 2];
+  v[range - 2] = out_q_columns;
+  auto ndimensions = at::IntArrayRef(v);
+  at::Tensor result_q = at::empty(ndimensions);
+
+  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+  int64_t mn1 =
+      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
+  int64_t mn2 =
+      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
+  mn2 = mn1 > mn2 ? mn1 : mn2;
+  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
+  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
+  float* r_buf = result_r.data_ptr<float>();
+  float* q_buf = result_q.data_ptr<float>();
+
+  // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
+  // "complete") << std::endl;
+
+  for (int batch_item = 0; batch_item < b; batch_item++) {
+    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
+
+    if (mode != "r") {
+      // copy relevant part of R matrix to Q matrix
+      int copy_columns = out_q_columns > m ? m : out_q_columns;
+      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+
+      oneapi::mkl::lapack::orgqr(
+          queue,
+          n,
+          out_q_columns,
+          out_q_columns,
+          q_buf,
+          n,
+          tau_buf,
+          sbuffer,
+          mn2);
+      std::cout << "done" << std::endl;
+
+      sycl::free(sbuffer, queue);
+      std::cout << "done2" << std::endl;
+    }
+
+    r_buf += mn;
+    q_buf += n * out_q_columns;
+
+  } // batch
+
+  if (mode == "r") {
+    result_q = at::empty({0, 0});
+  }
+
+  if ((mode == "reduced" || mode == "r") && n > m) {
+    result_r =
+        result_r
+            .index(
+                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
+            .contiguous();
+  }
+
+  // result_q.transpose(0,1);
+  return std::make_tuple(
+      result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
+  //}
+
+  // Defines the operators
+  // TORCH_LIBRARY(qr_issue, m) {
+  //   m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
+  // }
+
+  // ==================================================
+  // Register SYCL Implementations to Torch Library
+  // ==================================================
+  // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
+  //   m.impl("local_qr", &_qr_xpu);
+  // }
+
+} // namespace qr_issue
diff --git a/src/ATen/native/xpu/repro.sycl b/src/ATen/native/xpu/repro.sycl
new file mode 100644
index 0000000000..63f3247740
--- /dev/null
+++ b/src/ATen/native/xpu/repro.sycl
@@ -0,0 +1,177 @@
+#include <c10/xpu/XPUStream.h>
+#include <sycl/sycl.hpp>
+#include <ATen/Operators.h>
+#include <torch/all.h>
+#include <torch/library.h>
+//#include <mkl_lapack.h>
+#include <oneapi/mkl/lapack.hpp>
+#include <cstdint>
+
+namespace  qr_issue {
+ 
+  std::tuple<at::Tensor,at::Tensor>   _qr_xpu(const at::Tensor& a,std::optional<std::string_view> mode_) {
+
+
+    //std::cout << "Call mode is " << " " << mode_.value_or("compressed") << std::endl;
+
+    std::string mode = std::string(mode_.value_or("reduced"));
+    
+    TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
+    at::Tensor a_contig = a.contiguous();
+    at::Tensor result_r = at::clone(a_contig);
+    //  at::Tensor result_r = at::empty_like(a_contig);
+    at::Tensor result_c = at::empty_like(a_contig);
+    at::Tensor result = at::empty_like(a_contig);
+
+    auto dimensions = a.sizes();
+
+    std::cout << "dim " << dimensions << std::endl;
+
+
+    
+    std::cout << result_r << std::endl;
+    result_r=result_r.transpose(-2,-1).contiguous();
+    std::cout << result_r << std::endl;
+    int numel = a_contig.numel();
+    int range = a_contig.dim();
+    int64_t n = a_contig.sizes().at(range-2);
+    int64_t m = a_contig.sizes().at(range-1);
+    int64_t mn = int64_t(m*n);
+    int64_t b = numel/mn;
+
+
+
+
+    int out_q_columns = m > n ? n : m;
+    if (n>m && mode=="complete") {
+      out_q_columns=n;
+    }
+
+    std::cout << "dim2 " << n << " " << m << " " << b  << " " <<
+      out_q_columns << std::endl;
+    //at::Tensor result_q = result_r.clone();
+    //dimensions[1]=out_q_columns;
+    std::vector v(dimensions.begin(),dimensions.end());
+    v[range-1]=v[range-2];
+    v[range-2]=out_q_columns;
+    auto ndimensions = at::IntArrayRef(v);
+    at::Tensor result_q=at::empty(ndimensions);    
+  
+    sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+    int64_t mn1 = oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue,n,m,n);
+    int64_t mn2 = oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue,n,m,m,n);
+    mn2 = mn1>mn2 ? mn1 : mn2;
+    float* sbuffer = sycl::malloc_device<float>(mn2,queue);
+    float* tau_buf = sycl::malloc_device<float>(out_q_columns,queue);
+    float* r_buf = result_r.data_ptr<float>();
+    float* q_buf = result_q.data_ptr<float>();
+  
+    //std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == "complete") << std::endl;
+
+
+    for (int batch_item=0; batch_item<b; batch_item++) {
+
+    
+    
+      oneapi::mkl::lapack::geqrf(queue,n,m,r_buf,n,tau_buf,sbuffer,mn2);
+
+
+      if (mode != "r") {
+    
+	// copy relevant part of R matrix to Q matrix
+	int copy_columns = out_q_columns>m ? m : out_q_columns;
+	queue.memcpy(q_buf,r_buf,n*copy_columns*sizeof(float)).wait();
+	
+
+	oneapi::mkl::lapack::orgqr(queue,n,out_q_columns,out_q_columns,q_buf,n,tau_buf,sbuffer,mn2);
+	std::cout << "done" << std::endl;
+	
+	sycl::free(sbuffer,queue);
+	std::cout << "done2" << std::endl;
+      
+      }
+
+      r_buf+=mn;
+      q_buf+=n*out_q_columns;
+      
+    } // batch
+
+    if (mode =="r") {
+      result_q=at::empty({0,0});
+    }
+    
+    if ((mode=="reduced" || mode =="r" ) && n>m) {
+      result_r = result_r.index({"...",at::indexing::Slice(0,n),at::indexing::Slice(0,m)}).contiguous();
+    }
+    
+    //result_q.transpose(0,1);
+    return std::make_tuple(result_q.transpose(-2,-1),result_r.transpose(-2,-1).triu_());
+
+  }
+
+ 
+
+// Defines the operators
+TORCH_LIBRARY(qr_issue, m) {
+  m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
+}
+
+// ==================================================
+// Register SYCL Implementations to Torch Library
+// ==================================================
+TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
+    m.impl("local_qr",&_qr_xpu);
+ }
+
+} // namespace sycl_extension
+
+
+
+//#include <c10/xpu/XPUStream.h>
+
+// original code, which compiles, but failed to link.
+
+//#include <sycl/sycl.hpp>
+//#include <ATen/Operators.h>
+//#include <torch/all.h>
+//#include <torch/library.h>
+
+//#include <mkl_lapack.h>
+
+/* #include <oneapi/mkl/lapack.hpp> */
+/* #include <cstdint> */
+
+/* using namespace sycl; */
+
+/* int main() */
+/* { */
+/*   queue q; */
+  
+  
+
+  
+/*   int numel = 25;  */
+/*   int64_t n = 5;  */
+/*   int64_t m = 5;  */
+/*   int64_t mn=m*n; */
+
+/*   std::cout << " dziala " << n << " " << m << " " << mn << std::endl; */
+/*   // at::Tensor tau      = at::empty({1,n}); */
+    
+/*   int64_t mn2 = oneapi::mkl::lapack::geqrf_scratchpad_size<float>(q,n,m,n); */
+/*   int64_t mn3 = oneapi::mkl::lapack::ormqr_scratchpad_size<float>(q,oneapi::mkl::side::left,oneapi::mkl::transpose::trans,n,m,n,n,n); */
+
+/*   sycl::buffer<float> sbuffer(mn2); */
+/*   sycl::buffer<float> res_q_ptr(numel); */
+/*   sycl::buffer<float> tau_ptr(numel); */
+/*   sycl::buffer<float> c_ptr(numel); */
+  
+
+/*   std::cout << " dziala " << n << " " << m << " " << mn << " " << mn2 << " " << mn3 <<  std::endl; */
+/*   oneapi::mkl::lapack::geqrf(q,n,m,res_q_ptr,n,tau_ptr,sbuffer,mn2); */
+/*   sycl::buffer<float> sbuffer2(mn3); */
+/*   oneapi::mkl::lapack::ormqr(q,oneapi::mkl::side::right,oneapi::mkl::transpose::nontrans,n,m,n,res_q_ptr,n,tau_ptr,c_ptr,n,sbuffer2,mn3); */
+/*     std::cout << "done" << std::endl; */
+  
+/*     return 0; */
+/* } */
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index a3281791de..f11752f456 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -9443,6 +9443,12 @@
 - func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor
   python_module: linalg
 
+- func: local_qr(Tensor a, str? mode_=None) -> (Tensor Q, Tensor R) 
+  python_module: linalg
+  structured: False
+  dispatch:
+    XPU: local_qr_xpu
+
 - func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info)
   python_module: linalg
   structured_delegate: linalg_inv_ex.inverse

From 60b868bda9c1dc4bccf3a9034fbe85caa271f451 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Wed, 29 Oct 2025 20:29:56 +0100
Subject: [PATCH 02/15] remove duplicates

---
 src/ATen/native/xpu/#Repro.cpp# | 141 -------------------------
 src/ATen/native/xpu/Repro.cpp~  | 128 -----------------------
 src/ATen/native/xpu/repro.sycl  | 177 --------------------------------
 3 files changed, 446 deletions(-)
 delete mode 100644 src/ATen/native/xpu/#Repro.cpp#
 delete mode 100644 src/ATen/native/xpu/Repro.cpp~
 delete mode 100644 src/ATen/native/xpu/repro.sycl

diff --git a/src/ATen/native/xpu/#Repro.cpp# b/src/ATen/native/xpu/#Repro.cpp#
deleted file mode 100644
index fdc17d1ee7..0000000000
--- a/src/ATen/native/xpu/#Repro.cpp#
+++ /dev/null
@@ -1,141 +0,0 @@
-#include <ATen/Operators.h>
-#include <c10/xpu/XPUStream.h>
-#include <sycl/sycl.hpp>
-#include <torch/all.h>
-#include <torch/library.h>
-//#include <mkl_lapack.h>
-#include <oneapi/mkl/lapack.hpp>
-#include <cstdint>
-
-// namespace qr_issue {
-
-namespace at {
-namespace native {
-
-
-std::tuple<at::Tensor, at::Tensor> local_qr(
-    const at::Tensor& a,
-    std::optional<std::string_view> mode_) {
-  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
-  // std::endl;
-
-  std::string mode = std::string(mode_.value_or("reduced"));
-
-  TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
-  at::Tensor a_contig = a.contiguous();
-  at::Tensor result_r = at::clone(a_contig);
-  //  at::Tensor result_r = at::empty_like(a_contig);
-  at::Tensor result_c = at::empty_like(a_contig);
-  at::Tensor result = at::empty_like(a_contig);
-
-  auto dimensions = a.sizes();
-
-  std::cout << "dim " << dimensions << std::endl;
-
-  std::cout << result_r << std::endl;
-  result_r = result_r.transpose(-2, -1).contiguous();
-  std::cout << result_r << std::endl;
-  int numel = a_contig.numel();
-  int range = a_contig.dim();
-  int64_t n = a_contig.sizes().at(range - 2);
-  int64_t m = a_contig.sizes().at(range - 1);
-  int64_t mn = int64_t(m * n);
-  int64_t b = numel / mn;
-
-  int out_q_columns = m > n ? n : m;
-  if (n > m && mode == "complete") {
-    out_q_columns = n;
-  }
-
-  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
-            << std::endl;
-  // at::Tensor result_q = result_r.clone();
-  // dimensions[1]=out_q_columns;
-  std::vector v(dimensions.begin(), dimensions.end());
-  v[range - 1] = v[range - 2];
-  v[range - 2] = out_q_columns;
-  auto ndimensions = at::IntArrayRef(v);
-  at::Tensor result_q = at::empty(ndimensions);
-
-  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
-  int64_t mn1 =
-      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
-  int64_t mn2 =
-      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
-  mn2 = mn1 > mn2 ? mn1 : mn2;
-  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
-  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
-  float* r_buf = result_r.data_ptr<float>();
-  float* q_buf = result_q.data_ptr<float>();
-
-  // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
-  // "complete") << std::endl;
-
-  for (int batch_item = 0; batch_item < b; batch_item++) {
-    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
-
-    if (mode != "r") {
-      // copy relevant part of R matrix to Q matrix
-      int copy_columns = out_q_columns > m ? m : out_q_columns;
-      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
-
-      oneapi::mkl::lapack::orgqr(
-          queue,
-          n,
-          out_q_columns,
-          out_q_columns,
-          q_buf,
-          n,
-          tau_buf,
-          sbuffer,
-          mn2);
-      std::cout << "done" << std::endl;
-
-      sycl::free(sbuffer, queue);
-      std::cout << "done2" << std::endl;
-    }
-
-    r_buf += mn;
-    q_buf += n * out_q_columns;
-
-  } // batch
-
-  if (mode == "r") {
-    result_q = at::empty({0, 0});
-  }
-
-  if ((mode == "reduced" || mode == "r") && n > m) {
-    result_r =
-        result_r
-            .index(
-                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
-            .contiguous();
-  }
-
-  // result_q.transpose(0,1);
-  return std::make_tuple(
-      result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
-  //}
-}
-  // Defines the operators
-  // TORCH_LIBRARY(qr_issue, m) {
-  //   m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
-  // }
-
-  // ==================================================
-  // Register SYCL Implementations to Torch Library
-  // ==================================================
-  // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
-  //   m.impl("local_qr", &_qr_xpu);
-  // }
-
- // namespace qr_issue
-}}
-
-
-//cd torch-xpu-ops
-//git checkout -B dev_wiktor
-//git add .
-//git commit -m "jakas nazwa"
-//git push
-
diff --git a/src/ATen/native/xpu/Repro.cpp~ b/src/ATen/native/xpu/Repro.cpp~
deleted file mode 100644
index a714102800..0000000000
--- a/src/ATen/native/xpu/Repro.cpp~
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <ATen/Operators.h>
-#include <c10/xpu/XPUStream.h>
-#include <sycl/sycl.hpp>
-#include <torch/all.h>
-#include <torch/library.h>
-//#include <mkl_lapack.h>
-#include <oneapi/mkl/lapack.hpp>
-#include <cstdint>
-
-// namespace qr_issue {
-
-std::tuple<at::Tensor, at::Tensor> local_qr(
-    const at::Tensor& a,
-    std::optional<std::string_view> mode_) {
-  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
-  // std::endl;
-
-  std::string mode = std::string(mode_.value_or("reduced"));
-
-  TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
-  at::Tensor a_contig = a.contiguous();
-  at::Tensor result_r = at::clone(a_contig);
-  //  at::Tensor result_r = at::empty_like(a_contig);
-  at::Tensor result_c = at::empty_like(a_contig);
-  at::Tensor result = at::empty_like(a_contig);
-
-  auto dimensions = a.sizes();
-
-  std::cout << "dim " << dimensions << std::endl;
-
-  std::cout << result_r << std::endl;
-  result_r = result_r.transpose(-2, -1).contiguous();
-  std::cout << result_r << std::endl;
-  int numel = a_contig.numel();
-  int range = a_contig.dim();
-  int64_t n = a_contig.sizes().at(range - 2);
-  int64_t m = a_contig.sizes().at(range - 1);
-  int64_t mn = int64_t(m * n);
-  int64_t b = numel / mn;
-
-  int out_q_columns = m > n ? n : m;
-  if (n > m && mode == "complete") {
-    out_q_columns = n;
-  }
-
-  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
-            << std::endl;
-  // at::Tensor result_q = result_r.clone();
-  // dimensions[1]=out_q_columns;
-  std::vector v(dimensions.begin(), dimensions.end());
-  v[range - 1] = v[range - 2];
-  v[range - 2] = out_q_columns;
-  auto ndimensions = at::IntArrayRef(v);
-  at::Tensor result_q = at::empty(ndimensions);
-
-  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
-  int64_t mn1 =
-      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
-  int64_t mn2 =
-      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
-  mn2 = mn1 > mn2 ? mn1 : mn2;
-  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
-  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
-  float* r_buf = result_r.data_ptr<float>();
-  float* q_buf = result_q.data_ptr<float>();
-
-  // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
-  // "complete") << std::endl;
-
-  for (int batch_item = 0; batch_item < b; batch_item++) {
-    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
-
-    if (mode != "r") {
-      // copy relevant part of R matrix to Q matrix
-      int copy_columns = out_q_columns > m ? m : out_q_columns;
-      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
-
-      oneapi::mkl::lapack::orgqr(
-          queue,
-          n,
-          out_q_columns,
-          out_q_columns,
-          q_buf,
-          n,
-          tau_buf,
-          sbuffer,
-          mn2);
-      std::cout << "done" << std::endl;
-
-      sycl::free(sbuffer, queue);
-      std::cout << "done2" << std::endl;
-    }
-
-    r_buf += mn;
-    q_buf += n * out_q_columns;
-
-  } // batch
-
-  if (mode == "r") {
-    result_q = at::empty({0, 0});
-  }
-
-  if ((mode == "reduced" || mode == "r") && n > m) {
-    result_r =
-        result_r
-            .index(
-                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
-            .contiguous();
-  }
-
-  // result_q.transpose(0,1);
-  return std::make_tuple(
-      result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
-  //}
-
-  // Defines the operators
-  // TORCH_LIBRARY(qr_issue, m) {
-  //   m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
-  // }
-
-  // ==================================================
-  // Register SYCL Implementations to Torch Library
-  // ==================================================
-  // TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
-  //   m.impl("local_qr", &_qr_xpu);
-  // }
-
-} // namespace qr_issue
diff --git a/src/ATen/native/xpu/repro.sycl b/src/ATen/native/xpu/repro.sycl
deleted file mode 100644
index 63f3247740..0000000000
--- a/src/ATen/native/xpu/repro.sycl
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <c10/xpu/XPUStream.h>
-#include <sycl/sycl.hpp>
-#include <ATen/Operators.h>
-#include <torch/all.h>
-#include <torch/library.h>
-//#include <mkl_lapack.h>
-#include <oneapi/mkl/lapack.hpp>
-#include <cstdint>
-
-namespace  qr_issue {
- 
-  std::tuple<at::Tensor,at::Tensor>   _qr_xpu(const at::Tensor& a,std::optional<std::string_view> mode_) {
-
-
-    //std::cout << "Call mode is " << " " << mode_.value_or("compressed") << std::endl;
-
-    std::string mode = std::string(mode_.value_or("reduced"));
-    
-    TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
-    at::Tensor a_contig = a.contiguous();
-    at::Tensor result_r = at::clone(a_contig);
-    //  at::Tensor result_r = at::empty_like(a_contig);
-    at::Tensor result_c = at::empty_like(a_contig);
-    at::Tensor result = at::empty_like(a_contig);
-
-    auto dimensions = a.sizes();
-
-    std::cout << "dim " << dimensions << std::endl;
-
-
-    
-    std::cout << result_r << std::endl;
-    result_r=result_r.transpose(-2,-1).contiguous();
-    std::cout << result_r << std::endl;
-    int numel = a_contig.numel();
-    int range = a_contig.dim();
-    int64_t n = a_contig.sizes().at(range-2);
-    int64_t m = a_contig.sizes().at(range-1);
-    int64_t mn = int64_t(m*n);
-    int64_t b = numel/mn;
-
-
-
-
-    int out_q_columns = m > n ? n : m;
-    if (n>m && mode=="complete") {
-      out_q_columns=n;
-    }
-
-    std::cout << "dim2 " << n << " " << m << " " << b  << " " <<
-      out_q_columns << std::endl;
-    //at::Tensor result_q = result_r.clone();
-    //dimensions[1]=out_q_columns;
-    std::vector v(dimensions.begin(),dimensions.end());
-    v[range-1]=v[range-2];
-    v[range-2]=out_q_columns;
-    auto ndimensions = at::IntArrayRef(v);
-    at::Tensor result_q=at::empty(ndimensions);    
-  
-    sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
-    int64_t mn1 = oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue,n,m,n);
-    int64_t mn2 = oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue,n,m,m,n);
-    mn2 = mn1>mn2 ? mn1 : mn2;
-    float* sbuffer = sycl::malloc_device<float>(mn2,queue);
-    float* tau_buf = sycl::malloc_device<float>(out_q_columns,queue);
-    float* r_buf = result_r.data_ptr<float>();
-    float* q_buf = result_q.data_ptr<float>();
-  
-    //std::cout << "entering " << n << " " << m << " " << mode << " " << (mode == "complete") << std::endl;
-
-
-    for (int batch_item=0; batch_item<b; batch_item++) {
-
-    
-    
-      oneapi::mkl::lapack::geqrf(queue,n,m,r_buf,n,tau_buf,sbuffer,mn2);
-
-
-      if (mode != "r") {
-    
-	// copy relevant part of R matrix to Q matrix
-	int copy_columns = out_q_columns>m ? m : out_q_columns;
-	queue.memcpy(q_buf,r_buf,n*copy_columns*sizeof(float)).wait();
-	
-
-	oneapi::mkl::lapack::orgqr(queue,n,out_q_columns,out_q_columns,q_buf,n,tau_buf,sbuffer,mn2);
-	std::cout << "done" << std::endl;
-	
-	sycl::free(sbuffer,queue);
-	std::cout << "done2" << std::endl;
-      
-      }
-
-      r_buf+=mn;
-      q_buf+=n*out_q_columns;
-      
-    } // batch
-
-    if (mode =="r") {
-      result_q=at::empty({0,0});
-    }
-    
-    if ((mode=="reduced" || mode =="r" ) && n>m) {
-      result_r = result_r.index({"...",at::indexing::Slice(0,n),at::indexing::Slice(0,m)}).contiguous();
-    }
-    
-    //result_q.transpose(0,1);
-    return std::make_tuple(result_q.transpose(-2,-1),result_r.transpose(-2,-1).triu_());
-
-  }
-
- 
-
-// Defines the operators
-TORCH_LIBRARY(qr_issue, m) {
-  m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
-}
-
-// ==================================================
-// Register SYCL Implementations to Torch Library
-// ==================================================
-TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
-    m.impl("local_qr",&_qr_xpu);
- }
-
-} // namespace sycl_extension
-
-
-
-//#include <c10/xpu/XPUStream.h>
-
-// original code, which compiles, but failed to link.
-
-//#include <sycl/sycl.hpp>
-//#include <ATen/Operators.h>
-//#include <torch/all.h>
-//#include <torch/library.h>
-
-//#include <mkl_lapack.h>
-
-/* #include <oneapi/mkl/lapack.hpp> */
-/* #include <cstdint> */
-
-/* using namespace sycl; */
-
-/* int main() */
-/* { */
-/*   queue q; */
-  
-  
-
-  
-/*   int numel = 25;  */
-/*   int64_t n = 5;  */
-/*   int64_t m = 5;  */
-/*   int64_t mn=m*n; */
-
-/*   std::cout << " dziala " << n << " " << m << " " << mn << std::endl; */
-/*   // at::Tensor tau      = at::empty({1,n}); */
-    
-/*   int64_t mn2 = oneapi::mkl::lapack::geqrf_scratchpad_size<float>(q,n,m,n); */
-/*   int64_t mn3 = oneapi::mkl::lapack::ormqr_scratchpad_size<float>(q,oneapi::mkl::side::left,oneapi::mkl::transpose::trans,n,m,n,n,n); */
-
-/*   sycl::buffer<float> sbuffer(mn2); */
-/*   sycl::buffer<float> res_q_ptr(numel); */
-/*   sycl::buffer<float> tau_ptr(numel); */
-/*   sycl::buffer<float> c_ptr(numel); */
-  
-
-/*   std::cout << " dziala " << n << " " << m << " " << mn << " " << mn2 << " " << mn3 <<  std::endl; */
-/*   oneapi::mkl::lapack::geqrf(q,n,m,res_q_ptr,n,tau_ptr,sbuffer,mn2); */
-/*   sycl::buffer<float> sbuffer2(mn3); */
-/*   oneapi::mkl::lapack::ormqr(q,oneapi::mkl::side::right,oneapi::mkl::transpose::nontrans,n,m,n,res_q_ptr,n,tau_ptr,c_ptr,n,sbuffer2,mn3); */
-/*     std::cout << "done" << std::endl; */
-  
-/*     return 0; */
-/* } */

From ba893b09a0ee42d655e13b48faeda36c504136ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Fri, 31 Oct 2025 13:17:33 +0000
Subject: [PATCH 03/15] dev3

---
 src/ATen/native/xpu/QR.cpp            |  19 ++++
 src/ATen/native/xpu/Repro.cpp         | 133 --------------------------
 src/ATen/native/xpu/sycl/QRKernel.cpp | 132 +++++++++++++++++++++++++
 src/ATen/native/xpu/sycl/QRKernel.h   |  13 +++
 yaml/native/native_functions.yaml     |  11 ++-
 5 files changed, 172 insertions(+), 136 deletions(-)
 create mode 100644 src/ATen/native/xpu/QR.cpp
 delete mode 100644 src/ATen/native/xpu/Repro.cpp
 create mode 100644 src/ATen/native/xpu/sycl/QRKernel.cpp
 create mode 100644 src/ATen/native/xpu/sycl/QRKernel.h

diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp
new file mode 100644
index 0000000000..9a2adeebc3
--- /dev/null
+++ b/src/ATen/native/xpu/QR.cpp
@@ -0,0 +1,19 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/core/op_registration/adaption.h>
+#include <ATen/native/xpu/sycl/QRKernel.h>
+#include <comm/RegisterUtils.h>
+#include <comm/xpu_aten.h>
+
+#include <ATen/ops/linalg_qr_native.h>
+
+namespace at::native {
+
+TORCH_IMPL_FUNC(linalg_qr_xpu_out)
+(const Tensor& A,
+ c10::string_view mode,
+ const Tensor& Q,
+ const Tensor& R) {
+  xpu::linalg_qr_kernel(A, mode, Q, R);
+}
+
+} // namespace at::native
diff --git a/src/ATen/native/xpu/Repro.cpp b/src/ATen/native/xpu/Repro.cpp
deleted file mode 100644
index 5653b76295..0000000000
--- a/src/ATen/native/xpu/Repro.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-#include <ATen/Operators.h>
-#include <c10/xpu/XPUStream.h>
-#include <sycl/sycl.hpp>
-#include <torch/all.h>
-#include <torch/library.h>
-//#include <mkl_lapack.h>
-#include <oneapi/mkl/lapack.hpp>
-#include <cstdint>
-
-// namespace qr_issue {
-
-namespace at {
-namespace native {
-
-std::tuple<at::Tensor, at::Tensor> local_qr(
-    const at::Tensor& a,
-    std::optional<std::string_view> mode_) {
-  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
-  // std::endl;
-
-  std::string mode = std::string(mode_.value_or("reduced"));
-
-  TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
-  at::Tensor a_contig = a.contiguous();
-  at::Tensor result_r = at::clone(a_contig);
-  //  at::Tensor result_r = at::empty_like(a_contig);
-  at::Tensor result_c = at::empty_like(a_contig);
-  at::Tensor result = at::empty_like(a_contig);
-
-  auto dimensions = a.sizes();
-
-  std::cout << "dim " << dimensions << std::endl;
-
-  std::cout << result_r << std::endl;
-  result_r = result_r.transpose(-2, -1).contiguous();
-  std::cout << result_r << std::endl;
-  int numel = a_contig.numel();
-  int range = a_contig.dim();
-  int64_t n = a_contig.sizes().at(range - 2);
-  int64_t m = a_contig.sizes().at(range - 1);
-  int64_t mn = int64_t(m * n);
-  int64_t b = numel / mn;
-
-  int out_q_columns = m > n ? n : m;
-  if (n > m && mode == "complete") {
-    out_q_columns = n;
-  }
-
-  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
-            << std::endl;
-  // at::Tensor result_q = result_r.clone();
-  // dimensions[1]=out_q_columns;
-  std::vector v(dimensions.begin(), dimensions.end());
-  v[range - 1] = v[range - 2];
-  v[range - 2] = out_q_columns;
-  auto ndimensions = at::IntArrayRef(v);
-  at::Tensor result_q = at::empty(ndimensions);
-
-  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
-  int64_t mn1 =
-      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
-  int64_t mn2 =
-      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
-  mn2 = mn1 > mn2 ? mn1 : mn2;
-  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
-  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
-  float* r_buf = result_r.data_ptr<float>();
-  float* q_buf = result_q.data_ptr<float>();
-
-  // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
-  // "complete") << std::endl;
-
-  for (int batch_item = 0; batch_item < b; batch_item++) {
-    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
-
-    if (mode != "r") {
-      // copy relevant part of R matrix to Q matrix
-      int copy_columns = out_q_columns > m ? m : out_q_columns;
-      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
-
-      oneapi::mkl::lapack::orgqr(
-          queue,
-          n,
-          out_q_columns,
-          out_q_columns,
-          q_buf,
-          n,
-          tau_buf,
-          sbuffer,
-          mn2);
-      std::cout << "done" << std::endl;
-
-      sycl::free(sbuffer, queue);
-      std::cout << "done2" << std::endl;
-    }
-
-    r_buf += mn;
-    q_buf += n * out_q_columns;
-
-  } // batch
-
-  if (mode == "r") {
-    result_q = at::empty({0, 0});
-  }
-
-  if ((mode == "reduced" || mode == "r") && n > m) {
-    result_r =
-        result_r
-            .index(
-                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
-            .contiguous();
-  }
-
-  // result_q.transpose(0,1);
-  return std::make_tuple(
-      result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
-  //}
-}
-// Defines the operators
-// TORCH_LIBRARY(qr_issue, m) {
-//   m.def("local_qr(Tensor a,str? mode_) -> (Tensor,Tensor)");
-// }
-
-// ==================================================
-// Register SYCL Implementations to Torch Library
-// ==================================================
-// TORCH_LIBRARY_IMPL(qr_issue, XPU, m) {
-//   m.impl("local_qr", &_qr_xpu);
-// }
-
-// namespace qr_issue
-} // namespace native
-} // namespace at
diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
new file mode 100644
index 0000000000..c578b35b3d
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/QRKernel.cpp
@@ -0,0 +1,132 @@
+#include <ATen/Operators.h>
+#include <c10/xpu/XPUStream.h>
+// #include <sycl/sycl.hpp>
+#include <torch/all.h>
+#include <torch/library.h>
+//#include <mkl_lapack.h>
+#include <oneapi/mkl/lapack.hpp>
+#include <cstdint>
+
+namespace at {
+namespace native {
+namespace xpu {
+
+void linalg_qr_kernel(
+    const at::Tensor& A,
+    c10::string_view mode,
+    const at::Tensor& Q,
+    const at::Tensor& R) {
+    std::cout << "Hello from kernel";
+  }
+
+} // namespace xpu
+} // namespace native
+} // namespace at
+
+// std::tuple<at::Tensor, at::Tensor> linalg_qr_kernel_draft(
+//     const at::Tensor& a,
+//     std::optional<std::string_view> mode_) {
+//       return std::make_tuple(a, a);
+//   }
+
+  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
+  // std::endl;
+
+//   std::string mode = std::string(mode_.value_or("reduced"));
+
+//   TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
+//   at::Tensor a_contig = a.contiguous();
+//   at::Tensor result_r = at::clone(a_contig);
+//   //  at::Tensor result_r = at::empty_like(a_contig);
+//   at::Tensor result_c = at::empty_like(a_contig);
+//   at::Tensor result = at::empty_like(a_contig);
+
+//   auto dimensions = a.sizes();
+
+//   std::cout << "dim " << dimensions << std::endl;
+
+//   std::cout << result_r << std::endl;
+//   result_r = result_r.transpose(-2, -1).contiguous();
+//   std::cout << result_r << std::endl;
+//   int numel = a_contig.numel();
+//   int range = a_contig.dim();
+//   int64_t n = a_contig.sizes().at(range - 2);
+//   int64_t m = a_contig.sizes().at(range - 1);
+//   int64_t mn = int64_t(m * n);
+//   int64_t b = numel / mn;
+
+//   int out_q_columns = m > n ? n : m;
+//   if (n > m && mode == "complete") {
+//     out_q_columns = n;
+//   }
+
+//   std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
+//             << std::endl;
+//   // at::Tensor result_q = result_r.clone();
+//   // dimensions[1]=out_q_columns;
+//   std::vector v(dimensions.begin(), dimensions.end());
+//   v[range - 1] = v[range - 2];
+//   v[range - 2] = out_q_columns;
+//   auto ndimensions = at::IntArrayRef(v);
+//   at::Tensor result_q = at::empty(ndimensions);
+
+//   sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+//   int64_t mn1 =
+//       oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
+//   int64_t mn2 =
+//       oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
+//   mn2 = mn1 > mn2 ? mn1 : mn2;
+//   float* sbuffer = sycl::malloc_device<float>(mn2, queue);
+//   float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
+//   float* r_buf = result_r.data_ptr<float>();
+//   float* q_buf = result_q.data_ptr<float>();
+
+//   // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
+//   // "complete") << std::endl;
+
+//   for (int batch_item = 0; batch_item < b; batch_item++) {
+//     oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
+
+//     if (mode != "r") {
+//       // copy relevant part of R matrix to Q matrix
+//       int copy_columns = out_q_columns > m ? m : out_q_columns;
+//       queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+
+//       oneapi::mkl::lapack::orgqr(
+//           queue,
+//           n,
+//           out_q_columns,
+//           out_q_columns,
+//           q_buf,
+//           n,
+//           tau_buf,
+//           sbuffer,
+//           mn2);
+//       std::cout << "done" << std::endl;
+
+//       sycl::free(sbuffer, queue);
+//       std::cout << "done2" << std::endl;
+//     }
+
+//     r_buf += mn;
+//     q_buf += n * out_q_columns;
+
+//   } // batch
+
+//   if (mode == "r") {
+//     result_q = at::empty({0, 0});
+//   }
+
+//   if ((mode == "reduced" || mode == "r") && n > m) {
+//     result_r =
+//         result_r
+//             .index(
+//                 {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
+//             .contiguous();
+//   }
+
+//   // result_q.transpose(0,1);
+//   return std::make_tuple(
+//       result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
+//   //}
+// }
diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h
new file mode 100644
index 0000000000..9e6a81435a
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/QRKernel.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <comm/xpu_aten.h>
+
+namespace at::native::xpu {
+
+TORCH_XPU_API void linalg_qr_kernel(
+    const Tensor& A,
+    c10::string_view mode,
+    const Tensor& Q,
+    const Tensor& R);
+
+} // namespace at::native::xpu
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index f11752f456..7221ebdbdc 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -9443,11 +9443,16 @@
 - func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor
   python_module: linalg
 
-- func: local_qr(Tensor a, str? mode_=None) -> (Tensor Q, Tensor R) 
+- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)
   python_module: linalg
-  structured: False
+  variants: function
+  structured_delegate: linalg_qr.out
+
+- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
+  python_module: linalg
+  structured: True
   dispatch:
-    XPU: local_qr_xpu
+    XPU: linalg_qr_xpu_out
 
 - func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info)
   python_module: linalg

From e0f5f6be868cf5092fdaf2e827635c115a34d456 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Fri, 31 Oct 2025 16:58:31 +0000
Subject: [PATCH 04/15] First callable poc

---
 src/ATen/native/xpu/QR.cpp            |   2 +-
 src/ATen/native/xpu/sycl/QRKernel.cpp | 216 ++++++++++++--------------
 src/ATen/native/xpu/sycl/QRKernel.h   |   2 +-
 3 files changed, 103 insertions(+), 117 deletions(-)

diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp
index 9a2adeebc3..53b42c2be9 100644
--- a/src/ATen/native/xpu/QR.cpp
+++ b/src/ATen/native/xpu/QR.cpp
@@ -10,7 +10,7 @@ namespace at::native {
 
 TORCH_IMPL_FUNC(linalg_qr_xpu_out)
 (const Tensor& A,
- c10::string_view mode,
+ std::string_view mode,
  const Tensor& Q,
  const Tensor& R) {
   xpu::linalg_qr_kernel(A, mode, Q, R);
diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
index c578b35b3d..ce26fe862f 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.cpp
+++ b/src/ATen/native/xpu/sycl/QRKernel.cpp
@@ -7,126 +7,112 @@
 #include <oneapi/mkl/lapack.hpp>
 #include <cstdint>
 
-namespace at {
-namespace native {
-namespace xpu {
+namespace at::native::xpu {
 
 void linalg_qr_kernel(
     const at::Tensor& A,
-    c10::string_view mode,
+    std::string_view mode_,
     const at::Tensor& Q,
     const at::Tensor& R) {
-    std::cout << "Hello from kernel";
+  std::cout << "Hello from kernel";
+  std::cout << "Call mode is " << " " << mode_ << std::endl;
+  std::string mode = std::string(mode_);
+
+  TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor");
+  at::Tensor a_contig = A.contiguous();
+  at::Tensor result_r = at::clone(a_contig);
+  //  at::Tensor result_r = at::empty_like(a_contig);
+  at::Tensor result_c = at::empty_like(a_contig);
+  at::Tensor result = at::empty_like(a_contig);
+
+  auto dimensions = A.sizes();
+
+  std::cout << "dim " << dimensions << std::endl;
+
+  std::cout << result_r << std::endl;
+  result_r = result_r.transpose(-2, -1).contiguous();
+  std::cout << result_r << std::endl;
+  int numel = a_contig.numel();
+  int range = a_contig.dim();
+  int64_t n = a_contig.sizes().at(range - 2);
+  int64_t m = a_contig.sizes().at(range - 1);
+  int64_t mn = int64_t(m * n);
+  int64_t b = numel / mn;
+
+  int out_q_columns = m > n ? n : m;
+  if (n > m && mode == "complete") {
+    out_q_columns = n;
   }
 
-} // namespace xpu
-} // namespace native
-} // namespace at
-
-// std::tuple<at::Tensor, at::Tensor> linalg_qr_kernel_draft(
-//     const at::Tensor& a,
-//     std::optional<std::string_view> mode_) {
-//       return std::make_tuple(a, a);
-//   }
-
-  // std::cout << "Call mode is " << " " << mode_.value_or("compressed") <<
-  // std::endl;
-
-//   std::string mode = std::string(mode_.value_or("reduced"));
-
-//   TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
-//   at::Tensor a_contig = a.contiguous();
-//   at::Tensor result_r = at::clone(a_contig);
-//   //  at::Tensor result_r = at::empty_like(a_contig);
-//   at::Tensor result_c = at::empty_like(a_contig);
-//   at::Tensor result = at::empty_like(a_contig);
-
-//   auto dimensions = a.sizes();
-
-//   std::cout << "dim " << dimensions << std::endl;
-
-//   std::cout << result_r << std::endl;
-//   result_r = result_r.transpose(-2, -1).contiguous();
-//   std::cout << result_r << std::endl;
-//   int numel = a_contig.numel();
-//   int range = a_contig.dim();
-//   int64_t n = a_contig.sizes().at(range - 2);
-//   int64_t m = a_contig.sizes().at(range - 1);
-//   int64_t mn = int64_t(m * n);
-//   int64_t b = numel / mn;
-
-//   int out_q_columns = m > n ? n : m;
-//   if (n > m && mode == "complete") {
-//     out_q_columns = n;
-//   }
-
-//   std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
-//             << std::endl;
-//   // at::Tensor result_q = result_r.clone();
-//   // dimensions[1]=out_q_columns;
-//   std::vector v(dimensions.begin(), dimensions.end());
-//   v[range - 1] = v[range - 2];
-//   v[range - 2] = out_q_columns;
-//   auto ndimensions = at::IntArrayRef(v);
-//   at::Tensor result_q = at::empty(ndimensions);
-
-//   sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
-//   int64_t mn1 =
-//       oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
-//   int64_t mn2 =
-//       oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
-//   mn2 = mn1 > mn2 ? mn1 : mn2;
-//   float* sbuffer = sycl::malloc_device<float>(mn2, queue);
-//   float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
-//   float* r_buf = result_r.data_ptr<float>();
-//   float* q_buf = result_q.data_ptr<float>();
-
-//   // std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
-//   // "complete") << std::endl;
-
-//   for (int batch_item = 0; batch_item < b; batch_item++) {
-//     oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
-
-//     if (mode != "r") {
-//       // copy relevant part of R matrix to Q matrix
-//       int copy_columns = out_q_columns > m ? m : out_q_columns;
-//       queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
-
-//       oneapi::mkl::lapack::orgqr(
-//           queue,
-//           n,
-//           out_q_columns,
-//           out_q_columns,
-//           q_buf,
-//           n,
-//           tau_buf,
-//           sbuffer,
-//           mn2);
-//       std::cout << "done" << std::endl;
-
-//       sycl::free(sbuffer, queue);
-//       std::cout << "done2" << std::endl;
-//     }
-
-//     r_buf += mn;
-//     q_buf += n * out_q_columns;
-
-//   } // batch
-
-//   if (mode == "r") {
-//     result_q = at::empty({0, 0});
-//   }
-
-//   if ((mode == "reduced" || mode == "r") && n > m) {
-//     result_r =
-//         result_r
-//             .index(
-//                 {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
-//             .contiguous();
-//   }
-
-//   // result_q.transpose(0,1);
-//   return std::make_tuple(
-//       result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
-//   //}
+  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
+            << std::endl;
+  // at::Tensor result_q = result_r.clone();
+  // dimensions[1]=out_q_columns;
+  std::vector v(dimensions.begin(), dimensions.end());
+  v[range - 1] = v[range - 2];
+  v[range - 2] = out_q_columns;
+  auto ndimensions = at::IntArrayRef(v);
+  at::Tensor result_q = at::empty(ndimensions);
+
+  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+  int64_t mn1 =
+      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
+  int64_t mn2 =
+      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
+  mn2 = mn1 > mn2 ? mn1 : mn2;
+  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
+  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
+  float* r_buf = result_r.data_ptr<float>();
+  float* q_buf = result_q.data_ptr<float>();
+
+  std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
+  "complete") << std::endl;
+
+  for (int batch_item = 0; batch_item < b; batch_item++) {
+      oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
+  
+      if (mode != "r") {
+          // copy relevant part of R matrix to Q matrix
+          int copy_columns = out_q_columns > m ? m : out_q_columns;
+          queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+    
+          oneapi::mkl::lapack::orgqr(
+                queue,
+                n,
+                out_q_columns,
+                out_q_columns,
+                q_buf,
+                n,
+                tau_buf,
+                sbuffer,
+                mn2);
+            std::cout << "done" << std::endl;
+      
+            sycl::free(sbuffer, queue);
+            std::cout << "done2" << std::endl;
+          }
+      
+          r_buf += mn;
+          q_buf += n * out_q_columns;
+      
+        } // batch
+      
+        if (mode == "r") {
+            result_q = at::empty({0, 0});
+          }
+        
+          if ((mode == "reduced" || mode == "r") && n > m) {
+              result_r =
+                  result_r
+                      .index(
+                            {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
+                        .contiguous();
+              }
+            
+              // result_q.transpose(0,1);
+              // return std::make_tuple(
+                    // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
+}
+ 
+} // namespace at::native::xpu
 // }
diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h
index 9e6a81435a..6927439a72 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.h
+++ b/src/ATen/native/xpu/sycl/QRKernel.h
@@ -6,7 +6,7 @@ namespace at::native::xpu {
 
 TORCH_XPU_API void linalg_qr_kernel(
     const Tensor& A,
-    c10::string_view mode,
+    std::string_view mode,
     const Tensor& Q,
     const Tensor& R);
 

From 025291ab8ed2454490020f2d810b45f3dc38d98b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Mon, 3 Nov 2025 07:50:45 +0000
Subject: [PATCH 05/15] simple test

---
 test/xpu/test_linalg_qr.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 test/xpu/test_linalg_qr.py

diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py
new file mode 100644
index 0000000000..6422d08f54
--- /dev/null
+++ b/test/xpu/test_linalg_qr.py
@@ -0,0 +1,24 @@
+import torch
+import pytest
+
+
+@pytest.mark.parametrize("mode", ['reduced', 'complete', 'r'])
+def test_linalg_qr(mode):
+    A = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]])
+    A_xpu = A.to('xpu')
+    
+    Q, R = torch.linalg.qr(A)
+    Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode)
+
+
+    print("==== CPU ====")
+    print("A",A)
+    print("Q",Q)
+    print("R",R)
+    print("\n==== XPU ====")
+    print("A_xpu",A_xpu)
+    print("Q_xpu",Q_xpu)
+    print("R_xpu",R_xpu)
+
+    assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5)
+    assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5)

From 6e025e194a59d0ab1667d398851f536aed0c123d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Mon, 3 Nov 2025 09:33:11 +0000
Subject: [PATCH 06/15] cleanup

---
 src/ATen/native/xpu/QR.cpp               | 11 +++--------
 src/ATen/native/xpu/XPUFallback.template |  1 -
 src/ATen/native/xpu/sycl/QRKernel.cpp    |  9 ++-------
 src/ATen/native/xpu/sycl/QRKernel.h      |  6 +++---
 test/xpu/test_linalg_qr.py               |  2 +-
 5 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp
index 53b42c2be9..37d8896b20 100644
--- a/src/ATen/native/xpu/QR.cpp
+++ b/src/ATen/native/xpu/QR.cpp
@@ -1,18 +1,13 @@
-#include <ATen/core/Tensor.h>
-#include <ATen/core/op_registration/adaption.h>
 #include <ATen/native/xpu/sycl/QRKernel.h>
-#include <comm/RegisterUtils.h>
-#include <comm/xpu_aten.h>
-
 #include <ATen/ops/linalg_qr_native.h>
 
 namespace at::native {
 
 TORCH_IMPL_FUNC(linalg_qr_xpu_out)
-(const Tensor& A,
+(const at::Tensor& A,
  std::string_view mode,
- const Tensor& Q,
- const Tensor& R) {
+ const at::Tensor& Q,
+ const at::Tensor& R) {
   xpu::linalg_qr_kernel(A, mode, Q, R);
 }
 
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index 053c806292..41e511c5e9 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -206,7 +206,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
     "linalg_lstsq.out",
     "linalg_lu.out",
     "linalg_matrix_exp",
-    "linalg_qr.out",
     "linalg_solve_triangular",
     "linalg_solve_triangular.out",
     "_linalg_svd.U",
diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
index ce26fe862f..05ace1ceb9 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.cpp
+++ b/src/ATen/native/xpu/sycl/QRKernel.cpp
@@ -1,22 +1,17 @@
 #include <ATen/Operators.h>
 #include <c10/xpu/XPUStream.h>
-// #include <sycl/sycl.hpp>
 #include <torch/all.h>
 #include <torch/library.h>
-//#include <mkl_lapack.h>
 #include <oneapi/mkl/lapack.hpp>
-#include <cstdint>
 
 namespace at::native::xpu {
 
 void linalg_qr_kernel(
     const at::Tensor& A,
-    std::string_view mode_,
+    std::string_view mode,
     const at::Tensor& Q,
     const at::Tensor& R) {
-  std::cout << "Hello from kernel";
-  std::cout << "Call mode is " << " " << mode_ << std::endl;
-  std::string mode = std::string(mode_);
+  std::cout << "Call mode is " << " " << mode << std::endl;
 
   TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor");
   at::Tensor a_contig = A.contiguous();
diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h
index 6927439a72..93696feed7 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.h
+++ b/src/ATen/native/xpu/sycl/QRKernel.h
@@ -5,9 +5,9 @@
 namespace at::native::xpu {
 
 TORCH_XPU_API void linalg_qr_kernel(
-    const Tensor& A,
+    const at::Tensor& A,
     std::string_view mode,
-    const Tensor& Q,
-    const Tensor& R);
+    const at::Tensor& Q,
+    const at::Tensor& R);
 
 } // namespace at::native::xpu
diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py
index 6422d08f54..6a23db8172 100644
--- a/test/xpu/test_linalg_qr.py
+++ b/test/xpu/test_linalg_qr.py
@@ -7,7 +7,7 @@ def test_linalg_qr(mode):
     A = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]])
     A_xpu = A.to('xpu')
     
-    Q, R = torch.linalg.qr(A)
+    Q, R = torch.linalg.qr(A, mode=mode)
     Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode)
 
 

From 5e593d8cdafd6e1c3165ba9703a3fb9669bfc405 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Mon, 3 Nov 2025 10:07:26 +0000
Subject: [PATCH 07/15] format

---
 src/ATen/native/xpu/sycl/QRKernel.cpp | 94 +++++++++++++--------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
index 05ace1ceb9..cd94bc5f5f 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.cpp
+++ b/src/ATen/native/xpu/sycl/QRKernel.cpp
@@ -1,8 +1,8 @@
 #include <ATen/Operators.h>
 #include <c10/xpu/XPUStream.h>
+#include <oneapi/mkl/lapack.hpp>
 #include <torch/all.h>
 #include <torch/library.h>
-#include <oneapi/mkl/lapack.hpp>
 
 namespace at::native::xpu {
 
@@ -60,54 +60,54 @@ void linalg_qr_kernel(
   float* r_buf = result_r.data_ptr<float>();
   float* q_buf = result_q.data_ptr<float>();
 
-  std::cout << "entering " << n << " " << m << " " << mode << " " << (mode ==
-  "complete") << std::endl;
+  std::cout << "entering " << n << " " << m << " " << mode << " "
+            << (mode == "complete") << std::endl;
 
   for (int batch_item = 0; batch_item < b; batch_item++) {
-      oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
-  
-      if (mode != "r") {
-          // copy relevant part of R matrix to Q matrix
-          int copy_columns = out_q_columns > m ? m : out_q_columns;
-          queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
-    
-          oneapi::mkl::lapack::orgqr(
-                queue,
-                n,
-                out_q_columns,
-                out_q_columns,
-                q_buf,
-                n,
-                tau_buf,
-                sbuffer,
-                mn2);
-            std::cout << "done" << std::endl;
-      
-            sycl::free(sbuffer, queue);
-            std::cout << "done2" << std::endl;
-          }
-      
-          r_buf += mn;
-          q_buf += n * out_q_columns;
-      
-        } // batch
-      
-        if (mode == "r") {
-            result_q = at::empty({0, 0});
-          }
-        
-          if ((mode == "reduced" || mode == "r") && n > m) {
-              result_r =
-                  result_r
-                      .index(
-                            {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
-                        .contiguous();
-              }
-            
-              // result_q.transpose(0,1);
-              // return std::make_tuple(
-                    // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
+    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
+
+    if (mode != "r") {
+      // copy relevant part of R matrix to Q matrix
+      int copy_columns = out_q_columns > m ? m : out_q_columns;
+      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+
+      oneapi::mkl::lapack::orgqr(
+          queue,
+          n,
+          out_q_columns,
+          out_q_columns,
+          q_buf,
+          n,
+          tau_buf,
+          sbuffer,
+          mn2);
+      std::cout << "done" << std::endl;
+
+      sycl::free(sbuffer, queue);
+      std::cout << "done2" << std::endl;
+    }
+
+    r_buf += mn;
+    q_buf += n * out_q_columns;
+
+  } // batch
+
+  if (mode == "r") {
+    result_q = at::empty({0, 0});
+  }
+
+  if ((mode == "reduced" || mode == "r") && n > m) {
+    result_r =
+        result_r
+            .index(
+                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
+            .contiguous();
+  }
+
+  // result_q.transpose(0,1);
+  // return std::make_tuple(
+  // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
 }
- 
+
 } // namespace at::native::xpu
 // }

From 2038538f9b65f745bfb6b3e19125e0c1d5896f67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Mon, 3 Nov 2025 10:55:56 +0000
Subject: [PATCH 08/15] remove vars

---
 src/ATen/native/xpu/sycl/QRKernel.cpp | 10 ++-------
 test/xpu/test_linalg_qr.py            | 30 ++++++++++++++-------------
 2 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
index cd94bc5f5f..7dde1634bf 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.cpp
+++ b/src/ATen/native/xpu/sycl/QRKernel.cpp
@@ -12,13 +12,8 @@ void linalg_qr_kernel(
     const at::Tensor& Q,
     const at::Tensor& R) {
   std::cout << "Call mode is " << " " << mode << std::endl;
-
-  TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor");
   at::Tensor a_contig = A.contiguous();
   at::Tensor result_r = at::clone(a_contig);
-  //  at::Tensor result_r = at::empty_like(a_contig);
-  at::Tensor result_c = at::empty_like(a_contig);
-  at::Tensor result = at::empty_like(a_contig);
 
   auto dimensions = A.sizes();
 
@@ -105,9 +100,8 @@ void linalg_qr_kernel(
   }
 
   // result_q.transpose(0,1);
-  // return std::make_tuple(
-  // result_q.transpose(-2, -1), result_r.transpose(-2, -1).triu_());
+  Q.set_(result_q.transpose(-2, -1).to("xpu"));
+  R.set_(result_r.transpose(-2, -1).triu_().to("xpu"));
 }
 
 } // namespace at::native::xpu
-// }
diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py
index 6a23db8172..83e94e9853 100644
--- a/test/xpu/test_linalg_qr.py
+++ b/test/xpu/test_linalg_qr.py
@@ -1,24 +1,26 @@
 import torch
 import pytest
 
-
+@pytest.mark.parametrize("dtype", [torch.float32])
 @pytest.mark.parametrize("mode", ['reduced', 'complete', 'r'])
-def test_linalg_qr(mode):
-    A = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]])
+@pytest.mark.parametrize("shape", [(5, 3), (2, 3, 5), (2, 3, 4, 4), (2, 1, 1)])
+# @pytest.mark.parametrize("shape", [
+#     # 2D matrices
+#     (5, 3), (3, 5), (8, 6), (6, 8),
+#     # 3D batched matrices
+#     (2, 3, 3), (3, 4, 4), (2, 5, 3), (2, 3, 5),
+#     # 4D batched matrices
+#     (2, 3, 4, 4), (1, 2, 5, 3), (2, 1, 3, 5),
+#     # Edge cases
+#     (1, 1), (10, 1), (1, 10), (2, 1, 1), (1, 1, 1)
+# ]) 
+def test_linalg_qr(dtype, mode, shape):
+    A = torch.randn(shape, dtype=dtype)
     A_xpu = A.to('xpu')
-    
+
     Q, R = torch.linalg.qr(A, mode=mode)
     Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode)
 
-
-    print("==== CPU ====")
-    print("A",A)
-    print("Q",Q)
-    print("R",R)
-    print("\n==== XPU ====")
-    print("A_xpu",A_xpu)
-    print("Q_xpu",Q_xpu)
-    print("R_xpu",R_xpu)
-
     assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5)
     assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5)
+    assert Q_xpu.device.type == 'xpu' and R_xpu.device.type == 'xpu'

From 3c26a9e1e6cc24be4f85cef0f0c8b9388d539342 Mon Sep 17 00:00:00 2001
From: Michal Wiktor <michal.wiktor@intel.com>
Date: Mon, 24 Nov 2025 12:06:55 +0000
Subject: [PATCH 09/15] New implementation of QR kernel based on mkl::geqrf and
 orgqr

---
 src/ATen/native/xpu/sycl/QRKernel.cpp | 76 +++++++++++++++------------
 1 file changed, 43 insertions(+), 33 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
index 7dde1634bf..461abc699f 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.cpp
+++ b/src/ATen/native/xpu/sycl/QRKernel.cpp
@@ -11,17 +11,17 @@ void linalg_qr_kernel(
     std::string_view mode,
     const at::Tensor& Q,
     const at::Tensor& R) {
-  std::cout << "Call mode is " << " " << mode << std::endl;
+  TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor");
+  TORCH_CHECK(A.dtype() == at::kFloat, "a must be float");
+
   at::Tensor a_contig = A.contiguous();
   at::Tensor result_r = at::clone(a_contig);
 
+  auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU);
   auto dimensions = A.sizes();
 
-  std::cout << "dim " << dimensions << std::endl;
-
-  std::cout << result_r << std::endl;
   result_r = result_r.transpose(-2, -1).contiguous();
-  std::cout << result_r << std::endl;
+
   int numel = a_contig.numel();
   int range = a_contig.dim();
   int64_t n = a_contig.sizes().at(range - 2);
@@ -34,32 +34,41 @@ void linalg_qr_kernel(
     out_q_columns = n;
   }
 
-  std::cout << "dim2 " << n << " " << m << " " << b << " " << out_q_columns
-            << std::endl;
-  // at::Tensor result_q = result_r.clone();
-  // dimensions[1]=out_q_columns;
   std::vector v(dimensions.begin(), dimensions.end());
-  v[range - 1] = v[range - 2];
-  v[range - 2] = out_q_columns;
-  auto ndimensions = at::IntArrayRef(v);
-  at::Tensor result_q = at::empty(ndimensions);
+  if (mode != "r") {
+    v[range - 1] = v[range - 2];
+    v[range - 2] = out_q_columns;
+  } else {
+    v = std::vector<long>({0, 0});
+  }
+  auto q_dimensions = at::IntArrayRef(v);
+  std::cout << "Q SIZE " << q_dimensions << " " << dimensions << " "
+            << std::endl;
+  at::Tensor result_q = at::empty(q_dimensions, options);
+
+  std::cout << "Q SIZE " << q_dimensions << " " << dimensions << std::endl;
 
   sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
-  int64_t mn1 =
+
+  int64_t bufsize1 =
       oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
-  int64_t mn2 =
+  int64_t bufsize2 =
       oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
-  mn2 = mn1 > mn2 ? mn1 : mn2;
-  float* sbuffer = sycl::malloc_device<float>(mn2, queue);
-  float* tau_buf = sycl::malloc_device<float>(out_q_columns, queue);
+
+  int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1;
+  int64_t tau_len = m > n ? n : m;
+  float* sbuffer = sycl::malloc_device<float>(bufsize, queue);
+  float* tau_buf = sycl::malloc_device<float>(tau_len, queue);
   float* r_buf = result_r.data_ptr<float>();
-  float* q_buf = result_q.data_ptr<float>();
 
-  std::cout << "entering " << n << " " << m << " " << mode << " "
-            << (mode == "complete") << std::endl;
+  float* q_buf = NULL;
+  if (mode != "r") {
+    q_buf = result_q.data_ptr<float>();
+  }
 
   for (int batch_item = 0; batch_item < b; batch_item++) {
-    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, mn2);
+    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize)
+        .wait();
 
     if (mode != "r") {
       // copy relevant part of R matrix to Q matrix
@@ -70,26 +79,24 @@ void linalg_qr_kernel(
           queue,
           n,
           out_q_columns,
-          out_q_columns,
+          //          out_q_columns,
+          tau_len,
           q_buf,
           n,
           tau_buf,
           sbuffer,
-          mn2);
-      std::cout << "done" << std::endl;
+          bufsize)
+          .wait();
 
-      sycl::free(sbuffer, queue);
-      std::cout << "done2" << std::endl;
+      q_buf += n * out_q_columns;
     }
 
     r_buf += mn;
-    q_buf += n * out_q_columns;
 
   } // batch
 
-  if (mode == "r") {
-    result_q = at::empty({0, 0});
-  }
+  sycl::free(sbuffer, queue);
+  sycl::free(tau_buf, queue);
 
   if ((mode == "reduced" || mode == "r") && n > m) {
     result_r =
@@ -100,8 +107,11 @@ void linalg_qr_kernel(
   }
 
   // result_q.transpose(0,1);
-  Q.set_(result_q.transpose(-2, -1).to("xpu"));
-  R.set_(result_r.transpose(-2, -1).triu_().to("xpu"));
+  // Q.set_(result_q.transpose(-2, -1).to("xpu"));
+  // R.set_(result_r.transpose(-2, -1).triu_().to("xpu"));
+  Q.set_(result_q.transpose(-2, -1));
+  R.set_(result_r.transpose(-2, -1).triu_());
+  queue.wait();
 }
 
 } // namespace at::native::xpu

From 1531fa1e7df4ab54d26775c27a9ec87f49479cd1 Mon Sep 17 00:00:00 2001
From: Michal Wiktor <michal.wiktor@intel.com>
Date: Mon, 24 Nov 2025 12:12:02 +0000
Subject: [PATCH 10/15] cleaned version of QR

---
 src/ATen/native/xpu/sycl/QRKernel.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
index 461abc699f..761744a8a8 100644
--- a/src/ATen/native/xpu/sycl/QRKernel.cpp
+++ b/src/ATen/native/xpu/sycl/QRKernel.cpp
@@ -11,8 +11,9 @@ void linalg_qr_kernel(
     std::string_view mode,
     const at::Tensor& Q,
     const at::Tensor& R) {
-  TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor");
-  TORCH_CHECK(A.dtype() == at::kFloat, "a must be float");
+
+  //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor");
+  //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float");
 
   at::Tensor a_contig = A.contiguous();
   at::Tensor result_r = at::clone(a_contig);
@@ -42,11 +43,10 @@ void linalg_qr_kernel(
     v = std::vector<long>({0, 0});
   }
   auto q_dimensions = at::IntArrayRef(v);
-  std::cout << "Q SIZE " << q_dimensions << " " << dimensions << " "
-            << std::endl;
+
   at::Tensor result_q = at::empty(q_dimensions, options);
 
-  std::cout << "Q SIZE " << q_dimensions << " " << dimensions << std::endl;
+
 
   sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
 
@@ -79,7 +79,6 @@ void linalg_qr_kernel(
           queue,
           n,
           out_q_columns,
-          //          out_q_columns,
           tau_len,
           q_buf,
           n,
@@ -106,9 +105,6 @@ void linalg_qr_kernel(
             .contiguous();
   }
 
-  // result_q.transpose(0,1);
-  // Q.set_(result_q.transpose(-2, -1).to("xpu"));
-  // R.set_(result_r.transpose(-2, -1).triu_().to("xpu"));
   Q.set_(result_q.transpose(-2, -1));
   R.set_(result_r.transpose(-2, -1).triu_());
   queue.wait();

From 980732bdde4f83a5c2ef8ea10293e51c2c9b6814 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Mon, 24 Nov 2025 12:33:06 +0000
Subject: [PATCH 11/15] Enable more dev tests

---
 test/xpu/test_linalg_qr.py | 45 +++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py
index 83e94e9853..2aaffab7bd 100644
--- a/test/xpu/test_linalg_qr.py
+++ b/test/xpu/test_linalg_qr.py
@@ -1,26 +1,41 @@
-import torch
 import pytest
+import torch
+
 
 @pytest.mark.parametrize("dtype", [torch.float32])
-@pytest.mark.parametrize("mode", ['reduced', 'complete', 'r'])
-@pytest.mark.parametrize("shape", [(5, 3), (2, 3, 5), (2, 3, 4, 4), (2, 1, 1)])
-# @pytest.mark.parametrize("shape", [
-#     # 2D matrices
-#     (5, 3), (3, 5), (8, 6), (6, 8),
-#     # 3D batched matrices
-#     (2, 3, 3), (3, 4, 4), (2, 5, 3), (2, 3, 5),
-#     # 4D batched matrices
-#     (2, 3, 4, 4), (1, 2, 5, 3), (2, 1, 3, 5),
-#     # Edge cases
-#     (1, 1), (10, 1), (1, 10), (2, 1, 1), (1, 1, 1)
-# ]) 
+@pytest.mark.parametrize("mode", ["reduced", "complete", "r"])
+@pytest.mark.parametrize(
+    "shape",
+    [
+        # 2D matrices
+        (5, 3),
+        (3, 5),
+        (8, 6),
+        (6, 8),
+        # 3D batched matrices
+        (2, 3, 3),
+        (3, 4, 4),
+        (2, 5, 3),
+        (2, 3, 5),
+        # 4D batched matrices
+        (2, 3, 4, 4),
+        (1, 2, 5, 3),
+        (2, 1, 3, 5),
+        # Edge cases
+        (1, 1),
+        (10, 1),
+        (1, 10),
+        (2, 1, 1),
+        (1, 1, 1),
+    ],
+)
 def test_linalg_qr(dtype, mode, shape):
     A = torch.randn(shape, dtype=dtype)
-    A_xpu = A.to('xpu')
+    A_xpu = A.to("xpu")
 
     Q, R = torch.linalg.qr(A, mode=mode)
     Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode)
 
     assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5)
     assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5)
-    assert Q_xpu.device.type == 'xpu' and R_xpu.device.type == 'xpu'
+    assert Q_xpu.device.type == "xpu" and R_xpu.device.type == "xpu"

From e8b49e7a721412f277dbee0a88a53f8554c1b2ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Wed, 26 Nov 2025 11:21:14 +0000
Subject: [PATCH 12/15] Move QR to mkl

---
 src/ATen/native/xpu/BatchLinearAlgebra.cpp    |  18 +++
 src/ATen/native/xpu/QR.cpp                    |  14 ---
 .../native/xpu/mkl/BatchLinearAlgebra.cpp     | 104 ++++++++++++++++
 src/ATen/native/xpu/mkl/BatchLinearAlgebra.h  |   6 +
 src/ATen/native/xpu/sycl/QRKernel.cpp         | 113 ------------------
 src/ATen/native/xpu/sycl/QRKernel.h           |  13 --
 6 files changed, 128 insertions(+), 140 deletions(-)
 delete mode 100644 src/ATen/native/xpu/QR.cpp
 delete mode 100644 src/ATen/native/xpu/sycl/QRKernel.cpp
 delete mode 100644 src/ATen/native/xpu/sycl/QRKernel.h

diff --git a/src/ATen/native/xpu/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/BatchLinearAlgebra.cpp
index 8036419c07..aa38a8c39c 100644
--- a/src/ATen/native/xpu/BatchLinearAlgebra.cpp
+++ b/src/ATen/native/xpu/BatchLinearAlgebra.cpp
@@ -2,6 +2,7 @@
 #include <ATen/native/BatchLinearAlgebra.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/LinearAlgebraUtils.h>
+#include <ATen/ops/linalg_qr_native.h>
 #if defined(USE_ONEMKL_XPU)
 #include <ATen/native/xpu/mkl/BatchLinearAlgebra.h>
 #endif // USE_ONEMKL_XPU
@@ -64,4 +65,21 @@ void lu_factor_kernel_xpu(
 
 REGISTER_XPU_DISPATCH(lu_factor_stub, &lu_factor_kernel_xpu);
 
+TORCH_IMPL_FUNC(linalg_qr_xpu_out)(const Tensor& A,
+                               std::string_view mode,
+                               const Tensor & Q,
+                               const Tensor & R) {
+#if defined(USE_ONEMKL_XPU)
+  xpu::linalg_qr_kernel(A, mode, Q, R);
+#else
+  auto A_cpu = A.to(A.options().device(kCPU));
+  auto Q_cpu = Q.to(Q.options().device(kCPU));
+  auto R_cpu = R.to(R.options().device(kCPU));
+  at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode);  
+  Q.copy_(Q_cpu);
+  R.copy_(R_cpu);
+#endif // USE_ONEMKL_XPU
+}
+
+
 } // namespace at::native
diff --git a/src/ATen/native/xpu/QR.cpp b/src/ATen/native/xpu/QR.cpp
deleted file mode 100644
index 37d8896b20..0000000000
--- a/src/ATen/native/xpu/QR.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ATen/native/xpu/sycl/QRKernel.h>
-#include <ATen/ops/linalg_qr_native.h>
-
-namespace at::native {
-
-TORCH_IMPL_FUNC(linalg_qr_xpu_out)
-(const at::Tensor& A,
- std::string_view mode,
- const at::Tensor& Q,
- const at::Tensor& R) {
-  xpu::linalg_qr_kernel(A, mode, Q, R);
-}
-
-} // namespace at::native
diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp
index 26e80fa4d0..12425d6ad6 100644
--- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp
+++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp
@@ -561,4 +561,108 @@ void lu_factor_mkl(
   pivots.copy_(pivots_);
 }
 
+void linalg_qr_kernel(
+    const at::Tensor& A,
+    std::string_view mode,
+    const at::Tensor& Q,
+    const at::Tensor& R) {
+
+  //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor");
+  //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float");
+
+  at::Tensor a_contig = A.contiguous();
+  at::Tensor result_r = at::clone(a_contig);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU);
+  auto dimensions = A.sizes();
+
+  result_r = result_r.transpose(-2, -1).contiguous();
+
+  int numel = a_contig.numel();
+  int range = a_contig.dim();
+  int64_t n = a_contig.sizes().at(range - 2);
+  int64_t m = a_contig.sizes().at(range - 1);
+  int64_t mn = int64_t(m * n);
+  int64_t b = numel / mn;
+
+  int out_q_columns = m > n ? n : m;
+  if (n > m && mode == "complete") {
+    out_q_columns = n;
+  }
+
+  std::vector v(dimensions.begin(), dimensions.end());
+  if (mode != "r") {
+    v[range - 1] = v[range - 2];
+    v[range - 2] = out_q_columns;
+  } else {
+    v = std::vector<long>({0, 0});
+  }
+  auto q_dimensions = at::IntArrayRef(v);
+
+  at::Tensor result_q = at::empty(q_dimensions, options);
+
+
+
+  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+
+  int64_t bufsize1 =
+      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
+  int64_t bufsize2 =
+      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
+
+  int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1;
+  int64_t tau_len = m > n ? n : m;
+  float* sbuffer = sycl::malloc_device<float>(bufsize, queue);
+  float* tau_buf = sycl::malloc_device<float>(tau_len, queue);
+  float* r_buf = result_r.data_ptr<float>();
+
+  float* q_buf = NULL;
+  if (mode != "r") {
+    q_buf = result_q.data_ptr<float>();
+  }
+
+  for (int batch_item = 0; batch_item < b; batch_item++) {
+    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize)
+        .wait();
+
+    if (mode != "r") {
+      // copy relevant part of R matrix to Q matrix
+      int copy_columns = out_q_columns > m ? m : out_q_columns;
+      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+
+      oneapi::mkl::lapack::orgqr(
+          queue,
+          n,
+          out_q_columns,
+          tau_len,
+          q_buf,
+          n,
+          tau_buf,
+          sbuffer,
+          bufsize)
+          .wait();
+
+      q_buf += n * out_q_columns;
+    }
+
+    r_buf += mn;
+
+  } // batch
+
+  sycl::free(sbuffer, queue);
+  sycl::free(tau_buf, queue);
+
+  if ((mode == "reduced" || mode == "r") && n > m) {
+    result_r =
+        result_r
+            .index(
+                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
+            .contiguous();
+  }
+
+  Q.set_(result_q.transpose(-2, -1));
+  R.set_(result_r.transpose(-2, -1).triu_());
+  queue.wait();
+}
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h
index c1cc1da5c6..ef846c4d6b 100644
--- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h
+++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h
@@ -16,4 +16,10 @@ TORCH_XPU_API void lu_factor_mkl(
     const Tensor& info,
     bool pivot);
 
+TORCH_XPU_API void linalg_qr_kernel(
+    const at::Tensor& A,
+    std::string_view mode,
+    const at::Tensor& Q,
+    const at::Tensor& R);
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/QRKernel.cpp b/src/ATen/native/xpu/sycl/QRKernel.cpp
deleted file mode 100644
index 761744a8a8..0000000000
--- a/src/ATen/native/xpu/sycl/QRKernel.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-#include <ATen/Operators.h>
-#include <c10/xpu/XPUStream.h>
-#include <oneapi/mkl/lapack.hpp>
-#include <torch/all.h>
-#include <torch/library.h>
-
-namespace at::native::xpu {
-
-void linalg_qr_kernel(
-    const at::Tensor& A,
-    std::string_view mode,
-    const at::Tensor& Q,
-    const at::Tensor& R) {
-
-  //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor");
-  //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float");
-
-  at::Tensor a_contig = A.contiguous();
-  at::Tensor result_r = at::clone(a_contig);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU);
-  auto dimensions = A.sizes();
-
-  result_r = result_r.transpose(-2, -1).contiguous();
-
-  int numel = a_contig.numel();
-  int range = a_contig.dim();
-  int64_t n = a_contig.sizes().at(range - 2);
-  int64_t m = a_contig.sizes().at(range - 1);
-  int64_t mn = int64_t(m * n);
-  int64_t b = numel / mn;
-
-  int out_q_columns = m > n ? n : m;
-  if (n > m && mode == "complete") {
-    out_q_columns = n;
-  }
-
-  std::vector v(dimensions.begin(), dimensions.end());
-  if (mode != "r") {
-    v[range - 1] = v[range - 2];
-    v[range - 2] = out_q_columns;
-  } else {
-    v = std::vector<long>({0, 0});
-  }
-  auto q_dimensions = at::IntArrayRef(v);
-
-  at::Tensor result_q = at::empty(q_dimensions, options);
-
-
-
-  sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
-
-  int64_t bufsize1 =
-      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
-  int64_t bufsize2 =
-      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
-
-  int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1;
-  int64_t tau_len = m > n ? n : m;
-  float* sbuffer = sycl::malloc_device<float>(bufsize, queue);
-  float* tau_buf = sycl::malloc_device<float>(tau_len, queue);
-  float* r_buf = result_r.data_ptr<float>();
-
-  float* q_buf = NULL;
-  if (mode != "r") {
-    q_buf = result_q.data_ptr<float>();
-  }
-
-  for (int batch_item = 0; batch_item < b; batch_item++) {
-    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize)
-        .wait();
-
-    if (mode != "r") {
-      // copy relevant part of R matrix to Q matrix
-      int copy_columns = out_q_columns > m ? m : out_q_columns;
-      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
-
-      oneapi::mkl::lapack::orgqr(
-          queue,
-          n,
-          out_q_columns,
-          tau_len,
-          q_buf,
-          n,
-          tau_buf,
-          sbuffer,
-          bufsize)
-          .wait();
-
-      q_buf += n * out_q_columns;
-    }
-
-    r_buf += mn;
-
-  } // batch
-
-  sycl::free(sbuffer, queue);
-  sycl::free(tau_buf, queue);
-
-  if ((mode == "reduced" || mode == "r") && n > m) {
-    result_r =
-        result_r
-            .index(
-                {"...", at::indexing::Slice(0, n), at::indexing::Slice(0, m)})
-            .contiguous();
-  }
-
-  Q.set_(result_q.transpose(-2, -1));
-  R.set_(result_r.transpose(-2, -1).triu_());
-  queue.wait();
-}
-
-} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/QRKernel.h b/src/ATen/native/xpu/sycl/QRKernel.h
deleted file mode 100644
index 93696feed7..0000000000
--- a/src/ATen/native/xpu/sycl/QRKernel.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include <comm/xpu_aten.h>
-
-namespace at::native::xpu {
-
-TORCH_XPU_API void linalg_qr_kernel(
-    const at::Tensor& A,
-    std::string_view mode,
-    const at::Tensor& Q,
-    const at::Tensor& R);
-
-} // namespace at::native::xpu

From 419eb5731cdd1fb3df1b1b448c0cff5361d4c449 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Wed, 26 Nov 2025 12:10:02 +0000
Subject: [PATCH 13/15] Port ut to test_linalg_xpu

---
 src/ATen/native/xpu/BatchLinearAlgebra.cpp |  2 +-
 test/xpu/test_linalg_qr.py                 | 41 -----------------
 test/xpu/test_linalg_xpu.py                | 53 ++++++++++++++++++++++
 3 files changed, 54 insertions(+), 42 deletions(-)
 delete mode 100644 test/xpu/test_linalg_qr.py

diff --git a/src/ATen/native/xpu/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/BatchLinearAlgebra.cpp
index aa38a8c39c..ea56200f12 100644
--- a/src/ATen/native/xpu/BatchLinearAlgebra.cpp
+++ b/src/ATen/native/xpu/BatchLinearAlgebra.cpp
@@ -75,7 +75,7 @@ TORCH_IMPL_FUNC(linalg_qr_xpu_out)(const Tensor& A,
   auto A_cpu = A.to(A.options().device(kCPU));
   auto Q_cpu = Q.to(Q.options().device(kCPU));
   auto R_cpu = R.to(R.options().device(kCPU));
-  at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode);  
+  at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode);
   Q.copy_(Q_cpu);
   R.copy_(R_cpu);
 #endif // USE_ONEMKL_XPU
diff --git a/test/xpu/test_linalg_qr.py b/test/xpu/test_linalg_qr.py
deleted file mode 100644
index 2aaffab7bd..0000000000
--- a/test/xpu/test_linalg_qr.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import pytest
-import torch
-
-
-@pytest.mark.parametrize("dtype", [torch.float32])
-@pytest.mark.parametrize("mode", ["reduced", "complete", "r"])
-@pytest.mark.parametrize(
-    "shape",
-    [
-        # 2D matrices
-        (5, 3),
-        (3, 5),
-        (8, 6),
-        (6, 8),
-        # 3D batched matrices
-        (2, 3, 3),
-        (3, 4, 4),
-        (2, 5, 3),
-        (2, 3, 5),
-        # 4D batched matrices
-        (2, 3, 4, 4),
-        (1, 2, 5, 3),
-        (2, 1, 3, 5),
-        # Edge cases
-        (1, 1),
-        (10, 1),
-        (1, 10),
-        (2, 1, 1),
-        (1, 1, 1),
-    ],
-)
-def test_linalg_qr(dtype, mode, shape):
-    A = torch.randn(shape, dtype=dtype)
-    A_xpu = A.to("xpu")
-
-    Q, R = torch.linalg.qr(A, mode=mode)
-    Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode)
-
-    assert torch.allclose(Q, Q_xpu.cpu(), atol=1e-5, rtol=1e-5)
-    assert torch.allclose(R, R_xpu.cpu(), atol=1e-5, rtol=1e-5)
-    assert Q_xpu.device.type == "xpu" and R_xpu.device.type == "xpu"
diff --git a/test/xpu/test_linalg_xpu.py b/test/xpu/test_linalg_xpu.py
index c7b88ccc9e..aa1fb745e4 100644
--- a/test/xpu/test_linalg_xpu.py
+++ b/test/xpu/test_linalg_xpu.py
@@ -473,6 +473,57 @@ def __tunableop_ctx(self):
                 pass
 
 
+@parametrize("batch", [1, 3])
+@parametrize("m", [0, 1, 12])
+@parametrize("n", [0, 1, 17])
+@dtypes(torch.float32)
+def qr_mode_r(self, device, dtype, batch, m, n):
+    if batch > 1:
+        A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu")
+    else:
+        A_cpu = torch.randn(m, n, dtype=dtype, device="cpu")
+    A_xpu = A_cpu.to(device)
+
+    R_cpu = torch.linalg.qr(A_cpu, mode="r").R
+    R_xpu = torch.linalg.qr(A_xpu, mode="r").R
+    self.assertEqual(R_xpu, R_cpu, atol=1e-5, rtol=1e-5)
+
+    # Verify that R is upper triangular
+    lower_triangle = torch.tril(R_xpu, diagonal=-1)
+    self.assertEqual(lower_triangle.sum(), 0.0, atol=0.0, rtol=0.0)
+
+
+@parametrize("batch", [1, 3])
+@parametrize("m", [0, 1, 12])
+@parametrize("n", [0, 1, 17])
+@parametrize("mode", ["reduced", "complete"])
+@dtypes(torch.float32)
+def qr_modes_reduced_complete(self, device, dtype, batch, m, n, mode):
+    if batch > 1:
+        A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu")
+    else:
+        A_cpu = torch.randn(m, n, dtype=dtype, device="cpu")
+    A_xpu = A_cpu.to(device)
+
+    Q_cpu, R_cpu = torch.linalg.qr(A_cpu, mode=mode)
+    Q_xpu, R_xpu = torch.linalg.qr(A_xpu, mode=mode)
+
+    self.assertEqual(Q_xpu, Q_cpu, atol=1e-5, rtol=1e-5)
+    self.assertEqual(R_xpu, R_cpu, atol=1e-5, rtol=1e-5)
+
+    # Verify Q is orthogonal: Q^T @ Q should be identity
+    QTQ_xpu = torch.matmul(Q_xpu.mT, Q_xpu)
+    k = min(m, n) if mode == "reduced" else m
+    identity = torch.eye(k, dtype=dtype, device=device)
+    if batch > 1:
+        identity = identity.expand(batch, k, k)
+    self.assertEqual(QTQ_xpu, identity, atol=1e-5, rtol=1e-5)
+
+    # Verify that R is upper triangular
+    lower_triangle = torch.tril(R_xpu, diagonal=-1)
+    self.assertEqual(lower_triangle.sum(), 0.0, atol=0.0, rtol=0.0)
+
+
 with XPUPatchForImport(False):
     from test_linalg import TestLinalg
 
@@ -493,6 +544,8 @@ def __tunableop_ctx(self):
 TestLinalg.test_ck_blas_library = ck_blas_library
 TestLinalg.test_addmm_relu_tunableop_rocm = addmm_relu_tunableop_rocm
 TestLinalg._tunableop_ctx = __tunableop_ctx
+TestLinalg.test_qr_mode_r = qr_mode_r
+TestLinalg.test_qr_modes_reduced_complete = qr_modes_reduced_complete
 
 TestLinalg._default_dtype_check_enabled = True
 instantiate_device_type_tests(TestLinalg, globals(), only_for=("xpu"), allow_xpu=True)

From 7906ef245d8eee63026a1b800464b381f3dca623 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Wed, 26 Nov 2025 13:33:11 +0000
Subject: [PATCH 14/15] Fix the non-MKL fallback path

---
 src/ATen/native/xpu/BatchLinearAlgebra.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/ATen/native/xpu/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/BatchLinearAlgebra.cpp
index ea56200f12..e50a5cf4b4 100644
--- a/src/ATen/native/xpu/BatchLinearAlgebra.cpp
+++ b/src/ATen/native/xpu/BatchLinearAlgebra.cpp
@@ -3,6 +3,7 @@
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/ops/linalg_qr_native.h>
+#include <ATen/ops/linalg_qr_cpu_dispatch.h>
 #if defined(USE_ONEMKL_XPU)
 #include <ATen/native/xpu/mkl/BatchLinearAlgebra.h>
 #endif // USE_ONEMKL_XPU
@@ -72,10 +73,10 @@ TORCH_IMPL_FUNC(linalg_qr_xpu_out)(const Tensor& A,
 #if defined(USE_ONEMKL_XPU)
   xpu::linalg_qr_kernel(A, mode, Q, R);
 #else
-  auto A_cpu = A.to(A.options().device(kCPU));
-  auto Q_cpu = Q.to(Q.options().device(kCPU));
-  auto R_cpu = R.to(R.options().device(kCPU));
-  at::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode);
+  auto A_cpu = A.to(at::kCPU);
+  auto Q_cpu = at::empty_like(Q, at::kCPU);
+  auto R_cpu = at::empty_like(R, at::kCPU);
+  at::cpu::linalg_qr_out(Q_cpu, R_cpu, A_cpu, mode);
   Q.copy_(Q_cpu);
   R.copy_(R_cpu);
 #endif // USE_ONEMKL_XPU

From c995189fc9e528cbb1c7505b46724e98258ab3ff Mon Sep 17 00:00:00 2001
From: Michal Wiktor <michal.wiktor@intel.com>
Date: Mon, 1 Dec 2025 14:29:53 +0000
Subject: [PATCH 15/15] Production ready fused QR kernel

---
 .../native/xpu/mkl/BatchLinearAlgebra.cpp     | 57 +++++++++++++------
 test/xpu/test_linalg_xpu.py                   |  8 +--
 2 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp
index 12425d6ad6..3b75487429 100644
--- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp
+++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp
@@ -561,19 +561,19 @@ void lu_factor_mkl(
   pivots.copy_(pivots_);
 }
 
-void linalg_qr_kernel(
+
+template <typename scalar_t>
+void linalg_qr_kernel_impl(
     const at::Tensor& A,
     std::string_view mode,
     const at::Tensor& Q,
     const at::Tensor& R) {
 
-  //TORCH_CHECK(A.device().is_xpu(), "a must be an XPU tensor");
-  //TORCH_CHECK(A.dtype() == at::kFloat, "a must be float");
 
   at::Tensor a_contig = A.contiguous();
   at::Tensor result_r = at::clone(a_contig);
 
-  auto options = at::TensorOptions().dtype(at::kFloat).device(kXPU);
+  auto options = at::TensorOptions().dtype(A.dtype()).device(kXPU);
   auto dimensions = A.sizes();
 
   result_r = result_r.transpose(-2, -1).contiguous();
@@ -583,7 +583,15 @@ void linalg_qr_kernel(
   int64_t n = a_contig.sizes().at(range - 2);
   int64_t m = a_contig.sizes().at(range - 1);
   int64_t mn = int64_t(m * n);
-  int64_t b = numel / mn;
+  int64_t b = numel ==0 ? 0 : numel / mn;
+
+
+  if (b==0 && mode=="complete" && n>0) {
+    b=1;
+    for (int dimension=0; dimension<range-2; dimension++) {
+      b*=a_contig.sizes().at(dimension);
+    }
+  }
 
   int out_q_columns = m > n ? n : m;
   if (n > m && mode == "complete") {
@@ -595,7 +603,7 @@ void linalg_qr_kernel(
     v[range - 1] = v[range - 2];
     v[range - 2] = out_q_columns;
   } else {
-    v = std::vector<long>({0, 0});
+    v = std::vector<long>({0});
   }
   auto q_dimensions = at::IntArrayRef(v);
 
@@ -606,29 +614,32 @@ void linalg_qr_kernel(
   sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
 
   int64_t bufsize1 =
-      oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, n, m, n);
+      oneapi::mkl::lapack::geqrf_scratchpad_size<scalar_t>(queue, n+1, m+1, n+1);
   int64_t bufsize2 =
-      oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, n, m, m, n);
+      oneapi::mkl::lapack::orgqr_scratchpad_size<scalar_t>(queue, n+1, m+1, m+1, n+1);
 
   int64_t bufsize = bufsize2 > bufsize1 ? bufsize2 : bufsize1;
   int64_t tau_len = m > n ? n : m;
-  float* sbuffer = sycl::malloc_device<float>(bufsize, queue);
-  float* tau_buf = sycl::malloc_device<float>(tau_len, queue);
-  float* r_buf = result_r.data_ptr<float>();
+  scalar_t* sbuffer = sycl::malloc_device<scalar_t>(bufsize, queue);
+  scalar_t* tau_buf = sycl::malloc_device<scalar_t>(tau_len, queue);
+  scalar_t* r_buf = result_r.data_ptr<scalar_t>();
 
-  float* q_buf = NULL;
+  scalar_t* q_buf = nullptr;
   if (mode != "r") {
-    q_buf = result_q.data_ptr<float>();
+    q_buf = result_q.data_ptr<scalar_t>();
   }
 
   for (int batch_item = 0; batch_item < b; batch_item++) {
-    oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize)
+
+
+    if (mn!=0) // make QR if there is something to orthogonalize
+      oneapi::mkl::lapack::geqrf(queue, n, m, r_buf, n, tau_buf, sbuffer, bufsize)
         .wait();
 
     if (mode != "r") {
       // copy relevant part of R matrix to Q matrix
       int copy_columns = out_q_columns > m ? m : out_q_columns;
-      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(float)).wait();
+      queue.memcpy(q_buf, r_buf, n * copy_columns * sizeof(scalar_t)).wait();
 
       oneapi::mkl::lapack::orgqr(
           queue,
@@ -660,9 +671,23 @@ void linalg_qr_kernel(
             .contiguous();
   }
 
-  Q.set_(result_q.transpose(-2, -1));
+  // normal case, non-zero dimensions
+  if (mode!="r") {
+    result_q.transpose_(-2, -1);
+  }
+  Q.set_(result_q);
   R.set_(result_r.transpose(-2, -1).triu_());
   queue.wait();
 }
 
+void linalg_qr_kernel(
+    const at::Tensor& A,
+    std::string_view mode,
+    const at::Tensor& Q,
+    const at::Tensor& R) {
+  AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "linalg_qr_xpu", [&] {
+    linalg_qr_kernel_impl<scalar_t>(A, mode, Q, R);
+  });
+}
 } // namespace at::native::xpu
+  //
diff --git a/test/xpu/test_linalg_xpu.py b/test/xpu/test_linalg_xpu.py
index aa1fb745e4..edc70c25ce 100644
--- a/test/xpu/test_linalg_xpu.py
+++ b/test/xpu/test_linalg_xpu.py
@@ -474,9 +474,9 @@ def __tunableop_ctx(self):
 
 
 @parametrize("batch", [1, 3])
-@parametrize("m", [0, 1, 12])
-@parametrize("n", [0, 1, 17])
-@dtypes(torch.float32)
+@parametrize("m", [1, 12])
+@parametrize("n", [1, 17])
+@dtypes(torch.float32, torch.float64)
 def qr_mode_r(self, device, dtype, batch, m, n):
     if batch > 1:
         A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu")
@@ -497,7 +497,7 @@ def qr_mode_r(self, device, dtype, batch, m, n):
 @parametrize("m", [0, 1, 12])
 @parametrize("n", [0, 1, 17])
 @parametrize("mode", ["reduced", "complete"])
-@dtypes(torch.float32)
+@dtypes(torch.float32, torch.float64)
 def qr_modes_reduced_complete(self, device, dtype, batch, m, n, mode):
     if batch > 1:
         A_cpu = torch.randn(batch, m, n, dtype=dtype, device="cpu")