
Commit cf40b4e

feat: Slice view (#893)
1 parent 74f28d9 commit cf40b4e

68 files changed: +1051 −494 lines


crates/cubecl-attention/src/components/args.rs

Lines changed: 1 addition & 1 deletion
@@ -493,7 +493,7 @@ impl<EI: Numeric, EO: Numeric, GA: AttentionArgs> TensorOutput<EI, EO, GA> {

     /// Get the buffer length of the tensor.
     pub fn buffer_len(&self) -> u32 {
-        unsafe { GA::len_out(&(*self.state)) }
+        unsafe { GA::buffer_len_out(&(*self.state)) }
     }

     /// Get the line size of the tensor.
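Note: `buffer_len` now delegates to `GA::buffer_len_out` rather than `GA::len_out`, so it reports the allocated buffer length, which may exceed the logical length (for example, with padding). A minimal sketch of a bounds guard built on it; the helper is hypothetical, and only the `TensorOutput` bounds come from the diff above:

    // Hypothetical guard: skip writes past the allocated buffer.
    fn in_buffer<EI: Numeric, EO: Numeric, GA: AttentionArgs>(
        out: &TensorOutput<EI, EO, GA>,
        index: u32,
    ) -> bool {
        // buffer_len() is the allocation size, not the logical length.
        index < out.buffer_len()
    }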

crates/cubecl-attention/src/components/global/dummy/attention.rs

Lines changed: 6 additions & 5 deletions
@@ -112,7 +112,7 @@ impl<
     ) -> DummyQueryLoader<AP, Self::Config> {
         comment!("Global: Init Query Loader");
         let layout =
-            SimpleGlobalLayout::new(&query, config.global_memory_config(FlashIdent::Query));
+            SimpleGlobalLayout::new(&query, 0, config.global_memory_config(FlashIdent::Query));
         DummyQueryLoader::<AP, Self::Config>::new(q_offset, query.view(layout), config)
     }

@@ -121,7 +121,7 @@ impl<
         #[comptime] config: Self::Config,
     ) -> Self::KeyLoader {
         comment!("Global: Init Key Loader");
-        let layout = SimpleGlobalLayout::new(&key, config.global_memory_config(FlashIdent::Key));
+        let layout = SimpleGlobalLayout::new(&key, 0, config.global_memory_config(FlashIdent::Key));
         DummyKeyLoader::new(key.view(layout), config)
     }

@@ -131,7 +131,7 @@ impl<
     ) -> Self::ValueLoader {
         comment!("Global: Init Value Loader");
         let layout =
-            SimpleGlobalLayout::new(&value, config.global_memory_config(FlashIdent::Value));
+            SimpleGlobalLayout::new(&value, 0, config.global_memory_config(FlashIdent::Value));
         DummyValueLoader::new(value.view(layout), config)
     }

@@ -141,7 +141,8 @@ impl<
         #[comptime] config: Self::Config,
     ) -> Self::Writer {
         comment!("Global: Init Writer");
-        let layout = SimpleGlobalLayout::new(&out, config.global_memory_config(FlashIdent::Out));
-        SA::init_writer(q_offset, out.view_mut(layout))
+        let layout = SimpleGlobalLayout::new(&out, 0, config.global_memory_config(FlashIdent::Out));
+        let out = out.view_mut(layout);
+        SA::init_writer(out.slice_mut_unchecked((q_offset, 0), out.shape()))
     }
 }
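The writer hunk above is the core of the slice-view change: the layout is built at a fixed base (offset 0) and the per-cube offset is applied by slicing the typed 2D view, so `q_offset` no longer needs to be threaded through the writer. A condensed sketch using only calls shown in this commit (`slice` on the read side appears in load.rs below; that `slice_mut_unchecked` skips bounds checks because offsets are validated upstream is an assumption based on its name):

    // Read side: bounds-checked slice starting at the query-row offset.
    let query = query.slice((q_offset, 0), query.shape());

    // Write side: position the mutable view once, then hand it to the writer.
    let out = out.view_mut(layout);
    let writer = SA::init_writer(out.slice_mut_unchecked((q_offset, 0), out.shape()));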

crates/cubecl-attention/src/components/global/dummy/load.rs

Lines changed: 19 additions & 17 deletions
@@ -4,7 +4,7 @@ use cubecl_matmul::components::global::memory::{TensorReader, ViewDirection};
 use cubecl_matmul::components::stage::{FullStageReader, StageMemory};
 use cubecl_matmul::components::tile::Tile;
 use cubecl_matmul::components::{MatrixLayout, StageIdent};
-use cubecl_std::tensor::{View, layout::Coords3d};
+use cubecl_std::tensor::{View, layout::Coords2d};
 use std::marker::PhantomData;

 use crate::components::global::base::GlobalAttentionConfig;
@@ -41,8 +41,9 @@ pub struct DummyValueLoader<AP: AttentionPrecision, G: GlobalAttentionConfig> {

 #[cube]
 impl<AP: AttentionPrecision, G: GlobalAttentionConfig> DummyQueryLoader<AP, G> {
-    pub fn new(q_offset: u32, query: View<Line<AP::EI>, Coords3d>, #[comptime] _config: G) -> Self {
-        let tensor_reader = TensorReader::new(query, (0u32.runtime(), q_offset, 0u32.runtime()));
+    pub fn new(q_offset: u32, query: View<Line<AP::EI>, Coords2d>, #[comptime] _config: G) -> Self {
+        let query = query.slice((q_offset, 0), query.shape());
+        let tensor_reader = TensorReader::new(query);

         DummyQueryLoader::<AP, G> {
             tensor_reader,
@@ -55,14 +56,17 @@ impl<AP: AttentionPrecision, G: GlobalAttentionConfig> DummyQueryLoader<AP, G> {

         let attention_tile_size = config.stage_config().tile_config().attention_tile_size();
         let tile = Tile::<AP::EI> {
-            slice: self.tensor_reader.view.slice(
-                (
-                    self.tensor_reader.row_offset.read() * attention_tile_size.seq_q,
-                    0u32.runtime(),
-                    0u32.runtime(),
-                ),
-                attention_tile_size.query_size(),
-            ),
+            slice: self
+                .tensor_reader
+                .view
+                .slice(
+                    (
+                        self.tensor_reader.row_offset.read() * attention_tile_size.seq_q,
+                        0u32.runtime(),
+                    ),
+                    (1u32, attention_tile_size.query_size()).runtime(),
+                )
+                .to_linear_slice(),
             stride: attention_tile_size.num_cols(FlashIdent::Query),
             layout: MatrixLayout::RowMajor,
         };
@@ -73,9 +77,8 @@ impl<AP: AttentionPrecision, G: GlobalAttentionConfig> DummyQueryLoader<AP, G> {

 #[cube]
 impl<AP: AttentionPrecision, G: GlobalAttentionConfig> DummyKeyLoader<AP, G> {
-    pub fn new(key: View<Line<AP::EI>, Coords3d>, #[comptime] config: G) -> Self {
-        let tensor_reader =
-            TensorReader::new(key, (0u32.runtime(), 0u32.runtime(), 0u32.runtime()));
+    pub fn new(key: View<Line<AP::EI>, Coords2d>, #[comptime] config: G) -> Self {
+        let tensor_reader = TensorReader::new(key);
         let stage_memory = StageMemory::new::<G::ScoreStageMemoryConfig>(
             1u32,
             StageIdent::Rhs,
@@ -140,9 +143,8 @@ impl<AP: AttentionPrecision, G: GlobalAttentionConfig> DummyKeyLoader<AP, G> {

 #[cube]
 impl<AP: AttentionPrecision, G: GlobalAttentionConfig> DummyValueLoader<AP, G> {
-    pub fn new(value: View<Line<AP::EI>, Coords3d>, #[comptime] config: G) -> Self {
-        let tensor_reader =
-            TensorReader::new(value, (0u32.runtime(), 0u32.runtime(), 0u32.runtime()));
+    pub fn new(value: View<Line<AP::EI>, Coords2d>, #[comptime] config: G) -> Self {
+        let tensor_reader = TensorReader::new(value);
         let stage_memory = StageMemory::new::<G::ValueStageMemoryConfig>(
             1u32,
             StageIdent::Rhs,
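With the base offset folded into the layout (the new second argument to `SimpleGlobalLayout::new`), the loaders now operate on 2D (row, col) views, so the 3D coordinate tuples and per-reader zero offsets disappear. Tile extraction becomes a chain: slice a (1, query_size) window at the tile's row, then flatten it. A sketch of that chain, mirroring the diff:

    // Pick the tile's row within the already-offset 2D view.
    let row = self.tensor_reader.row_offset.read() * attention_tile_size.seq_q;
    // Slice a (1, query_size) window, then flatten it into a linear slice
    // for the Tile, whose stride/layout describe the row-major data.
    let slice = self
        .tensor_reader
        .view
        .slice((row, 0u32.runtime()), (1u32, attention_tile_size.query_size()).runtime())
        .to_linear_slice();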

crates/cubecl-attention/src/components/stage/base.rs

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@ use cubecl_core as cubecl;
 use cubecl_core::prelude::*;
 use cubecl_matmul::components::stage::{StageMemoryConfig, StageReaderFamily};
 use cubecl_std::CubeOption;
-use cubecl_std::tensor::{View, layout::Coords3d};
+use cubecl_std::tensor::{View, layout::Coords2d};
 use std::{fmt::Debug, hash::Hash};

 use crate::components::{
@@ -96,7 +96,7 @@ pub trait StageAttention<AP: AttentionPrecision>: 'static + Send + Sync {
         #[comptime] global_config: G,
     );

-    fn init_writer(q_offset: u32, tensor: View<Line<AP::EO>, Coords3d, ReadWrite>) -> Self::Writer;
+    fn init_writer(tensor: View<Line<AP::EO>, Coords2d, ReadWrite>) -> Self::Writer;

     fn init_fragments(
         query_reader: QueryRegisterReader<AP::EI>,

crates/cubecl-attention/src/components/stage/dummy/attention.rs

Lines changed: 3 additions & 3 deletions
@@ -3,7 +3,7 @@ use cubecl_core::prelude::*;
 use cubecl_matmul::components::{stage::StageReader, tile::loader::Strided};
 use cubecl_std::CubeOption;
 use cubecl_std::tensor::View;
-use cubecl_std::tensor::layout::Coords3d;
+use cubecl_std::tensor::layout::Coords2d;
 use std::marker::PhantomData;

 use crate::components::global::dummy::QueryRegisterReader;
@@ -83,8 +83,8 @@ impl<AP: AttentionPrecision, R: StageReader<AP::ES, TileKind = Strided>, TA: Til
         TA::write::<G>(acc, writer, stage_config.tile_config(), global_config);
     }

-    fn init_writer(q_offset: u32, out: View<Line<AP::EO>, Coords3d, ReadWrite>) -> Self::Writer {
-        TA::init_writer(q_offset, out)
+    fn init_writer(out: View<Line<AP::EO>, Coords2d, ReadWrite>) -> Self::Writer {
+        TA::init_writer(out)
     }

     fn init_fragments(

crates/cubecl-attention/src/components/tile/base.rs

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@ use cubecl_matmul::components::{
     tile::Tile,
 };
 use cubecl_std::CubeOption;
-use cubecl_std::tensor::{View, layout::Coords3d};
+use cubecl_std::tensor::{View, layout::Coords2d};

 use crate::components::global::dummy::QueryRegisterReader;
 use crate::components::{
@@ -84,7 +84,7 @@ pub trait TileAttention<AP: AttentionPrecision>: 'static + Send + Sync {
         #[comptime] global_config: G,
     );

-    fn init_writer(q_offset: u32, tensor: View<Line<AP::EO>, Coords3d, ReadWrite>) -> Self::Writer;
+    fn init_writer(tensor: View<Line<AP::EO>, Coords2d, ReadWrite>) -> Self::Writer;

     fn init_fragments(
         query_reader: QueryRegisterReader<AP::EI>,

crates/cubecl-attention/src/components/tile/dummy/attention.rs

Lines changed: 3 additions & 3 deletions
@@ -2,7 +2,7 @@ use cubecl_core as cubecl;
 use cubecl_core::prelude::*;
 use cubecl_matmul::components::tile::Tile;
 use cubecl_std::tensor::View;
-use cubecl_std::tensor::layout::Coords3d;
+use cubecl_std::tensor::layout::Coords2d;
 use cubecl_std::{CubeOption, CubeOptionExpand};
 use std::marker::PhantomData;

@@ -135,8 +135,8 @@ impl<AP: AttentionPrecision, FM: FlashMatmul<AP::FlashPrecision>> TileAttention<
         )
     }

-    fn init_writer(q_offset: u32, out: View<Line<AP::EO>, Coords3d, ReadWrite>) -> Self::Writer {
-        DummyWriter::new(out, q_offset, 0, 0)
+    fn init_writer(out: View<Line<AP::EO>, Coords2d, ReadWrite>) -> Self::Writer {
+        DummyWriter::new(out)
     }

     fn init_fragments(

crates/cubecl-attention/src/components/tile/dummy/writer.rs

Lines changed: 3 additions & 8 deletions
@@ -3,7 +3,7 @@ use cubecl_core::prelude::*;
 use cubecl_matmul::components::{global::memory::TensorWriter, stage::StageMemoryConfig as _};
 use cubecl_std::{
     div_ceil,
-    tensor::{View, layout::Coords3d},
+    tensor::{View, layout::Coords2d},
 };

 use crate::components::{FlashIdent, global::GlobalAttentionConfig};
@@ -17,14 +17,9 @@ pub struct DummyWriter<EO: Numeric> {

 #[cube]
 impl<EO: Numeric> DummyWriter<EO> {
-    pub fn new(
-        tensor: View<Line<EO>, Coords3d, ReadWrite>,
-        x_offset: u32,
-        y_offset: u32,
-        batch_offset: u32,
-    ) -> Self {
+    pub fn new(tensor: View<Line<EO>, Coords2d, ReadWrite>) -> Self {
         DummyWriter::<EO> {
-            tensor_writer: TensorWriter::new(tensor, x_offset, y_offset, batch_offset),
+            tensor_writer: TensorWriter::new(tensor),
         }
     }

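Since the view is now positioned by the caller, the writer constructor collapses from four arguments to one. A before/after sketch of the call site (the slicing line mirrors the global attention diff above):

    // Before: the writer carried its own x/y/batch offsets.
    // let writer = DummyWriter::new(out, q_offset, 0, 0);

    // After: the caller slices the view into position; the writer just wraps it.
    let out = out.slice_mut_unchecked((q_offset, 0), out.shape());
    let writer = DummyWriter::new(out);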
crates/cubecl-attention/src/lib.rs

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 #![allow(unknown_lints)] // `manual_div_ceil` only appeared in 1.83
-#![allow(clippy::manual_div_ceil)]
+#![allow(clippy::manual_div_ceil, clippy::manual_is_multiple_of)]

 mod base;
 /// Components for matrix multiplication
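The widened allow covers clippy's `manual_is_multiple_of` lint, which suggests rewriting `x % n == 0` as `x.is_multiple_of(n)`. Keeping the modulo form is consistent with the existing `unknown_lints` comment about toolchain age: `u32::is_multiple_of` is a recent addition, so the suggestion cannot be applied on older compilers (the exact motivation here is an assumption). The flagged pattern looks like:

    // clippy::manual_is_multiple_of fires on this and suggests
    // `offset.is_multiple_of(line_size)` instead.
    fn is_aligned(offset: u32, line_size: u32) -> bool {
        offset % line_size == 0
    }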

crates/cubecl-convolution/src/components/global/base.rs

Lines changed: 15 additions & 11 deletions
@@ -6,7 +6,10 @@ use cubecl_matmul::components::{
     global::StageUnloader,
     stage::{ContiguousTilingLayout, RowMajorTilingOrder},
 };
-use cubecl_std::{CubeOption, tensor::r#virtual::VirtualTensor};
+use cubecl_std::{
+    CubeOption,
+    tensor::{layout::Coords2d, r#virtual::VirtualTensor},
+};

 use crate::{
     components::{ConvGemmConfig, ConvolutionProblem, global::entry_point::ConvolutionLaunch},
@@ -44,7 +47,7 @@ pub trait GlobalConvolution<MP: MatmulPrecision>: 'static + Send + Sync {
     type Config: ConvGemmConfig;

     /// The writer used to write the results to the output feature map
-    type StageWriter: StageUnloader<AccG<MP>>;
+    type StageUnloader: StageUnloader<AccG<MP>>;
     /// The type of the tile matmul accumulator
     type Accumulators: CubeType;

@@ -58,7 +61,7 @@
         lhs_loader: Self::LhsStageLoader,
         rhs_loader: Self::RhsStageLoader,
         acc_loader: Self::AccStageLoader,
-        writer: Self::StageWriter,
+        writer: Self::StageUnloader,
         acc: &mut Self::Accumulators,
         k_range: (u32, u32),
         #[comptime] config: Self::Config,
@@ -67,17 +70,17 @@
     /// Initializes the loader for the input feature map with an appropriate layout
     fn init_lhs_loader(
         lhs: VirtualTensor<LhsG<MP>>,
-        x_offset: u32,
-        y_offset: u32,
+        offset: Coords2d,
+        view_shape: Coords2d,
         runtime_args: &RuntimeArgs,
         #[comptime] config: Self::Config,
     ) -> Self::LhsStageLoader;

     /// Initializes the loader for the weights with an appropriate layout
     fn init_rhs_loader(
         rhs: VirtualTensor<RhsG<MP>>,
-        x_offset: u32,
-        y_offset: u32,
+        offset: Coords2d,
+        view_shape: Coords2d,
         runtime_args: &RuntimeArgs,
         #[comptime] config: Self::Config,
     ) -> Self::RhsStageLoader;
@@ -86,17 +89,18 @@
     fn init_bias_loader(
         bias: CubeOption<VirtualTensor<AccG<MP>>>,
         n_offset: u32,
+        slice_size: u32,
         #[comptime] config: Self::Config,
     ) -> Self::AccStageLoader;

     /// Initializes the output feature map loader with an appropriate layout
-    fn init_writer(
+    fn init_global_writer(
         out: VirtualTensor<AccG<MP>, ReadWrite>,
-        x_offset: u32,
-        y_offset: u32,
+        offset: Coords2d,
+        view_shape: Coords2d,
         runtime_args: &RuntimeArgs,
         #[comptime] config: Self::Config,
-    ) -> Self::StageWriter;
+    ) -> Self::StageUnloader;

     /// Initializes a new accumulator for the tile matmul
     fn init_accumulator(#[comptime] config: Self::Config) -> Self::Accumulators;
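The convolution trait follows the same slice-view migration: loader and writer initializers take an (offset, view_shape) pair of `Coords2d` tuples instead of separate x/y scalars, and the writer initializer is renamed to `init_global_writer`, returning the renamed `Self::StageUnloader`. A hypothetical call site under those signatures; the offset and shape variable names are illustrative:

    // Hypothetical call: position the output view at (m_offset, n_offset)
    // with a (view_rows, view_cols) window, matching the new signature.
    let writer = Self::init_global_writer(
        out,
        (m_offset, n_offset),   // offset: Coords2d
        (view_rows, view_cols), // view_shape: Coords2d
        &runtime_args,
        config,
    );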
