From 7928505d70c73ce4a137bc8d36a280e8c97ebffc Mon Sep 17 00:00:00 2001
From: AndreSlavescu <andre.slavescu@gmail.com>
Date: Fri, 20 Mar 2026 15:01:28 -0400
Subject: [PATCH] docs updates with diagrams

---
 docs/_static/compilation-pipeline.svg |  90 ++++++++++++++
 docs/_static/gemm-tiling.svg          | 161 ++++++++++++++++++++++++++
 docs/_static/morton-swizzle.svg       | 150 ++++++++++++++++++++++++
 docs/_static/simdgroup-layout.svg     |  92 +++++++++++++++
 docs/_static/tiling-overview.svg      | 105 +++++++++++++++++
 docs/_static/unified-memory.svg       |  60 ++++++++++
 docs/conf.py                          |   2 +-
 docs/examples/fused-activations.rst   |  19 ++-
 docs/examples/layernorm.rst           |   2 +-
 docs/examples/matmul.rst              |  20 ++--
 docs/examples/softmax.rst             |  28 +----
 docs/examples/vector-add.rst          |  12 +-
 docs/getting-started/first-kernel.rst |  14 ++-
 docs/getting-started/install.rst      |   2 +-
 docs/guide/autotuning.rst             |  16 ++-
 docs/guide/language.rst               |   8 +-
 docs/guide/memory.rst                 |  20 +++-
 docs/guide/tile-ops.rst               |  40 +++----
 18 files changed, 753 insertions(+), 88 deletions(-)
 create mode 100644 docs/_static/compilation-pipeline.svg
 create mode 100644 docs/_static/gemm-tiling.svg
 create mode 100644 docs/_static/morton-swizzle.svg
 create mode 100644 docs/_static/simdgroup-layout.svg
 create mode 100644 docs/_static/tiling-overview.svg
 create mode 100644 docs/_static/unified-memory.svg
diff --git a/docs/_static/compilation-pipeline.svg b/docs/_static/compilation-pipeline.svg
new file mode 100644
index 0000000..0b7d635
--- /dev/null
+++ b/docs/_static/compilation-pipeline.svg
@@ -0,0 +1,90 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 680 920" font-family="'Comfortaa', 'Nunito', 'Varela Round', system-ui, sans-serif">
+  <defs>
+    <style>
+      @import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@400;600;700&amp;display=swap');
+    </style>
+    <marker id="arrow" viewBox="0 0 10 10" refX="5" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M 0 1 L 8 5 L 0 9 z" fill="#444"/>
+    </marker>
+  </defs>
+
+  <!-- Background -->
+  <rect width="680" height="920" fill="#fff"/>
+
+  <!-- ====== Stage 1: Input Python code ====== -->
+  <text x="340" y="36" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Input Python code</text>
+
+  <rect x="120" y="50" width="440" height="120" rx="16" fill="#fff" stroke="#333" stroke-width="2"/>
+  <text font-family="'JetBrains Mono', 'Fira Code', 'SF Mono', Menlo, monospace" font-size="13" fill="#333">
+    <tspan x="148" y="80" fill="#7c3aed">@metile.kernel</tspan>
+    <tspan x="148" y="100" fill="#7c3aed">def</tspan><tspan fill="#333"> matmul(A, B, C, M, N, K, ...):</tspan>
+    <tspan x="168" y="120">acc = metile.zeros((BM, BN))</tspan>
+    <tspan x="168" y="140" fill="#7c3aed">for</tspan><tspan fill="#333"> k </tspan><tspan fill="#7c3aed">in</tspan><tspan fill="#333"> metile.tile_range(...):</tspan>
+    <tspan x="188" y="156" fill="#888">acc = metile.dot(a, b, acc)</tspan>
+  </text>
+
+  <!-- Arrow -->
+  <line x1="340" y1="170" x2="340" y2="210" stroke="#444" stroke-width="2" marker-end="url(#arrow)"/>
+  <text x="356" y="196" font-size="12" fill="#555" font-style="italic">trace</text>
+
+  <!-- ====== Stage 2: Tile IR ====== -->
+  <text x="340" y="234" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Tile IR</text>
+
+  <rect x="120" y="248" width="440" height="100" rx="16" fill="#fce4ec" stroke="#333" stroke-width="2"/>
+  <text x="340" y="278" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Hardware-agnostic operations</text>
+
+  <rect x="160" y="292" width="360" height="40" rx="10" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="340" y="317" text-anchor="middle" font-size="13" fill="#333">Dot, TileLoad, TileStore, ForRange, Zeros, ...</text>
+
+  <!-- Arrow -->
+  <line x1="340" y1="348" x2="340" y2="388" stroke="#444" stroke-width="2" marker-end="url(#arrow)"/>
+  <text x="356" y="374" font-size="12" fill="#555" font-style="italic">lower</text>
+
+  <!-- ====== Stage 3: Metal IR ====== -->
+  <text x="340" y="412" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Metal IR</text>
+
+  <rect x="120" y="426" width="440" height="120" rx="16" fill="#fce4ec" stroke="#333" stroke-width="2"/>
+  <text x="340" y="456" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Apple GPU primitives</text>
+
+  <!-- Two sub-boxes side by side -->
+  <rect x="145" y="468" width="190" height="60" rx="10" fill="#bbdefb" stroke="#333" stroke-width="1.5"/>
+  <text x="240" y="494" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">Simdgroup MMA</text>
+  <text x="240" y="512" text-anchor="middle" font-size="11" fill="#555">M1 / M2 / M3</text>
+
+  <rect x="345" y="468" width="190" height="60" rx="10" fill="#bbdefb" stroke="#333" stroke-width="1.5"/>
+  <text x="440" y="494" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">Tensor Ops</text>
+  <text x="440" y="512" text-anchor="middle" font-size="11" fill="#555">M4+</text>
+
+  <!-- Arrow -->
+  <line x1="340" y1="546" x2="340" y2="586" stroke="#444" stroke-width="2" marker-end="url(#arrow)"/>
+  <text x="356" y="572" font-size="12" fill="#555" font-style="italic">optimize</text>
+
+  <!-- ====== Stage 4: Optimization Passes ====== -->
+  <text x="340" y="610" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Optimization Passes</text>
+
+  <rect x="120" y="624" width="440" height="56" rx="16" fill="#fce4ec" stroke="#333" stroke-width="2"/>
+  <rect x="145" y="636" width="390" height="30" rx="8" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="340" y="657" text-anchor="middle" font-size="11" fill="#333">vectorize, serpentine MMA, double-buffer, split-K, swizzle, fold</text>
+
+  <!-- Arrow -->
+  <line x1="340" y1="680" x2="340" y2="720" stroke="#444" stroke-width="2" marker-end="url(#arrow)"/>
+  <text x="356" y="706" font-size="12" fill="#555" font-style="italic">emit</text>
+
+  <!-- ====== Stage 5: MSL Source ====== -->
+  <text x="340" y="744" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">MSL Source</text>
+
+  <rect x="120" y="758" width="440" height="72" rx="16" fill="#fff" stroke="#333" stroke-width="2"/>
+  <text font-family="'JetBrains Mono', 'Fira Code', 'SF Mono', Menlo, monospace" font-size="11" fill="#555">
+    <tspan x="148" y="782">[[kernel]] void mtile_matmul(</tspan>
+    <tspan x="168" y="798">device float* A [[buffer(0)]], ...)</tspan>
+    <tspan x="148" y="814">{ ... simdgroup_multiply_accumulate ... }</tspan>
+  </text>
+
+  <!-- Arrow -->
+  <line x1="340" y1="830" x2="340" y2="860" stroke="#444" stroke-width="2" marker-end="url(#arrow)"/>
+  <text x="356" y="851" font-size="12" fill="#555" font-style="italic">xcrun metal -O2</text>
+
+  <!-- ====== Stage 6: Binary ====== -->
+  <rect x="120" y="866" width="440" height="40" rx="16" fill="#c8e6c9" stroke="#333" stroke-width="2"/>
+  <text x="340" y="892" text-anchor="middle" font-size="15" font-weight="bold" fill="#333">Metal Compute Pipeline (.metallib)</text>
+</svg>
diff --git a/docs/_static/gemm-tiling.svg b/docs/_static/gemm-tiling.svg
new file mode 100644
index 0000000..809a7d9
--- /dev/null
+++ b/docs/_static/gemm-tiling.svg
@@ -0,0 +1,161 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 780 900" font-family="'Comfortaa', 'Nunito', 'Varela Round', system-ui, sans-serif">
+  <defs>
+    <style>
+      @import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@400;600;700&amp;display=swap');
+    </style>
+    <marker id="arr" viewBox="0 0 10 10" refX="5" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M 0 1 L 8 5 L 0 9 z" fill="#444"/>
+    </marker>
+  </defs>
+
+  <rect width="780" height="900" fill="#fff"/>
+
+  <!-- Title -->
+  <text x="390" y="36" text-anchor="middle" font-size="20" font-weight="bold" fill="#2563eb">GEMM Tiling: C = A x B</text>
+
+  <!-- ====== Top row: A @ B = C ====== -->
+
+  <!-- Matrix A -->
+  <text x="110" y="82" text-anchor="middle" font-size="15" font-weight="bold" fill="#333">A</text>
+  <text x="110" y="96" text-anchor="middle" font-size="12" fill="#555">M x K</text>
+  <rect x="40" y="104" width="140" height="200" rx="12" fill="#c8e6c9" stroke="#333" stroke-width="2"/>
+  <!-- Highlighted row block (top: first BM rows) -->
+  <rect x="48" y="112" width="124" height="36" rx="6" fill="#66bb6a" opacity="0.4" stroke="#2e7d32" stroke-width="1.5" stroke-dasharray="4,3"/>
+  <text x="110" y="136" text-anchor="middle" font-size="10" font-weight="bold" fill="#1b5e20">BM rows</text>
+  <!-- Dimension labels -->
+  <text x="110" y="320" text-anchor="middle" font-size="11" fill="#888">K</text>
+  <text x="24" y="204" font-size="11" fill="#888" transform="rotate(-90, 24, 204)">M</text>
+
+  <!-- x symbol -->
+  <text x="210" y="210" text-anchor="middle" font-size="24" font-weight="bold" fill="#333">x</text>
+
+  <!-- Matrix B (K x N): width=200 for N, height=140 for K -->
+  <text x="340" y="82" text-anchor="middle" font-size="15" font-weight="bold" fill="#333">B</text>
+  <text x="340" y="96" text-anchor="middle" font-size="12" fill="#555">K x N</text>
+  <rect x="240" y="104" width="200" height="140" rx="12" fill="#bbdefb" stroke="#333" stroke-width="2"/>
+  <!-- Highlighted BK x BN block: 50px wide (quarter of the 200px N width), 62px tall (top half of the 140px K height, starting 8px inside the border) -->
+  <rect x="248" y="112" width="50" height="62" rx="6" fill="#42a5f5" opacity="0.4" stroke="#1565c0" stroke-width="1.5" stroke-dasharray="4,3"/>
+  <text x="273" y="148" text-anchor="middle" font-size="9" font-weight="bold" fill="#0d47a1">BK x BN</text>
+  <!-- Dimension labels -->
+  <text x="340" y="260" text-anchor="middle" font-size="11" fill="#888">N</text>
+  <text x="228" y="174" font-size="11" fill="#888" transform="rotate(-90, 228, 174)">K</text>
+
+  <!-- = symbol -->
+  <text x="470" y="210" text-anchor="middle" font-size="24" font-weight="bold" fill="#333">=</text>
+
+  <!-- Matrix C with grid (M x N): 240w x 200h, 4x4 grid -> 60w x 50h cells -->
+  <text x="620" y="82" text-anchor="middle" font-size="15" font-weight="bold" fill="#333">C</text>
+  <rect x="500" y="104" width="240" height="200" rx="12" fill="#fff9c4" stroke="#333" stroke-width="2"/>
+
+  <!-- 4x4 grid lines: cells are 60w x 50h -->
+  <line x1="560" y1="104" x2="560" y2="304" stroke="#e0e0e0" stroke-width="1"/>
+  <line x1="620" y1="104" x2="620" y2="304" stroke="#e0e0e0" stroke-width="1"/>
+  <line x1="680" y1="104" x2="680" y2="304" stroke="#e0e0e0" stroke-width="1"/>
+  <line x1="500" y1="154" x2="740" y2="154" stroke="#e0e0e0" stroke-width="1"/>
+  <line x1="500" y1="204" x2="740" y2="204" stroke="#e0e0e0" stroke-width="1"/>
+  <line x1="500" y1="254" x2="740" y2="254" stroke="#e0e0e0" stroke-width="1"/>
+
+  <!-- Highlighted tile (0,0): inset within first cell -->
+  <rect x="505" y="109" width="50" height="40" rx="4" fill="#ffb74d" opacity="0.5" stroke="#e65100" stroke-width="1.5" stroke-dasharray="4,3"/>
+
+  <!-- Tile labels: centered in each 60x50 cell -->
+  <text x="530" y="134" text-anchor="middle" font-size="9" font-weight="bold" fill="#bf360c">(0,0)</text>
+  <text x="590" y="134" text-anchor="middle" font-size="8" fill="#999">(0,1)</text>
+  <text x="650" y="134" text-anchor="middle" font-size="8" fill="#999">(0,2)</text>
+  <text x="710" y="134" text-anchor="middle" font-size="8" fill="#999">(0,3)</text>
+  <text x="530" y="184" text-anchor="middle" font-size="8" fill="#999">(1,0)</text>
+  <text x="590" y="184" text-anchor="middle" font-size="8" fill="#999">(1,1)</text>
+  <text x="650" y="184" text-anchor="middle" font-size="8" fill="#999">(1,2)</text>
+  <text x="710" y="184" text-anchor="middle" font-size="8" fill="#999">(1,3)</text>
+  <text x="530" y="234" text-anchor="middle" font-size="8" fill="#999">(2,0)</text>
+  <text x="590" y="234" text-anchor="middle" font-size="8" fill="#999">(2,1)</text>
+  <text x="650" y="234" text-anchor="middle" font-size="8" fill="#999">(2,2)</text>
+  <text x="710" y="234" text-anchor="middle" font-size="8" fill="#999">(2,3)</text>
+  <text x="530" y="284" text-anchor="middle" font-size="8" fill="#999">(3,0)</text>
+  <text x="590" y="284" text-anchor="middle" font-size="8" fill="#999">(3,1)</text>
+  <text x="650" y="284" text-anchor="middle" font-size="8" fill="#999">(3,2)</text>
+  <text x="710" y="284" text-anchor="middle" font-size="8" fill="#999">(3,3)</text>
+
+  <!-- Dimension labels -->
+  <text x="620" y="320" text-anchor="middle" font-size="11" fill="#888">N</text>
+  <text x="488" y="204" font-size="11" fill="#888" transform="rotate(-90, 488, 204)">M</text>
+
+  <!-- Grid note -->
+  <text x="620" y="340" text-anchor="middle" font-size="12" fill="#555">grid = (ceil(M/BM), ceil(N/BN))</text>
+  <text x="620" y="358" text-anchor="middle" font-size="12" fill="#555">one program instance per tile</text>
+
+  <!-- ====== Arrow down to K-loop section ====== -->
+  <line x1="390" y1="370" x2="390" y2="410" stroke="#444" stroke-width="2" marker-end="url(#arr)"/>
+
+  <!-- ====== K-loop detail ====== -->
+  <text x="390" y="440" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Inside pid=(0,0): the K-loop</text>
+
+  <!-- Iteration 1 box -->
+  <rect x="40" y="460" width="700" height="130" rx="14" fill="#fce4ec" stroke="#333" stroke-width="2"/>
+  <text x="390" y="485" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">Iteration k = 0</text>
+
+  <!-- A tile -->
+  <rect x="70" y="500" width="120" height="70" rx="10" fill="#c8e6c9" stroke="#333" stroke-width="1.5"/>
+  <text x="130" y="530" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">A tile</text>
+  <text x="130" y="550" text-anchor="middle" font-size="11" fill="#555">BM x BK</text>
+
+  <!-- x -->
+  <text x="215" y="540" text-anchor="middle" font-size="18" font-weight="bold" fill="#333">x</text>
+
+  <!-- B tile -->
+  <rect x="240" y="500" width="120" height="70" rx="10" fill="#bbdefb" stroke="#333" stroke-width="1.5"/>
+  <text x="300" y="530" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">B tile</text>
+  <text x="300" y="550" text-anchor="middle" font-size="11" fill="#555">BK x BN</text>
+
+  <!-- += -->
+  <text x="392" y="540" text-anchor="middle" font-size="18" font-weight="bold" fill="#333">+=</text>
+
+  <!-- acc -->
+  <rect x="430" y="500" width="130" height="70" rx="10" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="495" y="528" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">acc</text>
+  <text x="495" y="548" text-anchor="middle" font-size="11" fill="#555">BM x BN</text>
+  <text x="495" y="562" text-anchor="middle" font-size="10" fill="#999">(in registers)</text>
+
+  <!-- dot label -->
+  <rect x="590" y="515" width="120" height="36" rx="8" fill="#e8eaf6" stroke="#333" stroke-width="1.5"/>
+  <text x="650" y="539" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">metile.dot()</text>
+
+  <!-- Arrow between iterations -->
+  <line x1="390" y1="590" x2="390" y2="618" stroke="#444" stroke-width="2" marker-end="url(#arr)"/>
+  <text x="420" y="610" font-size="12" fill="#555" font-style="italic">k += BK</text>
+
+  <!-- Iteration 2 box -->
+  <rect x="40" y="624" width="700" height="130" rx="14" fill="#fce4ec" stroke="#333" stroke-width="2"/>
+  <text x="390" y="649" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">Iteration k = BK</text>
+
+  <rect x="70" y="664" width="120" height="70" rx="10" fill="#c8e6c9" stroke="#333" stroke-width="1.5"/>
+  <text x="130" y="694" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">A tile</text>
+  <text x="130" y="714" text-anchor="middle" font-size="11" fill="#555">BM x BK</text>
+
+  <text x="215" y="704" text-anchor="middle" font-size="18" font-weight="bold" fill="#333">x</text>
+
+  <rect x="240" y="664" width="120" height="70" rx="10" fill="#bbdefb" stroke="#333" stroke-width="1.5"/>
+  <text x="300" y="694" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">B tile</text>
+  <text x="300" y="714" text-anchor="middle" font-size="11" fill="#555">BK x BN</text>
+
+  <text x="392" y="704" text-anchor="middle" font-size="18" font-weight="bold" fill="#333">+=</text>
+
+  <rect x="430" y="664" width="130" height="70" rx="10" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="495" y="692" text-anchor="middle" font-size="13" font-weight="bold" fill="#333">acc</text>
+  <text x="495" y="712" text-anchor="middle" font-size="11" fill="#555">BM x BN</text>
+  <text x="495" y="726" text-anchor="middle" font-size="10" fill="#999">(accumulated)</text>
+
+  <rect x="590" y="679" width="120" height="36" rx="8" fill="#e8eaf6" stroke="#333" stroke-width="1.5"/>
+  <text x="650" y="703" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">metile.dot()</text>
+
+  <!-- More iterations dots -->
+  <text x="390" y="778" text-anchor="middle" font-size="18" fill="#999">. . .   repeated K / BK times   . . .</text>
+
+  <!-- Arrow to store -->
+  <line x1="390" y1="790" x2="390" y2="818" stroke="#444" stroke-width="2" marker-end="url(#arr)"/>
+
+  <!-- Final store -->
+  <rect x="200" y="824" width="380" height="56" rx="14" fill="#c8e6c9" stroke="#333" stroke-width="2"/>
+  <text x="390" y="852" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">metile.tile_store(C, ..., acc)</text>
+  <text x="390" y="870" text-anchor="middle" font-size="11" fill="#555">write BM x BN result to global memory</text>
+</svg>
diff --git a/docs/_static/morton-swizzle.svg b/docs/_static/morton-swizzle.svg
new file mode 100644
index 0000000..9c86b7f
--- /dev/null
+++ b/docs/_static/morton-swizzle.svg
@@ -0,0 +1,150 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 720 440" font-family="'Comfortaa', 'Nunito', 'Varela Round', system-ui, sans-serif">
+  <defs>
+    <style>
+      @import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@400;600;700&amp;display=swap');
+    </style>
+    <marker id="marr" viewBox="0 0 10 10" refX="5" refY="5" markerWidth="5" markerHeight="5" orient="auto-start-reverse">
+      <path d="M 0 1 L 8 5 L 0 9 z" fill="#2563eb"/>
+    </marker>
+  </defs>
+
+  <rect width="720" height="440" fill="#fff"/>
+
+  <text x="360" y="32" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Tile Scheduling: Morton vs Linear</text>
+
+  <!-- 4 colors, one per 2x2 block group -->
+  <!-- Block A (tiles 0-3):  #ef9a9a red    -->
+  <!-- Block B (tiles 4-7):  #90caf9 blue   -->
+  <!-- Block C (tiles 8-11): #a5d6a7 green  -->
+  <!-- Block D (tiles 12-15): #fff59d yellow -->
+
+  <!-- ====== Left: Linear ====== -->
+  <text x="155" y="66" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Linear (row-major)</text>
+
+  <rect x="40" y="82" width="230" height="230" rx="10" fill="#fff" stroke="#333" stroke-width="2"/>
+
+  <!-- Row 0 -->
+  <rect x="48" y="90" width="52" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="74" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">0</text>
+
+  <rect x="104" y="90" width="52" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="130" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">1</text>
+
+  <rect x="160" y="90" width="52" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="186" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">2</text>
+
+  <rect x="216" y="90" width="46" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="239" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">3</text>
+
+  <!-- Row 1 -->
+  <rect x="48" y="146" width="52" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="74" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">4</text>
+
+  <rect x="104" y="146" width="52" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="130" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">5</text>
+
+  <rect x="160" y="146" width="52" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="186" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">6</text>
+
+  <rect x="216" y="146" width="46" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="239" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">7</text>
+
+  <!-- Row 2 -->
+  <rect x="48" y="202" width="52" height="52" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="74" y="233" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">8</text>
+
+  <rect x="104" y="202" width="52" height="52" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="130" y="233" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">9</text>
+
+  <rect x="160" y="202" width="52" height="52" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="186" y="233" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">10</text>
+
+  <rect x="216" y="202" width="46" height="52" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="239" y="233" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">11</text>
+
+  <!-- Row 3 -->
+  <rect x="48" y="258" width="52" height="46" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="74" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">12</text>
+
+  <rect x="104" y="258" width="52" height="46" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="130" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">13</text>
+
+  <rect x="160" y="258" width="52" height="46" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="186" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">14</text>
+
+  <rect x="216" y="258" width="46" height="46" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="239" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">15</text>
+
+  <text x="155" y="332" text-anchor="middle" font-size="11" fill="#888">tiles read in row order,</text>
+  <text x="155" y="348" text-anchor="middle" font-size="11" fill="#888">poor L2 reuse for A and B</text>
+
+  <!-- ====== Right: Morton ====== -->
+  <text x="555" y="66" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Morton (Z-order)</text>
+
+  <rect x="440" y="82" width="230" height="230" rx="10" fill="#fff" stroke="#333" stroke-width="2"/>
+
+  <!-- Row 0: tiles 0,1 (red), 4,5 (blue) -->
+  <rect x="448" y="90" width="52" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="474" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">0</text>
+
+  <rect x="504" y="90" width="52" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="530" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">1</text>
+
+  <rect x="560" y="90" width="52" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="586" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">4</text>
+
+  <rect x="616" y="90" width="46" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="639" y="121" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">5</text>
+
+  <!-- Row 1: tiles 2,3 (red), 6,7 (blue) -->
+  <rect x="448" y="146" width="52" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="474" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">2</text>
+
+  <rect x="504" y="146" width="52" height="52" rx="6" fill="#ef9a9a" stroke="#333" stroke-width="1"/>
+  <text x="530" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">3</text>
+
+  <rect x="560" y="146" width="52" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="586" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">6</text>
+
+  <rect x="616" y="146" width="46" height="52" rx="6" fill="#90caf9" stroke="#333" stroke-width="1"/>
+  <text x="639" y="177" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">7</text>
+
+  <!-- Row 2: tiles 8,9 (green), 12,13 (yellow) -->
+  <rect x="448" y="202" width="52" height="52" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="474" y="233" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">8</text>
+
+  <rect x="504" y="202" width="52" height="52" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="530" y="233" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">9</text>
+
+  <rect x="560" y="202" width="52" height="52" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="586" y="233" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">12</text>
+
+  <rect x="616" y="202" width="46" height="52" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="639" y="233" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">13</text>
+
+  <!-- Row 3: tiles 10,11 (green), 14,15 (yellow) -->
+  <rect x="448" y="258" width="52" height="46" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="474" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">10</text>
+
+  <rect x="504" y="258" width="52" height="46" rx="6" fill="#a5d6a7" stroke="#333" stroke-width="1"/>
+  <text x="530" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">11</text>
+
+  <rect x="560" y="258" width="52" height="46" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="586" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">14</text>
+
+  <rect x="616" y="258" width="46" height="46" rx="6" fill="#fff59d" stroke="#333" stroke-width="1"/>
+  <text x="639" y="286" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">15</text>
+
+  <!-- Z-path arrows in first 2x2 block -->
+  <line x1="474" y1="126" x2="524" y2="126" stroke="#2563eb" stroke-width="2" marker-end="url(#marr)"/>
+  <line x1="530" y1="132" x2="480" y2="158" stroke="#2563eb" stroke-width="2" marker-end="url(#marr)"/>
+  <line x1="474" y1="182" x2="524" y2="182" stroke="#2563eb" stroke-width="2" marker-end="url(#marr)"/>
+
+  <text x="555" y="332" text-anchor="middle" font-size="11" fill="#888">2x2 blocks processed together,</text>
+  <text x="555" y="348" text-anchor="middle" font-size="11" fill="#888">neighbors share A-rows and B-cols in L2</text>
+
+  <!-- ====== Bottom explanation ====== -->
+  <rect x="100" y="370" width="520" height="56" rx="12" fill="#dbeafe" stroke="#2563eb" stroke-width="1.5"/>
+  <text x="360" y="394" text-anchor="middle" font-size="12" font-weight="bold" fill="#2563eb">Why Morton?</text>
+  <text x="360" y="414" text-anchor="middle" font-size="11" fill="#555">Tiles 0,1,2,3 share two A-rows and two B-columns in L2 cache. Less memory traffic.</text>
+</svg>
diff --git a/docs/_static/simdgroup-layout.svg b/docs/_static/simdgroup-layout.svg
new file mode 100644
index 0000000..4c8e2f1
--- /dev/null
+++ b/docs/_static/simdgroup-layout.svg
@@ -0,0 +1,92 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 400" font-family="'Comfortaa', 'Nunito', 'Varela Round', system-ui, sans-serif">
+  <defs>
+    <style>
+      @import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@400;600;700&amp;display=swap');
+    </style>
+  </defs>
+
+  <rect width="640" height="400" fill="#fff"/>
+
+  <text x="320" y="32" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Simdgroup Layout (WM=4, WN=4)</text>
+  <text x="320" y="52" text-anchor="middle" font-size="12" fill="#888">128 x 128 output tile, 16 simdgroups, each handles 32 x 32</text>
+
+  <!-- Outer tile border -->
+  <rect x="60" y="72" width="520" height="280" rx="12" fill="#fff9c4" stroke="#333" stroke-width="2"/>
+
+  <!-- Column headers -->
+  <text x="132" y="90" text-anchor="middle" font-size="10" fill="#888">col 0..31</text>
+  <text x="257" y="90" text-anchor="middle" font-size="10" fill="#888">col 32..63</text>
+  <text x="382" y="90" text-anchor="middle" font-size="10" fill="#888">col 64..95</text> <text x="507" y="90" text-anchor="middle" font-size="10" fill="#888">col 96..127</text>
+
+  <!-- Row 0 -->
+  <rect x="75" y="98" width="115" height="55" rx="8" fill="#ef9a9a" stroke="#333" stroke-width="1.5"/>
+  <text x="132" y="122" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(0,0)</text>
+  <text x="132" y="138" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="200" y="98" width="115" height="55" rx="8" fill="#ce93d8" stroke="#333" stroke-width="1.5"/>
+  <text x="257" y="122" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(0,1)</text>
+  <text x="257" y="138" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="325" y="98" width="115" height="55" rx="8" fill="#90caf9" stroke="#333" stroke-width="1.5"/>
+  <text x="382" y="122" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(0,2)</text>
+  <text x="382" y="138" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="450" y="98" width="115" height="55" rx="8" fill="#80cbc4" stroke="#333" stroke-width="1.5"/>
+  <text x="507" y="122" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(0,3)</text>
+  <text x="507" y="138" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <!-- Row 1 -->
+  <rect x="75" y="161" width="115" height="55" rx="8" fill="#a5d6a7" stroke="#333" stroke-width="1.5"/>
+  <text x="132" y="185" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(1,0)</text>
+  <text x="132" y="201" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="200" y="161" width="115" height="55" rx="8" fill="#fff59d" stroke="#333" stroke-width="1.5"/>
+  <text x="257" y="185" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(1,1)</text>
+  <text x="257" y="201" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="325" y="161" width="115" height="55" rx="8" fill="#ffab91" stroke="#333" stroke-width="1.5"/>
+  <text x="382" y="185" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(1,2)</text>
+  <text x="382" y="201" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="450" y="161" width="115" height="55" rx="8" fill="#b0bec5" stroke="#333" stroke-width="1.5"/>
+  <text x="507" y="185" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(1,3)</text>
+  <text x="507" y="201" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <!-- Row 2 -->
+  <rect x="75" y="224" width="115" height="55" rx="8" fill="#b39ddb" stroke="#333" stroke-width="1.5"/>
+  <text x="132" y="248" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(2,0)</text>
+  <text x="132" y="264" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="200" y="224" width="115" height="55" rx="8" fill="#80deea" stroke="#333" stroke-width="1.5"/>
+  <text x="257" y="248" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(2,1)</text>
+  <text x="257" y="264" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="325" y="224" width="115" height="55" rx="8" fill="#ef9a9a" stroke="#333" stroke-width="1.5"/>
+  <text x="382" y="248" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(2,2)</text>
+  <text x="382" y="264" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="450" y="224" width="115" height="55" rx="8" fill="#a5d6a7" stroke="#333" stroke-width="1.5"/>
+  <text x="507" y="248" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(2,3)</text>
+  <text x="507" y="264" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <!-- Row 3 -->
+  <rect x="75" y="287" width="115" height="55" rx="8" fill="#90caf9" stroke="#333" stroke-width="1.5"/>
+  <text x="132" y="311" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(3,0)</text>
+  <text x="132" y="327" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="200" y="287" width="115" height="55" rx="8" fill="#ce93d8" stroke="#333" stroke-width="1.5"/>
+  <text x="257" y="311" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(3,1)</text>
+  <text x="257" y="327" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="325" y="287" width="115" height="55" rx="8" fill="#fff59d" stroke="#333" stroke-width="1.5"/>
+  <text x="382" y="311" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(3,2)</text>
+  <text x="382" y="327" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <rect x="450" y="287" width="115" height="55" rx="8" fill="#ffab91" stroke="#333" stroke-width="1.5"/>
+  <text x="507" y="311" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">sg(3,3)</text>
+  <text x="507" y="327" text-anchor="middle" font-size="9" fill="#666">32 x 32</text>
+
+  <!-- Bottom note -->
+  <text x="320" y="374" text-anchor="middle" font-size="11" fill="#555">Each simdgroup = 32 threads computing one subtile independently.</text>
+  <text x="320" y="392" text-anchor="middle" font-size="11" fill="#555">sg_row = sgid / WN,   sg_col = sgid % WN</text>
+</svg>
diff --git a/docs/_static/tiling-overview.svg b/docs/_static/tiling-overview.svg
new file mode 100644
index 0000000..7a856de
--- /dev/null
+++ b/docs/_static/tiling-overview.svg
@@ -0,0 +1,105 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 760 520" font-family="'Comfortaa', 'Nunito', 'Varela Round', system-ui, sans-serif">
+  <defs>
+    <style>
+      @import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@400;600;700&amp;display=swap');
+    </style>
+    <marker id="arr" viewBox="0 0 10 10" refX="5" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M 0 1 L 8 5 L 0 9 z" fill="#444"/>
+    </marker>
+  </defs>
+
+  <rect width="760" height="520" fill="#fff"/>
+
+  <text x="380" y="32" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">How Tiling Works</text>
+
+  <!-- ====== Matrix C tiled grid ====== -->
+  <text x="130" y="66" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Output matrix C (M x N)</text>
+
+  <rect x="30" y="80" width="200" height="200" rx="10" fill="#dbeafe" stroke="#333" stroke-width="2"/>
+
+  <!-- Grid lines -->
+  <line x1="130" y1="80" x2="130" y2="280" stroke="#93c5fd" stroke-width="1"/>
+  <line x1="30" y1="180" x2="230" y2="180" stroke="#93c5fd" stroke-width="1"/>
+
+  <!-- Tile labels -->
+  <text x="80" y="136" text-anchor="middle" font-size="12" font-weight="bold" fill="#1e40af">pid=(0,0)</text>
+  <text x="80" y="154" text-anchor="middle" font-size="10" fill="#999">BM x BN</text>
+  <text x="180" y="136" text-anchor="middle" font-size="11" fill="#888">pid=(0,1)</text>
+  <text x="80" y="236" text-anchor="middle" font-size="11" fill="#888">pid=(1,0)</text>
+  <text x="180" y="236" text-anchor="middle" font-size="11" fill="#888">pid=(1,1)</text>
+
+  <!-- Highlight tile (0,0) -->
+  <rect x="30" y="80" width="100" height="100" rx="6" fill="#60a5fa" opacity="0.35" stroke="#1d4ed8" stroke-width="2.5"/>
+
+  <!-- Dimension labels -->
+  <text x="130" y="300" text-anchor="middle" font-size="11" fill="#888">N</text>
+  <text x="18" y="180" font-size="11" fill="#888" transform="rotate(-90,18,180)">M</text>
+
+  <text x="130" y="322" text-anchor="middle" font-size="11" fill="#555">grid = (ceil(M/BM), ceil(N/BN))</text>
+
+  <!-- ====== Arrow ====== -->
+  <line x1="250" y1="170" x2="300" y2="170" stroke="#444" stroke-width="2" marker-end="url(#arr)"/>
+  <text x="275" y="160" text-anchor="middle" font-size="10" fill="#555" font-style="italic">zoom in</text>
+
+  <!-- ====== Inside pid=(0,0): K-loop ====== -->
+  <rect x="310" y="56" width="430" height="440" rx="14" fill="#fce4ec" stroke="#333" stroke-width="2"/>
+  <text x="525" y="84" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Inside pid=(0,0)</text>
+
+  <!-- K iteration 0 -->
+  <text x="525" y="114" text-anchor="middle" font-size="11" font-weight="bold" fill="#555">k = 0</text>
+
+  <rect x="340" y="124" width="90" height="64" rx="8" fill="#c8e6c9" stroke="#333" stroke-width="1.5"/>
+  <text x="385" y="152" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">A tile</text>
+  <text x="385" y="168" text-anchor="middle" font-size="9" fill="#666">BM x BK</text>
+
+  <text x="445" y="160" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">x</text>
+
+  <rect x="462" y="124" width="70" height="64" rx="8" fill="#bbdefb" stroke="#333" stroke-width="1.5"/>
+  <text x="497" y="152" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">B tile</text>
+  <text x="497" y="168" text-anchor="middle" font-size="9" fill="#666">BK x BN</text>
+
+  <text x="553" y="160" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">+=</text>
+
+  <rect x="578" y="124" width="80" height="64" rx="8" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="618" y="152" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">acc</text>
+  <text x="618" y="168" text-anchor="middle" font-size="9" fill="#666">BM x BN</text>
+
+  <!-- Arrow down -->
+  <line x1="525" y1="198" x2="525" y2="226" stroke="#444" stroke-width="1.5" marker-end="url(#arr)"/>
+  <text x="545" y="216" font-size="10" fill="#555" font-style="italic">k += BK</text>
+
+  <!-- K iteration 1 -->
+  <text x="525" y="246" text-anchor="middle" font-size="11" font-weight="bold" fill="#555">k = BK</text>
+
+  <rect x="340" y="256" width="90" height="64" rx="8" fill="#c8e6c9" stroke="#333" stroke-width="1.5"/>
+  <text x="385" y="284" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">A tile</text>
+  <text x="385" y="300" text-anchor="middle" font-size="9" fill="#666">BM x BK</text>
+
+  <text x="445" y="292" text-anchor="middle" font-size="16" font-weight="bold" fill="#333">x</text>
+
+  <rect x="462" y="256" width="70" height="64" rx="8" fill="#bbdefb" stroke="#333" stroke-width="1.5"/>
+  <text x="497" y="284" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">B tile</text>
+  <text x="497" y="300" text-anchor="middle" font-size="9" fill="#666">BK x BN</text>
+
+  <text x="553" y="292" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">+=</text>
+
+  <rect x="578" y="256" width="80" height="64" rx="8" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="618" y="284" text-anchor="middle" font-size="11" font-weight="bold" fill="#333">acc</text>
+  <text x="618" y="300" text-anchor="middle" font-size="9" fill="#666">accumulated</text>
+
+  <!-- Dots -->
+  <text x="525" y="348" text-anchor="middle" font-size="14" fill="#aaa">. . .   K / BK iterations   . . .</text>
+
+  <!-- Arrow to store -->
+  <line x1="525" y1="362" x2="525" y2="390" stroke="#444" stroke-width="1.5" marker-end="url(#arr)"/>
+
+  <!-- Store -->
+  <rect x="390" y="396" width="270" height="44" rx="10" fill="#c8e6c9" stroke="#333" stroke-width="2"/>
+  <text x="525" y="424" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">tile_store(C, ..., acc)</text>
+
+  <!-- dot() callout -->
+  <rect x="680" y="138" width="48" height="38" rx="8" fill="#e8eaf6" stroke="#333" stroke-width="1.5"/>
+  <text x="704" y="162" text-anchor="middle" font-size="9" font-weight="bold" fill="#333">dot()</text>
+  <rect x="680" y="270" width="48" height="38" rx="8" fill="#e8eaf6" stroke="#333" stroke-width="1.5"/>
+  <text x="704" y="294" text-anchor="middle" font-size="9" font-weight="bold" fill="#333">dot()</text>
+</svg>
diff --git a/docs/_static/unified-memory.svg b/docs/_static/unified-memory.svg
new file mode 100644
index 0000000..bf6bdde
--- /dev/null
+++ b/docs/_static/unified-memory.svg
@@ -0,0 +1,60 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 680 400" font-family="'Comfortaa', 'Nunito', 'Varela Round', system-ui, sans-serif">
+  <defs>
+    <style>
+      @import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@400;600;700&amp;display=swap');
+    </style>
+    <marker id="arrow" viewBox="0 0 10 10" refX="5" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M 0 1 L 8 5 L 0 9 z" fill="#444"/>
+    </marker>
+    <marker id="arrow-both" viewBox="0 0 10 10" refX="5" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M 0 1 L 8 5 L 0 9 z" fill="#2563eb"/>
+    </marker>
+  </defs>
+
+  <rect width="680" height="400" fill="#fff"/>
+
+  <!-- Title -->
+  <text x="340" y="34" text-anchor="middle" font-size="17" font-weight="bold" fill="#2563eb">Unified Memory on Apple Silicon</text>
+
+  <!-- ====== CPU side ====== -->
+  <rect x="40" y="60" width="200" height="130" rx="14" fill="#fce4ec" stroke="#333" stroke-width="2"/>
+  <text x="140" y="90" text-anchor="middle" font-size="15" font-weight="bold" fill="#333">CPU</text>
+
+  <rect x="60" y="102" width="160" height="36" rx="8" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="140" y="126" text-anchor="middle" font-size="12" fill="#333">Python / numpy</text>
+
+  <rect x="60" y="146" width="160" height="30" rx="8" fill="#fff" stroke="#333" stroke-width="1.5"/>
+  <text x="140" y="166" text-anchor="middle" font-size="11" fill="#555" font-family="'JetBrains Mono', 'Fira Code', monospace">out.numpy()</text>
+
+  <!-- ====== GPU side ====== -->
+  <rect x="440" y="60" width="200" height="130" rx="14" fill="#bbdefb" stroke="#333" stroke-width="2"/>
+  <text x="540" y="90" text-anchor="middle" font-size="15" font-weight="bold" fill="#333">Apple GPU</text>
+
+  <rect x="460" y="102" width="160" height="36" rx="8" fill="#fff9c4" stroke="#333" stroke-width="1.5"/>
+  <text x="540" y="126" text-anchor="middle" font-size="12" fill="#333">Metal compute kernel</text>
+
+  <rect x="460" y="146" width="160" height="30" rx="8" fill="#fff" stroke="#333" stroke-width="1.5"/>
+  <text x="540" y="166" text-anchor="middle" font-size="11" fill="#555" font-family="'JetBrains Mono', 'Fira Code', monospace">device float* A</text>
+
+  <!-- ====== Shared memory block (center bottom) ====== -->
+  <rect x="155" y="260" width="370" height="100" rx="16" fill="#c8e6c9" stroke="#333" stroke-width="2"/>
+  <text x="340" y="295" text-anchor="middle" font-size="15" font-weight="bold" fill="#333">Shared Physical Memory</text>
+
+  <rect x="195" y="310" width="280" height="34" rx="8" fill="#fff" stroke="#333" stroke-width="1.5"/>
+  <text x="335" y="333" text-anchor="middle" font-size="12" fill="#555" font-family="'JetBrains Mono', 'Fira Code', monospace">metile.Buffer(data=np_array)</text>
+
+  <!-- ====== Arrows: CPU <-> Memory ====== -->
+  <line x1="140" y1="190" x2="260" y2="260" stroke="#2563eb" stroke-width="2.5" marker-end="url(#arrow-both)"/>
+  <line x1="260" y1="260" x2="140" y2="190" stroke="#2563eb" stroke-width="2.5" marker-end="url(#arrow-both)"/>
+
+  <!-- ====== Arrows: GPU <-> Memory ====== -->
+  <line x1="540" y1="190" x2="420" y2="260" stroke="#2563eb" stroke-width="2.5" marker-end="url(#arrow-both)"/>
+  <line x1="420" y1="260" x2="540" y2="190" stroke="#2563eb" stroke-width="2.5" marker-end="url(#arrow-both)"/>
+
+  <!-- ====== "zero copy" callout ====== -->
+  <rect x="260" y="210" width="160" height="32" rx="10" fill="#dbeafe" stroke="#2563eb" stroke-width="1.5"/>
+  <text x="340" y="231" text-anchor="middle" font-size="13" font-weight="bold" fill="#2563eb">zero copy</text>
+
+  <!-- ====== Bottom note ====== -->
+  <text x="340" y="386" text-anchor="middle" font-size="11" fill="#888">Both CPU and GPU read/write the same physical memory. No transfers needed.</text>
+</svg>
diff --git a/docs/conf.py b/docs/conf.py
index 4b0035a..e74c011 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,5 +1,5 @@
 project = "meTile"
-copyright = "2025, Andre Slavescu"
+copyright = "2026, Andre Slavescu"
 author = "Andre Slavescu"
 
 extensions = [
diff --git a/docs/examples/fused-activations.rst b/docs/examples/fused-activations.rst
index 565a159..70c3166 100644
--- a/docs/examples/fused-activations.rst
+++ b/docs/examples/fused-activations.rst
@@ -8,7 +8,7 @@ to run different computations on different simdgroup subsets within a single ker
 Simple Activations
 ------------------
 
-Element-wise kernels follow the same pattern as vector add — load, compute, store:
+Element-wise kernels follow the same pattern as vector add (load, compute, store):
 
 .. code-block:: python
 
@@ -39,7 +39,7 @@ Fused GEMM + Activation
 ------------------------
 
 When an activation follows a ``dot`` operation, the compiler fuses it into the GEMM epilogue.
-The activation runs on register-resident data — no global memory round-trip:
+The activation runs on register-resident data, with no global memory round-trip:
 
 .. code-block:: python
 
@@ -54,7 +54,7 @@ The activation runs on register-resident data — no global memory round-trip:
            a = metile.tile_load(A, pid_m * BLOCK_M, k, K, (BLOCK_M, BLOCK_K))
            b = metile.tile_load(B, k, pid_n * BLOCK_N, N, (BLOCK_K, BLOCK_N))
            acc = metile.dot(a, b, acc)
-       # Fused GELU epilogue — runs on accumulator registers
+       # Fused GELU epilogue, runs on accumulator registers
        acc = acc / (1.0 + metile.exp(-1.702 * acc))
        metile.tile_store(C, pid_m * BLOCK_M, pid_n * BLOCK_N, N, acc, (BLOCK_M, BLOCK_N))
 
@@ -64,7 +64,7 @@ Simdgroup Roles
 
 Apple GPUs organize threads into 32-thread **simdgroups**. A threadgroup can contain
 multiple simdgroups. With ``simdgroup_role``, you can assign different work to different
-simdgroup subsets — useful for computing multiple outputs in a single dispatch:
+simdgroup subsets, which is useful for computing multiple outputs in a single dispatch:
 
 .. code-block:: python
 
@@ -85,16 +85,13 @@ simdgroup subsets — useful for computing multiple outputs in a single dispatch
            metile.store(out_sqrt + offs, metile.sqrt(metile.abs(x)), mask=mask)
 
 With ``num_roles=2``, the threadgroup's simdgroups are split in half. Role 0 computes
-exponentials while role 1 computes square roots — simultaneously, in the same kernel launch.
-
-This is useful when you need multiple derived outputs from the same input and want to
-avoid the overhead of multiple kernel dispatches.
+exponentials while role 1 computes square roots simultaneously, in the same kernel launch.
 
 
 GEGLU (Gated GELU)
 -------------------
 
-A practical use of simdgroup roles — computing the gate and up projections of GEGLU
+A practical use of simdgroup roles is computing the gate and up projections of GEGLU
 in parallel:
 
 .. code-block:: python
@@ -121,6 +118,6 @@ Concepts Introduced
 
 - Element-wise activation patterns
 - ``metile.exp`` for activation functions
-- Fused GEMM epilogues — zero-cost post-GEMM operations
-- ``metile.simdgroup_role`` — split work across simdgroup subsets
+- Fused GEMM epilogues: zero-cost post-GEMM operations
+- ``metile.simdgroup_role``: split work across simdgroup subsets
 - Multiple outputs from a single kernel
diff --git a/docs/examples/layernorm.rst b/docs/examples/layernorm.rst
index 282f34e..bbc3fb3 100644
--- a/docs/examples/layernorm.rst
+++ b/docs/examples/layernorm.rst
@@ -68,5 +68,5 @@ Concepts Introduced
 - Three-pass algorithm (mean, variance, normalize)
 - Scalar accumulators across tiled loops
 - ``metile.sum`` reduction
-- ``metile.sqrt`` — element-wise square root
+- ``metile.sqrt``: element-wise square root
 - Loading separate weight/bias arrays (shared across all rows)
diff --git a/docs/examples/matmul.rst b/docs/examples/matmul.rst
index 20ca67d..3f49e57 100644
--- a/docs/examples/matmul.rst
+++ b/docs/examples/matmul.rst
@@ -30,7 +30,7 @@ Basic GEMM
 Launching
 ---------
 
-The grid is 2D — one program instance per output tile:
+The grid is 2D, with one program instance per output tile:
 
 .. code-block:: python
 
@@ -64,8 +64,8 @@ The compiler maps ``dot`` to the appropriate hardware:
 Fused GEMM + ReLU
 ------------------
 
-Element-wise operations after the GEMM loop are fused into the kernel's epilogue —
-they run on register-resident data with zero extra memory traffic:
+Element-wise operations after the GEMM loop are fused into the kernel's epilogue.
+They run on register-resident data with zero extra memory traffic:
 
 .. code-block:: python
 
@@ -80,7 +80,7 @@ they run on register-resident data with zero extra memory traffic:
            a = metile.tile_load(A, pid_m * BLOCK_M, k, K, (BLOCK_M, BLOCK_K))
            b = metile.tile_load(B, k, pid_n * BLOCK_N, N, (BLOCK_K, BLOCK_N))
            acc = metile.dot(a, b, acc)
-       acc = metile.where(acc > 0, acc, 0)   # fused ReLU — no global memory round-trip
+       acc = metile.where(acc > 0, acc, 0)   # fused ReLU, no global memory round-trip
        metile.tile_store(C, pid_m * BLOCK_M, pid_n * BLOCK_N, N, acc, (BLOCK_M, BLOCK_N))
 
 
@@ -132,9 +132,9 @@ See :doc:`/guide/autotuning` for the full autotuning guide.
 Concepts Introduced
 -------------------
 
-- ``metile.zeros`` — register-resident accumulator initialization
-- ``metile.dot`` — tile-level matrix multiply-accumulate
-- ``metile.tile_load`` / ``metile.tile_store`` — 2D strided memory access
-- 2D grids — ``kernel[(grid_m, grid_n)]``
-- Fused epilogues — element-wise ops after GEMM are free
-- Tile swizzle — cache-friendly scheduling patterns
+- ``metile.zeros``: register-resident accumulator initialization
+- ``metile.dot``: tile-level matrix multiply-accumulate
+- ``metile.tile_load`` / ``metile.tile_store``: 2D strided memory access
+- 2D grids: ``kernel[(grid_m, grid_n)]``
+- Fused epilogues: element-wise ops after GEMM are free
+- Tile swizzle: cache-friendly scheduling patterns
diff --git a/docs/examples/softmax.rst b/docs/examples/softmax.rst
index 9c5fc28..e172b1f 100644
--- a/docs/examples/softmax.rst
+++ b/docs/examples/softmax.rst
@@ -53,30 +53,12 @@ Each program instance handles one row. The grid is 1D with one instance per row:
    softmax[(rows,)](X, Out, cols, BLOCK=256)
 
 
-How It Works
-------------
-
-The kernel makes three passes over each row:
-
-1. **Find max** — ``metile.maximum`` computes element-wise max across tiles, then
-   ``metile.max`` reduces the tile to a scalar. This is needed for numerical stability
-   (subtracting the max prevents overflow in ``exp``).
-
-2. **Sum exponentials** — accumulates ``exp(x - m)`` across all tiles, then
-   ``metile.sum`` reduces to a scalar denominator.
-
-3. **Normalize** — divides each ``exp(x - m)`` by the sum.
-
-Each pass iterates over the row in chunks of ``BLOCK`` elements using ``tile_range``.
-The ``mask`` ensures correctness when ``N`` is not a multiple of ``BLOCK``.
-
-
 Concepts Introduced
 -------------------
 
-- ``metile.tile_range`` — tiling loop for iterating over a dimension
-- ``metile.maximum`` / ``metile.max`` — element-wise max and reduction
-- ``metile.sum`` — sum reduction
-- ``metile.exp`` — element-wise exponential
-- Multi-pass algorithms — reading the same data multiple times in different passes
+- ``metile.tile_range``: tiling loop for iterating over a dimension
+- ``metile.maximum`` / ``metile.max``: element-wise max and reduction
+- ``metile.sum``: sum reduction
+- ``metile.exp``: element-wise exponential
+- Multi-pass algorithms: reading the same data multiple times in different passes
 - Scalar accumulators (``m``, ``s``) carried across loop iterations
diff --git a/docs/examples/vector-add.rst b/docs/examples/vector-add.rst
index 6d0f08b..c28f0e7 100644
--- a/docs/examples/vector-add.rst
+++ b/docs/examples/vector-add.rst
@@ -37,9 +37,9 @@ The simplest meTile kernel: add two arrays element by element.
 Concepts Introduced
 -------------------
 
-- ``@metile.kernel`` — compile a Python function to Metal
-- ``metile.program_id`` — which program instance am I?
-- ``metile.arange`` — tile of consecutive indices
-- ``metile.load`` / ``metile.store`` — masked memory access
-- ``metile.Buffer`` — zero-copy GPU memory
-- ``kernel[grid]()`` — launch with a grid of instances
+- ``@metile.kernel``: compile a Python function to Metal
+- ``metile.program_id``: which program instance am I?
+- ``metile.arange``: tile of consecutive indices
+- ``metile.load`` / ``metile.store``: masked memory access
+- ``metile.Buffer``: zero-copy GPU memory
+- ``kernel[grid]()``: launch with a grid of instances
diff --git a/docs/getting-started/first-kernel.rst b/docs/getting-started/first-kernel.rst
index f703011..3dace45 100644
--- a/docs/getting-started/first-kernel.rst
+++ b/docs/getting-started/first-kernel.rst
@@ -30,7 +30,7 @@ Let's break this down line by line.
    Device pointers to GPU memory. These map to ``device float*`` in Metal.
 
 ``N``
-   A runtime scalar — passed as a ``constant int&`` to the shader.
+   A runtime scalar, passed as a ``constant int&`` to the shader.
 
 ``BLOCK: metile.constexpr``
    A **compile-time constant**. The value is baked directly into the shader. Changing it
@@ -75,7 +75,7 @@ Launching
 
 ``metile.Buffer``
    Wraps a Metal buffer in unified memory. CPU and GPU share the same physical memory on
-   Apple Silicon — there is no copy between host and device.
+   Apple Silicon, so there is no copy between host and device.
 
 ``metile.Buffer.zeros((N,))``
    Allocates a zeroed buffer of ``N`` float32 elements.
@@ -100,6 +100,10 @@ When you call ``add[grid](...)``, meTile:
 5. **Compiles** with ``xcrun metal -O2`` (or JIT if Xcode is unavailable)
 6. **Dispatches** the compute pipeline on the GPU
 
+.. image:: /_static/compilation-pipeline.svg
+   :alt: meTile compilation pipeline: Python to Tile IR to Metal IR to MSL to GPU
+   :width: 100%
+
 You can inspect any stage with the ``METILE_DEBUG`` environment variable:
 
 .. code-block:: bash
@@ -112,6 +116,6 @@ You can inspect any stage with the ``METILE_DEBUG`` environment variable:
 What's Next
 -----------
 
-- :doc:`/guide/language` — full language reference for what you can write inside ``@metile.kernel``
-- :doc:`/examples/softmax` — a more complex kernel with reductions and multiple passes
-- :doc:`/examples/matmul` — tile-level matrix multiply with ``dot`` and ``tile_load``
+- :doc:`/guide/language` for the full language reference
+- :doc:`/examples/softmax` for a more complex kernel with reductions and multiple passes
+- :doc:`/examples/matmul` for tile-level matrix multiply with ``dot`` and ``tile_load``
diff --git a/docs/getting-started/install.rst b/docs/getting-started/install.rst
index 62f2551..0927668 100644
--- a/docs/getting-started/install.rst
+++ b/docs/getting-started/install.rst
@@ -5,7 +5,7 @@ Requirements
 ------------
 
 - macOS 13 (Ventura) or later
-- Apple Silicon (M1, M2, M3, M4 — any variant)
+- Apple Silicon (M1 or later)
 - Python 3.10+
 
 Install
diff --git a/docs/guide/autotuning.rst b/docs/guide/autotuning.rst
index 3adb846..68d0f7a 100644
--- a/docs/guide/autotuning.rst
+++ b/docs/guide/autotuning.rst
@@ -52,6 +52,20 @@ On the first call with new key values, the autotuner:
 
 Subsequent calls with the same key values use the cached winner with zero overhead.
 
+.. code-block:: text
+
+   First call (M=1024, N=1024, K=1024):
+   +--------------------------------------------------+
+   |  Config(BM=64,  BN=64,  BK=32):   1.26ms         |
+   |  Config(BM=128, BN=128, BK=64):   0.62ms  <--    |  winner cached
+   |  Config(BM=128, BN=128, BK=128):  0.91ms         |
+   +--------------------------------------------------+
+
+   Subsequent calls (same M, N, K):
+   +--------------------------------------------------+
+   |  cached -> Config(BM=128, BN=128, BK=64)         |  no re-tuning
+   +--------------------------------------------------+
+
 
 Config Object
 -------------
@@ -117,6 +131,6 @@ fast dispatcher that skips all Python overhead on subsequent calls:
 
    dispatch = autotuned_matmul[grid].prepare(A, B, C, M, N, K)
 
-   # Hot loop — minimal Python overhead per call
+   # Hot path with minimal Python overhead
    for _ in range(1000):
        dispatch()
diff --git a/docs/guide/language.rst b/docs/guide/language.rst
index 18cb2f2..fb3a45b 100644
--- a/docs/guide/language.rst
+++ b/docs/guide/language.rst
@@ -2,7 +2,7 @@ Language Reference
 ==================
 
 meTile provides a Python eDSL (embedded domain-specific language) for writing GPU kernels. Functions
-decorated with ``@metile.kernel`` are traced and compiled to Metal shaders — they are not executed
+decorated with ``@metile.kernel`` are traced and compiled to Metal shaders. They are not executed
 as regular Python.
 
 This page documents every construct available inside a ``@metile.kernel`` function.
@@ -19,9 +19,9 @@ Kernel Definition
 
 Parameters are either:
 
-- **Pointers** — numpy arrays or ``metile.Buffer`` objects become ``device float*`` in Metal
-- **Scalars** — Python ints/floats become ``constant int&`` or ``constant float&``
-- **Constexprs** — annotated with ``metile.constexpr``, baked into the shader at compile time
+- **Pointers**: numpy arrays or ``metile.Buffer`` objects become ``device float*`` in Metal
+- **Scalars**: Python ints/floats become ``constant int&`` or ``constant float&``
+- **Constexprs**: annotated with ``metile.constexpr``, baked into the shader at compile time
 
 Constexprs are passed as keyword arguments at launch:
 
diff --git a/docs/guide/memory.rst b/docs/guide/memory.rst
index 94ba84f..afe95f4 100644
--- a/docs/guide/memory.rst
+++ b/docs/guide/memory.rst
@@ -1,9 +1,13 @@
 Memory Model
 ============
 
-Apple Silicon has a **unified memory architecture** — the CPU and GPU share the same physical
+Apple Silicon has a **unified memory architecture** where the CPU and GPU share the same physical
 memory. meTile exposes this directly through ``metile.Buffer``.
 
+.. image:: /_static/unified-memory.svg
+   :alt: Unified memory: CPU and GPU both access the same physical memory through metile.Buffer
+   :width: 100%
+
 
 Buffers
 -------
@@ -13,7 +17,7 @@ Buffers
    import numpy as np
    import metile
 
-   # Create from numpy (zero-copy — the GPU reads the same memory)
+   # Create from numpy (zero-copy: the GPU reads the same memory)
    x = metile.Buffer(data=np.random.randn(1024).astype(np.float32))
 
    # Allocate zeroed
@@ -71,6 +75,16 @@ memory access:
    x = metile.load(X + offs, mask=mask)       # masked-off lanes read 0
    metile.store(Out + offs, x, mask=mask)      # masked-off lanes are skipped
 
+.. code-block:: text
+
+   N = 10, BLOCK = 4, pid = 2 (last instance)
+
+   offs = [8, 9, 10, 11]
+   mask = [T, T, F, F]  # indices 10 and 11 are out of bounds
+
+   load:  reads x[8], x[9], returns 0 for indices 10, 11
+   store: writes out[8], out[9], skips indices 10, 11
+
 Masking is essential for correctness. Without it, the last program instance would read/write
 past the end of the array.
 
@@ -85,5 +99,5 @@ For kernels that need inter-thread communication within a threadgroup, use share
    buf = metile.shared(size=256, dtype="f32")
    metile.barrier()   # synchronize all threads in the threadgroup
 
-Shared memory is threadgroup-local — it is not visible to other threadgroups. Use
+Shared memory is threadgroup-local and not visible to other threadgroups. Use
 ``metile.barrier()`` to synchronize access within a threadgroup.
diff --git a/docs/guide/tile-ops.rst b/docs/guide/tile-ops.rst
index fd68120..7fbe675 100644
--- a/docs/guide/tile-ops.rst
+++ b/docs/guide/tile-ops.rst
@@ -12,13 +12,13 @@ The Two Backends
 meTile automatically selects the best backend for your hardware when compiling GEMM kernels:
 
 **Simdgroup Matrix (M1/M2/M3)**
-   Uses ``simdgroup_matrix<float, 8, 8>`` — Apple's 8x8 matrix multiply-accumulate
+   Uses ``simdgroup_matrix<float, 8, 8>``, Apple's 8x8 matrix multiply-accumulate
    primitive. Each simdgroup (32 threads) collaboratively computes an 8x8 tile.
    The compiler tiles the output across multiple simdgroups and uses threadgroup
    (shared) memory to stage data.
 
 **Metal 4 Tensor Ops (M4+)**
-   Uses ``matmul2d`` with ``cooperative_tensor`` — Metal 4's hardware matrix multiply
+   Uses ``matmul2d`` with ``cooperative_tensor``, Metal 4's hardware matrix multiply
    descriptors. Each simdgroup independently loads data from device memory into
    register-resident cooperative tensors and runs the MMA. No threadgroup memory needed.
 
@@ -29,20 +29,12 @@ your hardware and chooses the right path.
 How Tiling Works
 ----------------
 
-A GEMM kernel tiles the computation into blocks:
+A GEMM kernel tiles the computation into blocks. Each program instance computes
+one output tile, iterating over K to accumulate partial products:
 
-.. code-block:: text
-
-   Output C (M x N)              Each tile is BLOCK_M x BLOCK_N
-   ┌─────────┬─────────┐
-   │ (0,0)   │ (0,1)   │        Each program instance computes one tile.
-   │ 128x128 │ 128x128 │        The K dimension is tiled with BLOCK_K.
-   ├─────────┼─────────┤
-   │ (1,0)   │ (1,1)   │        grid = (ceil(M/BLOCK_M), ceil(N/BLOCK_N))
-   │ 128x128 │ 128x128 │
-   └─────────┴─────────┘
-
-Inside each tile, the K-loop accumulates partial results:
+.. image:: /_static/tiling-overview.svg
+   :alt: Output matrix tiled into blocks, with K-loop detail showing tile_load and dot accumulation
+   :width: 100%
 
 .. code-block:: python
 
@@ -81,7 +73,11 @@ The tile sizes are compile-time constants that control how the hardware is used:
      - 2, 4
 
 ``WM`` and ``WN`` control how many simdgroups tile the output block. With ``WM=4, WN=4``,
-16 simdgroups each handle a ``(BLOCK_M/WM) x (BLOCK_N/WN)`` = 32x32 subtile.
+16 simdgroups each handle a ``(BLOCK_M/WM) x (BLOCK_N/WN)`` = 32x32 subtile:
+
+.. image:: /_static/simdgroup-layout.svg
+   :alt: 4x4 simdgroup grid layout, 16 simdgroups each handling a 32x32 subtile
+   :width: 100%
 
 
 Fused Epilogues
@@ -94,7 +90,7 @@ and fuses them into the kernel. No extra memory traffic:
 
    acc = metile.dot(a, b, acc)
 
-   # These are fused into the GEMM — no global memory round-trip
+   # These are fused into the GEMM, with no global memory round-trip
    acc = metile.where(acc > 0, acc, 0)      # ReLU
    acc = acc * scale                          # scale
    acc = metile.exp(acc)                      # unary
@@ -109,14 +105,14 @@ Tile Scheduling
 For 2D grids, the order in which tiles are assigned to threadgroups affects L2 cache locality.
 meTile supports several scheduling patterns:
 
-**Morton (Z-order)** — default
-   Tiles are assigned in 2x2 blocks following a Z-curve. Adjacent threadgroups share
-   A-row and B-column data in L2 cache.
+.. image:: /_static/morton-swizzle.svg
+   :alt: Morton Z-order vs linear tile scheduling, showing how 2x2 blocks share L2 cache
+   :width: 100%
 
-**Diagonal**
+**Diagonal**:
    Column assignment is rotated by the row index. Distributes memory traffic.
 
-**Linear**
+**Linear**:
    Simple row-major assignment. No locality optimization.
 
 The compiler applies Morton scheduling by default. You can override it: