diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0123d919fc1..4c4e4c0f6f0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -219,7 +219,7 @@ jobs:
             target: wasm32-unknown-unknown
             env:
               rustflags: "RUSTFLAGS='-A warnings --cfg getrandom_backend=\"wasm_js\"'"
-            args: "--target wasm32-unknown-unknown --exclude vortex --exclude vortex-cuda --exclude vortex-datafusion --exclude vortex-duckdb --exclude vortex-tui --exclude vortex-zstd"
+            args: "--target wasm32-unknown-unknown --exclude vortex --exclude vortex-cuda --exclude vortex-nvcomp --exclude vortex-datafusion --exclude vortex-duckdb --exclude vortex-tui --exclude vortex-zstd"
     steps:
       - uses: runs-on/action@v2
         with:
diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
index 9a07cd281a2..1c48a31c7df 100644
--- a/.github/workflows/fuzz.yml
+++ b/.github/workflows/fuzz.yml
@@ -21,6 +21,8 @@ jobs:
     uses: ./.github/workflows/run-fuzzer.yml
     with:
       fuzz_target: file_io
+      family: "m8g.large"
+      image: "ubuntu24-full-arm64"
     secrets:
       R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
       R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
@@ -69,6 +71,8 @@ jobs:
     uses: ./.github/workflows/run-fuzzer.yml
     with:
       fuzz_target: array_ops
+      family: "m8g.large"
+      image: "ubuntu24-full-arm64"
     secrets:
       R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
       R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
@@ -103,6 +107,45 @@ jobs:
     uses: ./.github/workflows/run-fuzzer.yml
     with:
       fuzz_target: compress_roundtrip
+      family: "m8g.large"
+      image: "ubuntu24-full-arm64"
     secrets:
       R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
       R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+
+  # ============================================================================
+  # GPU Compress Fuzzer (CUDA)
+  # ============================================================================
+  gpu_compress_fuzz:
+    name: "GPU Compress Fuzz"
+    uses: ./.github/workflows/run-fuzzer.yml
+    with:
+      fuzz_target: compress_gpu
+      family: "g4dn"
+      image: "ubuntu24-gpu-x64"
+      extra_features: "cuda"
+    secrets:
+      R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+      R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+
+#  report-gpu-compress-fuzz-failures:
+#    name: "Report GPU Compress Fuzz Failures"
+#    needs: gpu_compress_fuzz
+#    if: always() && needs.gpu_compress_fuzz.outputs.crashes_found == 'true'
+#    permissions:
+#      issues: write
+#      contents: read
+#      id-token: write
+#      pull-requests: read
+#    uses: ./.github/workflows/report-fuzz-crash.yml
+#    with:
+#      fuzz_target: compress_gpu
+#      crash_file: ${{ needs.gpu_compress_fuzz.outputs.first_crash_name }}
+#      artifact_url: ${{ needs.gpu_compress_fuzz.outputs.artifact_url }}
+#      artifact_name: compress_gpu-crash-artifacts
+#      logs_artifact_name: compress_gpu-logs
+#      branch: ${{ github.ref_name }}
+#      commit: ${{ github.sha }}
+#    secrets:
+#      claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+#      gh_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/run-fuzzer.yml b/.github/workflows/run-fuzzer.yml
index 302555d7b54..c649b951256 100644
--- a/.github/workflows/run-fuzzer.yml
+++ b/.github/workflows/run-fuzzer.yml
@@ -12,6 +12,21 @@ on:
         required: false
         type: number
         default: 7200
+      family:
+        description: "Runner family (e.g., m8g.large for CPU, g5+g4dn+g6 for GPU)"
+        required: false
+        type: string
+        default: "m8g.large"
+      image:
+        description: "Runner image (e.g., ubuntu24-full-arm64, ubuntu24-gpu-x64)"
+        required: false
+        type: string
+        default: "ubuntu24-full-arm64"
+      extra_features:
+        description: "Extra cargo features to enable (e.g., cuda)"
+        required: false
+        type: string
+        default: ""
     outputs:
       crashes_found:
         description: "Whether crashes were found"
@@ -34,8 +49,8 @@ jobs:
     timeout-minutes: 230  # almost 4 hours
     runs-on:
       - runs-on=${{ github.run_id }}
-      - family=m8g.large
-      - image=ubuntu24-full-arm64
+      - family=${{ inputs.family }}
+      - image=${{ inputs.image }}
       - disk=large
       - extras=s3-cache
       - tag=${{ inputs.fuzz_target }}-fuzz
@@ -43,11 +58,6 @@ jobs:
       crashes_found: ${{ steps.check.outputs.crashes_found }}
       first_crash_name: ${{ steps.check.outputs.first_crash_name }}
       artifact_url: ${{ steps.upload_artifacts.outputs.artifact-url }}
-    env:
-      AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
-      AWS_REGION: "us-east-1"
-      AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
     steps:
       - uses: runs-on/action@v2
         with:
@@ -70,6 +80,11 @@ jobs:
 
       - name: Restore corpus
         shell: bash
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+          AWS_REGION: "us-east-1"
+          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
         run: |
           CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
           CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
@@ -99,8 +114,13 @@ jobs:
       - name: Run fuzzing target
         id: fuzz
         run: |
+          FEATURES_FLAG=""
+          if [ -n "${{ inputs.extra_features }}" ]; then
+            FEATURES_FLAG="--features ${{ inputs.extra_features }}"
+          fi
           RUSTFLAGS="--cfg vortex_nightly" RUST_BACKTRACE=1 \
             cargo +nightly fuzz run --release --debug-assertions \
+            $FEATURES_FLAG \
             ${{ inputs.fuzz_target }} -- \
             -max_total_time=${{ inputs.max_time }} -rss_limit_mb=0 \
             2>&1 | tee fuzz_output.log
@@ -149,6 +169,11 @@ jobs:
 
       - name: Persist corpus
         shell: bash
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+          AWS_REGION: "us-east-1"
+          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
         run: |
           CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
           CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
diff --git a/.gitignore b/.gitignore
index ffae91d865a..a68417f916f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -229,6 +229,3 @@ sweep.timestamp
 
 # CUDA
 *.ptx
-
-# nvCOMP SDK (downloaded at build time)
-vortex-cuda/nvcomp/sdk/
diff --git a/Cargo.lock b/Cargo.lock
index d36bdffab8f..6f86172083b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2504,6 +2504,7 @@ dependencies = [
  "tokio",
  "url",
  "vortex-bench",
+ "vortex-cuda",
  "vortex-datafusion",
 ]
 
@@ -3814,6 +3815,7 @@ dependencies = [
  "url",
  "vortex",
  "vortex-bench",
+ "vortex-cuda",
  "vortex-duckdb",
 ]
 
@@ -10094,6 +10096,7 @@ dependencies = [
  "vortex-buffer",
  "vortex-bytebool",
  "vortex-compute",
+ "vortex-cuda",
  "vortex-datetime-parts",
  "vortex-decimal-byte-parts",
  "vortex-dtype",
@@ -10643,10 +10646,12 @@ dependencies = [
  "itertools 0.14.0",
  "libfuzzer-sys",
  "strum 0.27.2",
+ "tokio",
  "vortex",
  "vortex-array",
  "vortex-btrblocks",
  "vortex-buffer",
+ "vortex-cuda",
  "vortex-dtype",
  "vortex-error",
  "vortex-file",
@@ -10797,6 +10802,7 @@ name = "vortex-nvcomp"
 version = "0.1.0"
 dependencies = [
  "bindgen",
+ "libloading 0.8.9",
  "reqwest",
  "tar",
  "xz2",
diff --git a/Cargo.toml b/Cargo.toml
index 86582a0c114..835f6aa3ebb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -150,6 +150,7 @@ jiff = "0.2.0"
 kanal = "0.1.1"
 lending-iterator = "0.1.7"
 libfuzzer-sys = "0.4"
+libloading = "0.8"
 log = { version = "0.4.21" }
 loom = { version = "0.7", features = ["checkpoint"] }
 memmap2 = "0.9.5"
@@ -271,6 +272,7 @@ vortex-zstd = { version = "0.1.0", path = "./encodings/zstd", default-features =
 
 # No version constraints for unpublished crates.
 vortex-bench = { path = "./vortex-bench", default-features = false }
+vortex-cuda = { path = "./vortex-cuda", default-features = false }
 vortex-duckdb = { path = "./vortex-duckdb", default-features = false }
 
 [workspace.dependencies.getrandom_v03]
diff --git a/benchmarks/datafusion-bench/Cargo.toml b/benchmarks/datafusion-bench/Cargo.toml
index 3e5ea7064b3..4133cad2fae 100644
--- a/benchmarks/datafusion-bench/Cargo.toml
+++ b/benchmarks/datafusion-bench/Cargo.toml
@@ -36,10 +36,14 @@ parking_lot = { workspace = true }
 tokio = { workspace = true, features = ["full"] }
 url = { workspace = true }
 vortex-bench = { workspace = true }
+vortex-cuda = { workspace = true, optional = true }
 vortex-datafusion = { workspace = true }
 
 [build-dependencies]
 get_dir = { workspace = true }
 
+[features]
+cuda = ["dep:vortex-cuda"]
+
 [lints]
 workspace = true
diff --git a/benchmarks/duckdb-bench/Cargo.toml b/benchmarks/duckdb-bench/Cargo.toml
index 61b03b5c413..2407b938cb3 100644
--- a/benchmarks/duckdb-bench/Cargo.toml
+++ b/benchmarks/duckdb-bench/Cargo.toml
@@ -23,10 +23,14 @@ tracing = { workspace = true }
 url = { workspace = true }
 vortex = { workspace = true }
 vortex-bench = { workspace = true }
+vortex-cuda = { workspace = true, optional = true }
 vortex-duckdb = { workspace = true }
 
 [build-dependencies]
 get_dir = { workspace = true }
 
+[features]
+cuda = ["dep:vortex-cuda"]
+
 [lints]
 workspace = true
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index a9e2119bdb4..7226f353960 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -22,6 +22,7 @@ default = ["native"]
 native = ["libfuzzer-sys", "zstd", "vortex-file", "vortex/files"]
 wasmfuzz = []
 zstd = ["vortex/zstd"]
+cuda = ["vortex-cuda", "tokio"]
 
 [dependencies]
 # Always needed - arbitrary is used for input generation
@@ -48,6 +49,10 @@ vortex-utils = { workspace = true }
 libfuzzer-sys = { workspace = true, optional = true }
 vortex-file = { workspace = true, optional = true }
 
+# GPU support dependencies (optional, only for CUDA fuzzing)
+tokio = { workspace = true, features = ["rt", "macros"], optional = true }
+vortex-cuda = { workspace = true, optional = true }
+
 [lints]
 workspace = true
 
@@ -82,3 +87,11 @@ name = "compress_roundtrip"
 path = "fuzz_targets/compress_roundtrip.rs"
 test = false
 required-features = ["native"]
+
+[[bin]]
+bench = false
+doc = false
+name = "compress_gpu"
+path = "fuzz_targets/compress_gpu.rs"
+test = false
+required-features = ["native", "cuda"]
diff --git a/fuzz/fuzz_targets/compress_gpu.rs b/fuzz/fuzz_targets/compress_gpu.rs
new file mode 100644
index 00000000000..eea7ff6c5e3
--- /dev/null
+++ b/fuzz/fuzz_targets/compress_gpu.rs
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![no_main]
+#![allow(clippy::unwrap_used, clippy::result_large_err)]
+
+use libfuzzer_sys::Corpus;
+use libfuzzer_sys::fuzz_target;
+use vortex_error::vortex_panic;
+use vortex_fuzz::FuzzCompressGpu;
+use vortex_fuzz::run_compress_gpu;
+
+fuzz_target!(|fuzz: FuzzCompressGpu| -> Corpus {
+    // The GPU fuzzer is async, so drive it to completion on a current-thread tokio runtime.
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+    let outcome = runtime.block_on(run_compress_gpu(fuzz));
+    match outcome {
+        Ok(true) => Corpus::Keep,
+        Ok(false) => Corpus::Reject,
+        Err(e) => {
+            vortex_panic!("{e}");
+        }
+    }
+});
diff --git a/fuzz/src/gpu/mod.rs b/fuzz/src/gpu/mod.rs
new file mode 100644
index 00000000000..0ad80c7a75b
--- /dev/null
+++ b/fuzz/src/gpu/mod.rs
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! GPU fuzzer module for testing CUDA decompression.
+//!
+//! This module generates arbitrary instances of GPU-supported compressed encodings,
+//! then verifies that GPU decompression produces the same results as CPU decompression.
+
+use arbitrary::Arbitrary;
+use arbitrary::Result;
+use arbitrary::Unstructured;
+use vortex_array::ArrayRef;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ArbitraryDictArray;
+use vortex_dtype::Nullability;
+use vortex_dtype::PType;
+
+use crate::error::Backtrace;
+use crate::error::VortexFuzzError;
+use crate::error::VortexFuzzResult;
+
+/// The encoding family that a generated GPU fuzz input uses.
+#[derive(Debug, Clone, Copy)]
+pub enum GpuEncodingKind {
+    /// Dictionary encoding with GPU take support.
+    Dict,
+}
+
+impl<'a> Arbitrary<'a> for GpuEncodingKind {
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        // Dict is the only variant today; widen the range when more encodings are added.
+        Ok(match u.int_in_range(0..=0)? {
+            0 => GpuEncodingKind::Dict,
+            _ => unreachable!(),
+        })
+    }
+}
+
+/// A single fuzz case for the GPU decompression fuzzer.
+#[derive(Debug)]
+pub struct FuzzCompressGpu {
+    /// The encoded array that gets decompressed on both the CPU and the GPU.
+    pub array: ArrayRef,
+}
+
+impl<'a> Arbitrary<'a> for FuzzCompressGpu {
+    /// Picks an encoding kind, then builds an array of that kind from the fuzzer bytes.
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        let array = match GpuEncodingKind::arbitrary(u)? {
+            GpuEncodingKind::Dict => {
+                // Dict already has Arbitrary support; values are limited to primitive
+                // dtypes for GPU compatibility.
+                let dtype = arbitrary_gpu_primitive_dtype(u)?;
+                let generated = ArbitraryDictArray::with_dtype(u, &dtype, None)?;
+                generated.0.into_array()
+            }
+        };
+
+        Ok(Self { array })
+    }
+}
+
+/// Draws a nullability, then one of ten integer/float `PType`s, as a primitive `DType`.
+fn arbitrary_gpu_primitive_dtype(u: &mut Unstructured) -> Result<vortex_dtype::DType> {
+    let nullability: Nullability = u.arbitrary()?;
+    let ptype = match u.int_in_range(0..=9)? {
+        0 => PType::U8,
+        1 => PType::U16,
+        2 => PType::U32,
+        3 => PType::U64,
+        4 => PType::I8,
+        5 => PType::I16,
+        6 => PType::I32,
+        7 => PType::I64,
+        8 => PType::F32,
+        9 => PType::F64,
+        _ => unreachable!(),
+    };
+    Ok(vortex_dtype::DType::Primitive(ptype, nullability))
+}
+
+/// Differential fuzz check: decode one array on the CPU and on the GPU and compare.
+///
+/// Steps:
+/// 1. Return `Ok(false)` right away when `vortex_cuda::cuda_available()` is false.
+/// 2. Canonicalize the array on the CPU; this is the reference result.
+/// 3. Execute the same array through a CUDA execution context.
+/// 4. Copy the GPU result back to the host with `CanonicalCudaExt::to_host`.
+/// 5. Compare dtype, then length, then every element in index order.
+///
+/// A fresh `VortexSession` and CUDA execution context are created on every call, and
+/// every `Err` carries a backtrace captured where the failure was detected.
+///
+/// Returns:
+/// - `Ok(true)` - CPU and GPU results agree, keep the input in the corpus
+/// - `Ok(false)` - skipped because CUDA is unavailable, reject from the corpus
+/// - `Err(_)` - a Vortex call failed or the two results differ
+#[allow(clippy::result_large_err)]
+pub async fn run_compress_gpu(fuzz: FuzzCompressGpu) -> VortexFuzzResult<bool> {
+    use vortex::VortexSessionDefault;
+    use vortex::session::VortexSession;
+    use vortex_cuda::CanonicalCudaExt;
+    use vortex_cuda::CudaSession;
+    use vortex_cuda::executor::CudaArrayExt;
+    use vortex_error::VortexExpect;
+
+    // Nothing to compare against without CUDA, so skip the input.
+    if !vortex_cuda::cuda_available() {
+        return Ok(false);
+    }
+
+    let array = fuzz.array;
+    // Input length: checked against the GPU result and used as the loop bound.
+    let expected_len = array.len();
+
+    // CPU path: canonicalize to obtain the reference result.
+    let cpu_canonical = array
+        .to_canonical()
+        .map_err(|e| VortexFuzzError::VortexError(e, Backtrace::capture()))?;
+
+    // GPU path: execute the same array through a CUDA execution context.
+    // NOTE(review): `vortex_expect` presumably panics if the context cannot be created.
+    let session = VortexSession::default();
+    let mut cuda_ctx =
+        CudaSession::create_execution_ctx(&session).vortex_expect("cannot create session");
+    let gpu_canonical = array
+        .clone()
+        .execute_cuda(&mut cuda_ctx)
+        .await
+        .map_err(|e| VortexFuzzError::VortexError(e, Backtrace::capture()))?;
+
+    // Copy the GPU result back to the host so it can be inspected.
+    let gpu_host_canonical = gpu_canonical
+        .to_host()
+        .await
+        .map_err(|e| VortexFuzzError::VortexError(e, Backtrace::capture()))?;
+
+    // Convert both canonical results with `into_array` so they share one comparison path.
+    let cpu_array = cpu_canonical.into_array();
+    let gpu_array = gpu_host_canonical.into_array();
+
+    // NOTE(review): the literal `0` passed to each mismatch variant below looks like a
+    // step index (this fuzzer has a single step) - confirm against `VortexFuzzError`.
+    // The GPU result must report the same dtype as the CPU reference.
+    if cpu_array.dtype() != gpu_array.dtype() {
+        return Err(VortexFuzzError::DTypeMismatch(
+            cpu_array,
+            gpu_array,
+            0,
+            Backtrace::capture(),
+        ));
+    }
+
+    // The GPU result must have as many elements as the input array. `array` itself is
+    // handed to the error so the failing input can be reported.
+    let gpu_len = gpu_array.len();
+    if expected_len != gpu_len {
+        return Err(VortexFuzzError::LengthMismatch(
+            expected_len,
+            gpu_len,
+            array,
+            gpu_array,
+            0,
+            Backtrace::capture(),
+        ));
+    }
+
+    // Compare scalars in index order and report the first position that differs.
+    for index in 0..expected_len {
+        let expected = cpu_array
+            .scalar_at(index)
+            .map_err(|e| VortexFuzzError::VortexError(e, Backtrace::capture()))?;
+        let actual = gpu_array
+            .scalar_at(index)
+            .map_err(|e| VortexFuzzError::VortexError(e, Backtrace::capture()))?;
+
+        if expected != actual {
+            return Err(VortexFuzzError::ArrayNotEqual(
+                expected,
+                actual,
+                index,
+                cpu_array,
+                gpu_array,
+                0,
+                Backtrace::capture(),
+            ));
+        }
+    }
+
+    Ok(true)
+}
diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs
index dd0b3022642..781f31a1c0b 100644
--- a/fuzz/src/lib.rs
+++ b/fuzz/src/lib.rs
@@ -10,6 +10,10 @@ pub mod error;
 // File module only available for native builds (requires vortex-file which uses tokio)
 #[cfg(not(target_arch = "wasm32"))]
 pub mod file;
+
+// GPU fuzzer module (only available when cuda feature is enabled)
+#[cfg(feature = "cuda")]
+pub mod gpu;
 pub use array::Action;
 pub use array::CompressorStrategy;
 pub use array::ExpectedValue;
@@ -20,6 +24,10 @@ pub use compress::FuzzCompressRoundtrip;
 pub use compress::run_compress_roundtrip;
 #[cfg(not(target_arch = "wasm32"))]
 pub use file::FuzzFileAction;
+#[cfg(feature = "cuda")]
+pub use gpu::FuzzCompressGpu;
+#[cfg(feature = "cuda")]
+pub use gpu::run_compress_gpu;
 
 // Runtime initialization - platform-specific
 #[cfg(not(target_arch = "wasm32"))]
diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs
index 49b9ce3479c..d22b1acc326 100644
--- a/vortex-cuda/benches/dict_cuda.rs
+++ b/vortex-cuda/benches/dict_cuda.rs
@@ -123,7 +123,7 @@ fn benchmark_dict_u32_u8(c: &mut Criterion) {
             &dict_array,
             |b, dict_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     // Get values and codes arrays
@@ -187,7 +187,7 @@ fn benchmark_dict_u32_u16(c: &mut Criterion) {
             &dict_array,
             |b, dict_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let values: Vec<u32> = (0..4096).map(|i| i * 100).collect();
@@ -250,7 +250,7 @@ fn benchmark_dict_u64_u8(c: &mut Criterion) {
             &dict_array,
             |b, dict_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let values: Vec<u64> = (0..256).map(|i| i * 1_000_000).collect();
@@ -313,7 +313,7 @@ fn benchmark_dict_u64_u32(c: &mut Criterion) {
             &dict_array,
             |b, dict_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let values: Vec<u64> = (0..65536).map(|i| i * 1000).collect();
diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs
index fff66d204f2..90c89063649 100644
--- a/vortex-cuda/benches/for_cuda.rs
+++ b/vortex-cuda/benches/for_cuda.rs
@@ -173,7 +173,7 @@ fn benchmark_for_u8(c: &mut Criterion) {
             &for_array,
             |b, for_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let encoded = for_array.encoded();
@@ -224,7 +224,7 @@ fn benchmark_for_u16(c: &mut Criterion) {
             &for_array,
             |b, for_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let encoded = for_array.encoded();
@@ -275,7 +275,7 @@ fn benchmark_for_u32(c: &mut Criterion) {
             &for_array,
             |b, for_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let encoded = for_array.encoded();
@@ -326,7 +326,7 @@ fn benchmark_for_u64(c: &mut Criterion) {
             &for_array,
             |b, for_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let encoded = for_array.encoded();
diff --git a/vortex-cuda/benches/zstd_cuda.rs b/vortex-cuda/benches/zstd_cuda.rs
index cab84dc9a00..cc21f2be862 100644
--- a/vortex-cuda/benches/zstd_cuda.rs
+++ b/vortex-cuda/benches/zstd_cuda.rs
@@ -123,7 +123,7 @@ fn benchmark_zstd_cuda_decompress(c: &mut Criterion) {
             &zstd_array,
             |b, zstd_array| {
                 b.iter_custom(|iters| {
-                    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
                         .vortex_expect("failed to create execution context");
 
                     let mut total_time = Duration::ZERO;
diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs
index 77595a1e5f9..f975f6795b6 100644
--- a/vortex-cuda/build.rs
+++ b/vortex-cuda/build.rs
@@ -17,10 +17,6 @@ fn main() {
         return;
     }
 
-    // Include runtime library path for nvCOMP dylib.
-    let nvcomp_lib = env::var("DEP_NVCOMP_LIB_DIR").expect("Library path for nvCOMP not found");
-    println!("cargo:rustc-link-arg=-Wl,-rpath,{nvcomp_lib}");
-
     let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
     let kernels_dir = Path::new(&manifest_dir).join("kernels");
 
diff --git a/vortex-cuda/nvcomp/Cargo.toml b/vortex-cuda/nvcomp/Cargo.toml
index c081960b476..59d6f8656cc 100644
--- a/vortex-cuda/nvcomp/Cargo.toml
+++ b/vortex-cuda/nvcomp/Cargo.toml
@@ -12,12 +12,12 @@ readme = { workspace = true }
 repository = { workspace = true }
 rust-version = { workspace = true }
 version = { workspace = true }
-links = "nvcomp"
 
 [lints]
 workspace = true
 
 [dependencies]
+libloading = { workspace = true }
 
 [build-dependencies]
 bindgen = { workspace = true }
diff --git a/vortex-cuda/nvcomp/build.rs b/vortex-cuda/nvcomp/build.rs
index 2d33a83f9f0..8ada54e46ad 100644
--- a/vortex-cuda/nvcomp/build.rs
+++ b/vortex-cuda/nvcomp/build.rs
@@ -5,6 +5,8 @@
 //!
 //! Bindings are generated unconditionally. This allows for development against the
 //! CUDA APIs in environments that don't support CUDA.
+//!
+//! The library is loaded at runtime via libloading.
 
 #![expect(clippy::unwrap_used)]
 #![expect(clippy::expect_used)]
@@ -35,9 +37,8 @@ fn main() {
     println!("cargo::rustc-check-cfg=cfg(cuda_available)");
     println!("cargo:rerun-if-env-changed=CUDA_PATH");
 
-    let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
     let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
-    let nvcomp_dir = manifest_dir.join("sdk");
+    let nvcomp_dir = out_dir.join("nvcomp-sdk");
 
     // Create CUDA stub header in OUT_DIR for bindgen
     let cuda_stub_dir = out_dir.join("cuda-stub");
@@ -57,20 +58,8 @@ fn main() {
     );
 
     let include_dir = nvcomp_dir.join("include");
-    let lib_dir = nvcomp_dir.join("lib");
-
-    println!(
-        "cargo:rerun-if-changed={}",
-        include_dir.join("nvcomp.h").display()
-    );
-    println!(
-        "cargo:rerun-if-changed={}",
-        include_dir.join("nvcomp/zstd.h").display()
-    );
 
     if !include_dir.exists() {
-        println!("cargo:warning=Downloading nvCOMP SDK from {}", url);
-
         let response = reqwest::blocking::get(&url)
             .unwrap_or_else(|e| panic!("Failed to download nvCOMP: {e}"));
 
@@ -81,10 +70,6 @@ fn main() {
         );
 
         let bytes = response.bytes().unwrap();
-        println!(
-            "cargo:warning=Downloaded {} bytes, extracting...",
-            bytes.len()
-        );
 
         // Extract tar.xz archive.
         let cursor = Cursor::new(bytes.as_ref());
@@ -102,22 +87,26 @@ fn main() {
         }
         fs::rename(&extracted, &nvcomp_dir).unwrap();
         fs::remove_dir_all(&temp_dir).ok();
-
-        println!(
-            "cargo:warning=nvCOMP SDK extracted to {}",
-            nvcomp_dir.display()
-        );
     }
 
+    // Functions are loaded at runtime via libloading to avoid link-time symbol resolution.
     let bindings = bindgen::Builder::default()
         .header(include_dir.join("nvcomp.h").to_string_lossy())
         .header(include_dir.join("nvcomp/zstd.h").to_string_lossy())
         .clang_arg(format!("-I{}", include_dir.display()))
         .clang_arg(format!("-I{}", cuda_stub_dir.display()))
-        .allowlist_function("nvcompBatchedZstd.*")
+        .allowlist_type("nvcompStatus_t")
+        .allowlist_type("nvcompBatchedZstdDecompressOpts_t")
+        .allowlist_type("nvcompDecompressBackend_t")
+        .allowlist_function("nvcompBatchedZstdDecompressGetTempSizeAsync")
+        .allowlist_function("nvcompBatchedZstdDecompressAsync")
+        .dynamic_library_name("NvcompLibrary")
+        .dynamic_link_require_all(true)
+        .wrap_unsafe_ops(true)
         .blocklist_type("CUstream_st")
         .blocklist_type("cudaStream_t")
-        .raw_line("// FFI bindings to nvCOMP (generated by bindgen).")
+        .raw_line("// FFI type definitions for nvCOMP (generated by bindgen).")
+        .raw_line("// Functions are loaded at runtime via libloading.")
         .raw_line("")
         .raw_line("pub type cudaStream_t = *mut std::ffi::c_void;")
         .generate()
@@ -126,21 +115,8 @@ fn main() {
     bindings.write_to_file(out_dir.join("sys.rs")).unwrap();
 
     // Set cuda_available cfg if CUDA is detected on the system.
-    // Gates tests and benchmarks that require CUDA at runtime.
+    // This gates tests and benchmarks that require CUDA at runtime.
     if cuda_available() {
-        // Link against nvcomp dynamically.
-        println!("cargo:rustc-link-search=native={}", lib_dir.display());
-        println!("cargo:rustc-link-lib=dylib=nvcomp");
-        println!("cargo:rustc-link-arg=-Wl,-rpath,{}", lib_dir.display());
-
-        // Export the library path for downstream crates via the `links` manifest key.
-        // Downstream crates can access this via `env::var("DEP_NVCOMP_LIB_DIR")` in their
-        // build.rs and add their own rpath:
-        //
-        // if let Ok(nvcomp_lib) = env::var("DEP_NVCOMP_LIB_DIR") {
-        //     println!("cargo:rustc-link-arg=-Wl,-rpath,{nvcomp_lib}");
-        // }
-        println!("cargo:lib_dir={}", lib_dir.display());
         println!("cargo:rustc-cfg=cuda_available");
     }
 }
diff --git a/vortex-cuda/nvcomp/src/error.rs b/vortex-cuda/nvcomp/src/error.rs
index 3c3aeb62d0f..4ba56c40c7c 100644
--- a/vortex-cuda/nvcomp/src/error.rs
+++ b/vortex-cuda/nvcomp/src/error.rs
@@ -6,8 +6,10 @@
 use crate::sys;
 
 /// Error type for nvcomp operations.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum NvcompError {
+    /// Failed to load the nvcomp library at runtime.
+    LibraryLoadError(String),
     /// Invalid value provided.
     InvalidValue,
     /// Operation not supported.
@@ -43,6 +45,7 @@ pub enum NvcompError {
 impl std::fmt::Display for NvcompError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
+            Self::LibraryLoadError(msg) => write!(f, "nvcomp: failed to load library: {}", msg),
             Self::InvalidValue => write!(f, "nvcomp: invalid value"),
             Self::NotSupported => write!(f, "nvcomp: operation not supported"),
             Self::CannotDecompress => write!(f, "nvcomp: cannot decompress"),
diff --git a/vortex-cuda/nvcomp/src/lib.rs b/vortex-cuda/nvcomp/src/lib.rs
index b2444dc5e93..38e5acfb724 100644
--- a/vortex-cuda/nvcomp/src/lib.rs
+++ b/vortex-cuda/nvcomp/src/lib.rs
@@ -3,19 +3,25 @@
 
 //! Rust bindings to NVIDIA nvCOMP compression library.
 //!
-//! This crate provides raw FFI bindings to nvCOMP, generated via bindgen
-//! from the nvCOMP C headers. The nvCOMP SDK is automatically downloaded
-//! at build time.
+//! This crate provides bindings to nvCOMP, with the library loaded at runtime
+//! via `libloading`. This allows the crate to compile on systems without CUDA
+//! or nvcomp installed - the library is only required at runtime when the
+//! functions are actually called.
 //!
 //! # Platform Support
 //!
 //! nvCOMP is only available on Linux x86_64 and ARM64. On other platforms,
-//! this crate still builds against the CUDA APIs but can't be run.
+//! this crate still compiles but will fail at runtime when trying to load
+//! the library.
 //!
 //! # Runtime Requirements
 //!
-//! The nvcomp library is linked dynamically.
+//! `libnvcomp.so` is loaded from the SDK under the build-time `OUT_DIR`; it must exist at runtime.
 
+use std::path::PathBuf;
+use std::sync::OnceLock;
+
+/// Raw FFI type definitions and dynamically-loaded function pointers from bindgen.
 #[allow(
     non_upper_case_globals,
     non_camel_case_types,
@@ -30,6 +36,55 @@ pub mod zstd;
 
 pub use error::NvcompError;
 
+/// The loaded nvcomp library instance.
+///
+/// Filled in by the first call to [`nvcomp_library`]. A failed load is cached as well, so
+/// loading is attempted at most once per process.
+static NVCOMP_LIB: OnceLock<Result<sys::NvcompLibrary, String>> = OnceLock::new();
+
+/// Loads the nvcomp shared library.
+///
+/// The copy in the SDK directory under the build-time `OUT_DIR` is tried first. `OUT_DIR` is
+/// baked in at compile time, so that path is gone once the binary runs on another machine or
+/// the build directory has been cleaned. If loading from it fails, the bare library name is
+/// handed to the dynamic loader so that its regular search path is consulted.
+///
+/// Returns a message describing both attempts when neither succeeds.
+fn load_nvcomp() -> Result<sys::NvcompLibrary, String> {
+    let lib_name = "libnvcomp.so";
+    let sdk_lib_path = PathBuf::from(env!("OUT_DIR"))
+        .join("nvcomp-sdk")
+        .join("lib")
+        .join(lib_name);
+
+    // SAFETY: Loading a shared library runs its initializers. The library at the SDK path
+    // is the nvcomp shared library downloaded during the build process.
+    let sdk_err = match unsafe { sys::NvcompLibrary::new(&sdk_lib_path) } {
+        Ok(lib) => return Ok(lib),
+        Err(e) => e,
+    };
+
+    // SAFETY: As above; a library the loader resolves under this name is trusted to be an
+    // nvcomp build matching the generated bindings.
+    unsafe { sys::NvcompLibrary::new(lib_name) }.map_err(|fallback_err| {
+        format!(
+            "Failed to load nvcomp library: {sdk_err} (tried {}); {fallback_err} (tried {lib_name})",
+            sdk_lib_path.display()
+        )
+    })
+}
+
+/// Gets a reference to the loaded nvcomp library.
+///
+/// The library is loaded lazily on first access and the outcome is shared by all later
+/// calls. Returns [`NvcompError::LibraryLoadError`] if the library cannot be found or loaded.
+pub fn nvcomp_library() -> Result<&'static sys::NvcompLibrary, NvcompError> {
+    NVCOMP_LIB
+        .get_or_init(load_nvcomp)
+        .as_ref()
+        .map_err(|e| NvcompError::LibraryLoadError(e.clone()))
+}
+
 #[cfg(test)]
 #[cfg(cuda_available)]
 mod tests {
diff --git a/vortex-cuda/nvcomp/src/zstd.rs b/vortex-cuda/nvcomp/src/zstd.rs
index be1871626df..7a326d3fb25 100644
--- a/vortex-cuda/nvcomp/src/zstd.rs
+++ b/vortex-cuda/nvcomp/src/zstd.rs
@@ -7,6 +7,7 @@ use std::ffi::c_void;
 
 use crate::error::NvcompError;
 use crate::error::check_status;
+use crate::nvcomp_library;
 use crate::sys;
 
 /// Backend selection for nvcomp decompression.
@@ -88,10 +89,12 @@ pub fn get_decompress_temp_size_with_opts(
     max_total_uncompressed_bytes: usize,
     opts: ZstdDecompressOpts,
 ) -> Result<usize, NvcompError> {
+    let library = nvcomp_library()?;
+
     let mut temp_bytes: usize = 0;
 
     let status = unsafe {
-        sys::nvcompBatchedZstdDecompressGetTempSizeAsync(
+        library.nvcompBatchedZstdDecompressGetTempSizeAsync(
             num_chunks,
             max_uncompressed_chunk_bytes,
             opts.to_nvcomp(),
@@ -181,8 +184,10 @@ pub unsafe fn decompress_async_with_opts(
     stream: sys::cudaStream_t,
     opts: ZstdDecompressOpts,
 ) -> Result<(), NvcompError> {
+    let library = nvcomp_library()?;
+
     let status = unsafe {
-        sys::nvcompBatchedZstdDecompressAsync(
+        library.nvcompBatchedZstdDecompressAsync(
             device_compressed_ptrs,
             device_compressed_bytes,
             device_uncompressed_bytes,
diff --git a/vortex-cuda/src/canonical.rs b/vortex-cuda/src/canonical.rs
index a161123c03f..7306c80aec1 100644
--- a/vortex-cuda/src/canonical.rs
+++ b/vortex-cuda/src/canonical.rs
@@ -15,14 +15,14 @@ use vortex_error::VortexResult;
-/// Move all canonical data from to_host from device.
+/// Move all canonical data from device to host.
 #[async_trait]
 pub trait CanonicalCudaExt {
-    async fn into_host(self) -> VortexResult<Self>
+    async fn to_host(self) -> VortexResult<Self>
     where
         Self: Sized;
 }
 
 #[async_trait]
 impl CanonicalCudaExt for Canonical {
-    async fn into_host(self) -> VortexResult<Self> {
+    async fn to_host(self) -> VortexResult<Self> {
         match self {
             n @ Canonical::Null(_) => Ok(n),
             Canonical::Bool(bool) => {
@@ -61,7 +61,7 @@ impl CanonicalCudaExt for Canonical {
                     )
                 }))
             }
-            _ => todo!("support other types once they have `BufferHandle`s"),
+            _ => todo!(),
         }
     }
 }
diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs
index 07a2b52b262..a1f4766bbdf 100644
--- a/vortex-cuda/src/kernel/arrays/dict.rs
+++ b/vortex-cuda/src/kernel/arrays/dict.rs
@@ -284,7 +284,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_u32_values_u8_codes() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary values: [100, 200, 300, 400]
@@ -316,7 +316,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_u64_values_u16_codes() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary values: large u64 values
@@ -351,7 +351,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_i32_values_u32_codes() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary values: signed integers including negatives
@@ -383,7 +383,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_large_array() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary with 256 values
@@ -415,7 +415,7 @@ mod tests {
 
     #[test]
     fn test_cuda_dict_values_with_validity() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary values with nulls: [100, null, 300, 400]
@@ -451,7 +451,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_codes_with_validity() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary values: [100, 200, 300, 400]
@@ -489,7 +489,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_both_with_validity() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary values with nulls: [100, null, 300, 400]
@@ -534,7 +534,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_i64_values_with_validity() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Dictionary values with nulls (i64)
@@ -580,7 +580,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_all_valid_matches_baseline() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Non-nullable values
@@ -624,7 +624,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_decimal_i8_values() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Precision 2 uses i8 backing type
@@ -653,7 +653,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_decimal_i16_values() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Precision 4 uses i16 backing type
@@ -682,7 +682,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_decimal_i32_values() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Precision 9 uses i32 backing type
@@ -711,7 +711,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_decimal_i64_values() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Precision 18 uses i64 backing type
@@ -743,7 +743,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_decimal_i128_values() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Precision 38 uses i128 backing type
@@ -780,7 +780,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_dict_decimal_i256_values() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Precision 76 uses i256 backing type
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
index ddb4717649f..e04cd56f1b2 100644
--- a/vortex-cuda/src/kernel/encodings/alp.rs
+++ b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -132,7 +132,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_alp_decompression_f32() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // Create encoded values (what ALP would produce)
@@ -154,7 +154,7 @@ mod tests {
             .execute(alp_array.to_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
-            .into_host()
+            .to_host()
             .await?
             .into_array();
 
diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs
index 1c66cd503cf..7760815271e 100644
--- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs
+++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs
@@ -77,7 +77,7 @@ mod tests {
         #[case] precision: u8,
         #[case] scale: i8,
     ) {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("create execution context");
 
         let decimal_dtype = DecimalDType::new(precision, scale);
diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs
index 4f16031b3db..c33ad6c62ac 100644
--- a/vortex-cuda/src/kernel/encodings/for_.rs
+++ b/vortex-cuda/src/kernel/encodings/for_.rs
@@ -135,7 +135,7 @@ mod tests {
     #[case::u64(make_for_array((0..5000).map(|i| (i % 5000) as u64).collect(), 1000000u64))]
     #[tokio::test]
     async fn test_cuda_for_decompression(#[case] for_array: FoRArray) -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         let cpu_result = for_array.to_canonical()?;
@@ -144,7 +144,7 @@ mod tests {
             .execute(for_array.to_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
-            .into_host()
+            .to_host()
             .await?
             .into_array();
 
diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs
index 5b4c7cb6948..52d92472cc1 100644
--- a/vortex-cuda/src/kernel/encodings/zigzag.rs
+++ b/vortex-cuda/src/kernel/encodings/zigzag.rs
@@ -123,7 +123,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_zigzag_decompression_u32() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         // ZigZag encoding: 0->0, 1->-1, 2->1, 3->-2, 4->2, ...
@@ -140,7 +140,7 @@ mod tests {
             .execute(zigzag_array.to_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
-            .into_host()
+            .to_host()
             .await?
             .into_array();
 
diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs
index 1681e77a3aa..508990897d8 100644
--- a/vortex-cuda/src/kernel/encodings/zstd.rs
+++ b/vortex-cuda/src/kernel/encodings/zstd.rs
@@ -316,7 +316,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_zstd_decompression_utf8() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         let strings = VarBinViewArray::from_iter_str([
@@ -341,7 +341,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_zstd_decompression_multiple_frames() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         let strings = VarBinViewArray::from_iter_str([
@@ -376,7 +376,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_cuda_zstd_decompression_sliced() -> VortexResult<()> {
-        let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
         let strings = VarBinViewArray::from_iter_str([
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index 82625a71cdd..da448e9b065 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -3,6 +3,8 @@
 
 //! CUDA support for Vortex arrays.
 
+use std::process::Command;
+
 mod canonical;
 mod device_buffer;
 pub mod executor;
@@ -25,6 +27,7 @@ pub use kernel::ZstdKernelPrep;
 pub use kernel::launch_cuda_kernel_impl;
 pub use kernel::zstd_kernel_prepare;
 pub use session::CudaSession;
+pub use session::CudaSessionExt;
 use vortex_alp::ALPVTable;
 use vortex_array::arrays::DictVTable;
 use vortex_decimal_byte_parts::DecimalBytePartsVTable;
@@ -33,6 +36,24 @@ pub use vortex_nvcomp as nvcomp;
 use vortex_zigzag::ZigZagVTable;
 use vortex_zstd::ZstdVTable;
 
+/// Checks if CUDA is available on the system by looking for nvcc.
+///
+/// Returns `true` when `nvcc --version` can be spawned and exits successfully. This only
+/// establishes that `nvcc` is runnable; no driver or device is probed.
+///
+/// The check spawns a process, so it runs once and the result is cached for the lifetime
+/// of the current process.
+pub fn cuda_available() -> bool {
+    static NVCC_FOUND: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
+
+    *NVCC_FOUND.get_or_init(|| {
+        Command::new("nvcc")
+            .arg("--version")
+            .output()
+            .is_ok_and(|o| o.status.success())
+    })
+}
+
 /// Registers CUDA kernels.
 pub fn initialize_cuda(session: &CudaSession) {
     tracing::info!("Registering CUDA kernels");
diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs
index 16fc917e327..c83128def3e 100644
--- a/vortex-cuda/src/session.rs
+++ b/vortex-cuda/src/session.rs
@@ -14,7 +14,7 @@ use vortex_session::SessionExt;
 use vortex_utils::aliases::dash_map::DashMap;
 
 use crate::executor::CudaExecute;
-use crate::executor::CudaExecutionCtx;
+pub use crate::executor::CudaExecutionCtx;
 use crate::kernel::KernelLoader;
 
 /// CUDA session for GPU accelerated execution.
@@ -40,7 +40,7 @@ impl CudaSession {
 
     /// Creates a new CUDA execution context.
     pub fn create_execution_ctx(
-        vortex_session: vortex_session::VortexSession,
+        vortex_session: &vortex_session::VortexSession,
     ) -> VortexResult<CudaExecutionCtx> {
         let stream = vortex_session
             .cuda_session()
diff --git a/vortex/Cargo.toml b/vortex/Cargo.toml
index 1a61feefed6..45ffcedb889 100644
--- a/vortex/Cargo.toml
+++ b/vortex/Cargo.toml
@@ -26,6 +26,7 @@ vortex-btrblocks = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-bytebool = { workspace = true }
 vortex-compute = { workspace = true }
+
 vortex-datetime-parts = { workspace = true }
 vortex-decimal-byte-parts = { workspace = true }
 vortex-dtype = { workspace = true, default-features = true }
@@ -51,6 +52,9 @@ vortex-utils = { workspace = true }
 vortex-zigzag = { workspace = true }
 vortex-zstd = { workspace = true, optional = true }
 
+[target.'cfg(target_os = "linux")'.dependencies]
+vortex-cuda = { workspace = true, optional = true }
+
 [dev-dependencies]
 anyhow = { workspace = true }
 arrow-array = { workspace = true }
diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs
index 1f0c99fe284..fa2083883cf 100644
--- a/vortex/src/lib.rs
+++ b/vortex/src/lib.rs
@@ -159,6 +159,16 @@ impl VortexSessionDefault for VortexSession {
             .with::<ExprSession>()
             .with::<RuntimeSession>();
 
+        #[cfg(all(feature = "vortex-cuda", target_os = "linux"))]
+        // Even if the CUDA feature is enabled we need to check at
+        // runtime whether CUDA is available in the current environment.
+        if vortex_cuda::cuda_available() {
+            session = session.with::<CudaSession>();
+            use vortex_cuda::CudaSession;
+            use vortex_cuda::CudaSessionExt;
+            vortex_cuda::initialize_cuda(&session.cuda_session());
+        }
+
         #[cfg(feature = "files")]
         file::register_default_encodings(&mut session);