diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0123d919fc1..4c4e4c0f6f0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -219,7 +219,7 @@ jobs: target: wasm32-unknown-unknown env: rustflags: "RUSTFLAGS='-A warnings --cfg getrandom_backend=\"wasm_js\"'" - args: "--target wasm32-unknown-unknown --exclude vortex --exclude vortex-cuda --exclude vortex-datafusion --exclude vortex-duckdb --exclude vortex-tui --exclude vortex-zstd" + args: "--target wasm32-unknown-unknown --exclude vortex --exclude vortex-cuda --exclude vortex-nvcomp --exclude vortex-datafusion --exclude vortex-duckdb --exclude vortex-tui --exclude vortex-zstd" steps: - uses: runs-on/action@v2 with: diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml index 9a07cd281a2..1c48a31c7df 100644 --- a/.github/workflows/fuzz.yml +++ b/.github/workflows/fuzz.yml @@ -21,6 +21,8 @@ jobs: uses: ./.github/workflows/run-fuzzer.yml with: fuzz_target: file_io + family: "m8g.large" + image: "ubuntu24-full-arm64" secrets: R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} @@ -69,6 +71,8 @@ jobs: uses: ./.github/workflows/run-fuzzer.yml with: fuzz_target: array_ops + family: "m8g.large" + image: "ubuntu24-full-arm64" secrets: R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} @@ -103,6 +107,45 @@ jobs: uses: ./.github/workflows/run-fuzzer.yml with: fuzz_target: compress_roundtrip + family: "m8g.large" + image: "ubuntu24-full-arm64" secrets: R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + + # ============================================================================ + # GPU Compress Fuzzer (CUDA) + # ============================================================================ + gpu_compress_fuzz: + name: "GPU Compress Fuzz" + uses: ./.github/workflows/run-fuzzer.yml + with: + fuzz_target: compress_gpu + family: "g4dn" + image: "ubuntu24-gpu-x64" + extra_features: "cuda" + secrets: + R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} + R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + +# report-gpu-compress-fuzz-failures: +# name: "Report GPU Compress Fuzz Failures" +# needs: gpu_compress_fuzz +# if: always() && needs.gpu_compress_fuzz.outputs.crashes_found == 'true' +# permissions: +# issues: write +# contents: read +# id-token: write +# pull-requests: read +# uses: ./.github/workflows/report-fuzz-crash.yml +# with: +# fuzz_target: compress_gpu +# crash_file: ${{ needs.gpu_compress_fuzz.outputs.first_crash_name }} +# artifact_url: ${{ needs.gpu_compress_fuzz.outputs.artifact_url }} +# artifact_name: compress_gpu-crash-artifacts +# logs_artifact_name: compress_gpu-logs +# branch: ${{ github.ref_name }} +# commit: ${{ github.sha }} +# secrets: +# claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} +# gh_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/run-fuzzer.yml b/.github/workflows/run-fuzzer.yml index 302555d7b54..c649b951256 100644 --- a/.github/workflows/run-fuzzer.yml +++ b/.github/workflows/run-fuzzer.yml @@ -12,6 +12,21 @@ on: required: false type: number default: 7200 + family: + description: "Runner family (e.g., m8g.large for CPU, g5+g4dn+g6 for GPU)" + required: false + type: string + default: "m8g.large" + image: + description: "Runner image (e.g., ubuntu24-full-arm64, ubuntu24-gpu-x64)" + required: false + type: string + default: "ubuntu24-full-arm64" + extra_features: + description: "Extra cargo features to enable (e.g., cuda)" + required: false + type: string + default: "" outputs: crashes_found: description: "Whether crashes were found" @@ -34,8 +49,8 @@ jobs: timeout-minutes: 230 # almost 4 hours runs-on: - runs-on=${{ github.run_id }} - - family=m8g.large - - image=ubuntu24-full-arm64 + - family=${{ inputs.family }} + - image=${{ inputs.image }} - disk=large - extras=s3-cache - tag=${{ inputs.fuzz_target }}-fuzz @@ -43,11 +58,6 @@ jobs: crashes_found: ${{ steps.check.outputs.crashes_found }} first_crash_name: ${{ steps.check.outputs.first_crash_name }} artifact_url: ${{ steps.upload_artifacts.outputs.artifact-url }} - env: - AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} - AWS_REGION: "us-east-1" - AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" steps: - uses: runs-on/action@v2 with: @@ -70,6 +80,11 @@ jobs: - name: Restore corpus shell: bash + env: + AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + AWS_REGION: "us-east-1" + AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" run: | CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst" CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}" @@ -99,8 +114,13 @@ jobs: - name: Run fuzzing target id: fuzz run: | + FEATURES_FLAG="" + if [ -n "${{ inputs.extra_features }}" ]; then + FEATURES_FLAG="--features ${{ inputs.extra_features }}" + fi RUSTFLAGS="--cfg vortex_nightly" RUST_BACKTRACE=1 \ cargo +nightly fuzz run --release --debug-assertions \ + $FEATURES_FLAG \ ${{ inputs.fuzz_target }} -- \ -max_total_time=${{ inputs.max_time }} -rss_limit_mb=0 \ 2>&1 | tee fuzz_output.log @@ -149,6 +169,11 @@ jobs: - name: Persist corpus shell: bash + env: + AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + AWS_REGION: "us-east-1" + AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" run: | CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst" CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}" diff --git a/.gitignore b/.gitignore index ffae91d865a..a68417f916f 100644 --- a/.gitignore +++ b/.gitignore @@ -229,6 +229,3 @@ sweep.timestamp # CUDA *.ptx - -# nvCOMP SDK (downloaded at build time) -vortex-cuda/nvcomp/sdk/ diff --git a/Cargo.lock b/Cargo.lock index d36bdffab8f..6f86172083b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2504,6 +2504,7 @@ dependencies = [ "tokio", "url", "vortex-bench", + "vortex-cuda", "vortex-datafusion", ] @@ -3814,6 +3815,7 @@ dependencies = [ "url", "vortex", "vortex-bench", + "vortex-cuda", "vortex-duckdb", ] @@ -10094,6 +10096,7 @@ dependencies = [ "vortex-buffer", "vortex-bytebool", "vortex-compute", + "vortex-cuda", "vortex-datetime-parts", "vortex-decimal-byte-parts", "vortex-dtype", @@ -10643,10 +10646,12 @@ dependencies = [ "itertools 0.14.0", "libfuzzer-sys", "strum 0.27.2", + "tokio", "vortex", "vortex-array", "vortex-btrblocks", "vortex-buffer", + "vortex-cuda", "vortex-dtype", "vortex-error", "vortex-file", @@ -10797,6 +10802,7 @@ name = "vortex-nvcomp" version = "0.1.0" dependencies = [ "bindgen", + "libloading 0.8.9", "reqwest", "tar", "xz2", diff --git a/Cargo.toml b/Cargo.toml index 86582a0c114..835f6aa3ebb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -150,6 +150,7 @@ jiff = "0.2.0" kanal = "0.1.1" lending-iterator = "0.1.7" libfuzzer-sys = "0.4" +libloading = "0.8" log = { version = "0.4.21" } loom = { version = "0.7", features = ["checkpoint"] } memmap2 = "0.9.5" @@ -271,6 +272,7 @@ vortex-zstd = { version = "0.1.0", path = "./encodings/zstd", default-features = # No version constraints for unpublished crates. vortex-bench = { path = "./vortex-bench", default-features = false } +vortex-cuda = { path = "./vortex-cuda", default-features = false } vortex-duckdb = { path = "./vortex-duckdb", default-features = false } [workspace.dependencies.getrandom_v03] diff --git a/benchmarks/datafusion-bench/Cargo.toml b/benchmarks/datafusion-bench/Cargo.toml index 3e5ea7064b3..4133cad2fae 100644 --- a/benchmarks/datafusion-bench/Cargo.toml +++ b/benchmarks/datafusion-bench/Cargo.toml @@ -36,10 +36,14 @@ parking_lot = { workspace = true } tokio = { workspace = true, features = ["full"] } url = { workspace = true } vortex-bench = { workspace = true } +vortex-cuda = { workspace = true, optional = true } vortex-datafusion = { workspace = true } [build-dependencies] get_dir = { workspace = true } +[features] +cuda = ["dep:vortex-cuda"] + [lints] workspace = true diff --git a/benchmarks/duckdb-bench/Cargo.toml b/benchmarks/duckdb-bench/Cargo.toml index 61b03b5c413..2407b938cb3 100644 --- a/benchmarks/duckdb-bench/Cargo.toml +++ b/benchmarks/duckdb-bench/Cargo.toml @@ -23,10 +23,14 @@ tracing = { workspace = true } url = { workspace = true } vortex = { workspace = true } vortex-bench = { workspace = true } +vortex-cuda = { workspace = true, optional = true } vortex-duckdb = { workspace = true } [build-dependencies] get_dir = { workspace = true } +[features] +cuda = ["dep:vortex-cuda"] + [lints] workspace = true diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index a9e2119bdb4..7226f353960 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -22,6 +22,7 @@ default = ["native"] native = ["libfuzzer-sys", "zstd", "vortex-file", "vortex/files"] wasmfuzz = [] zstd = ["vortex/zstd"] +cuda = ["vortex-cuda", "tokio"] [dependencies] # Always needed - arbitrary is used for input generation @@ -48,6 +49,10 @@ vortex-utils = { workspace = true } libfuzzer-sys = { workspace = true, optional = true } vortex-file = { workspace = true, optional = true } +# GPU support dependencies (optional, only for CUDA fuzzing) +tokio = { workspace = true, features = ["rt", "macros"], optional = true } +vortex-cuda = { workspace = true, optional = true } + [lints] workspace = true @@ -82,3 +87,11 @@ name = "compress_roundtrip" path = "fuzz_targets/compress_roundtrip.rs" test = false required-features = ["native"] + +[[bin]] +bench = false +doc = false +name = "compress_gpu" +path = "fuzz_targets/compress_gpu.rs" +test = false +required-features = ["native", "cuda"] diff --git a/fuzz/fuzz_targets/compress_gpu.rs b/fuzz/fuzz_targets/compress_gpu.rs new file mode 100644 index 00000000000..eea7ff6c5e3 --- /dev/null +++ b/fuzz/fuzz_targets/compress_gpu.rs @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![no_main] +#![allow(clippy::unwrap_used, clippy::result_large_err)] + +use libfuzzer_sys::Corpus; +use libfuzzer_sys::fuzz_target; +use vortex_error::vortex_panic; +use vortex_fuzz::FuzzCompressGpu; +use vortex_fuzz::run_compress_gpu; + +fuzz_target!(|fuzz: FuzzCompressGpu| -> Corpus { + // Use tokio runtime to run async GPU fuzzer + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + match rt.block_on(run_compress_gpu(fuzz)) { + Ok(true) => Corpus::Keep, + Ok(false) => Corpus::Reject, + Err(e) => { + vortex_panic!("{e}"); + } + } +}); diff --git a/fuzz/src/gpu/mod.rs b/fuzz/src/gpu/mod.rs new file mode 100644 index 00000000000..0ad80c7a75b --- /dev/null +++ b/fuzz/src/gpu/mod.rs @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! GPU fuzzer module for testing CUDA decompression. +//! +//! This module generates arbitrary instances of GPU-supported compressed encodings, +//! then verifies that GPU decompression produces the same results as CPU decompression. + +use arbitrary::Arbitrary; +use arbitrary::Result; +use arbitrary::Unstructured; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::arrays::ArbitraryDictArray; +use vortex_dtype::Nullability; +use vortex_dtype::PType; + +use crate::error::Backtrace; +use crate::error::VortexFuzzError; +use crate::error::VortexFuzzResult; + +/// Which GPU-supported encoding to generate. +#[derive(Debug, Clone, Copy)] +pub enum GpuEncodingKind { + /// Dictionary encoding with GPU take support. + Dict, +} + +impl<'a> Arbitrary<'a> for GpuEncodingKind { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + // Currently only Dict is supported + match u.int_in_range(0..=0)? { + 0 => Ok(GpuEncodingKind::Dict), + _ => unreachable!(), + } + } +} + +/// Input for the GPU decompression fuzzer. +#[derive(Debug)] +pub struct FuzzCompressGpu { + pub array: ArrayRef, +} + +impl<'a> Arbitrary<'a> for FuzzCompressGpu { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + let kind: GpuEncodingKind = u.arbitrary()?; + + let array = match kind { + GpuEncodingKind::Dict => { + // Dict already has Arbitrary support, use primitive values for GPU compatibility + let dtype = arbitrary_gpu_primitive_dtype(u)?; + ArbitraryDictArray::with_dtype(u, &dtype, None)? + .0 + .into_array() + } + }; + + Ok(FuzzCompressGpu { array }) + } +} + +/// Generate a random primitive DType suitable for GPU operations. +fn arbitrary_gpu_primitive_dtype(u: &mut Unstructured) -> Result { + let nullability: Nullability = u.arbitrary()?; + let ptype = match u.int_in_range(0..=9)? { + 0 => PType::U8, + 1 => PType::U16, + 2 => PType::U32, + 3 => PType::U64, + 4 => PType::I8, + 5 => PType::I16, + 6 => PType::I32, + 7 => PType::I64, + 8 => PType::F32, + 9 => PType::F64, + _ => unreachable!(), + }; + Ok(vortex_dtype::DType::Primitive(ptype, nullability)) +} + +/// Run the GPU decompression fuzzer. +/// +/// This function: +/// 1. Decompresses the array on CPU (reference) +/// 2. Decompresses the array on GPU +/// 3. Copies GPU result back to host using `CanonicalCudaExt::to_host` +/// 4. Compares the results +/// +/// Returns: +/// - `Ok(true)` - test passed, keep in corpus +/// - `Ok(false)` - test skipped (e.g., no CUDA), reject from corpus +/// - `Err(_)` - a bug was found +#[allow(clippy::result_large_err)] +pub async fn run_compress_gpu(fuzz: FuzzCompressGpu) -> VortexFuzzResult { + use vortex::VortexSessionDefault; + use vortex::session::VortexSession; + use vortex_cuda::CanonicalCudaExt; + use vortex_cuda::CudaSession; + use vortex_cuda::executor::CudaArrayExt; + use vortex_error::VortexExpect; + + // Runtime check - skip if CUDA is not available + if !vortex_cuda::cuda_available() { + return Ok(false); + } + + let FuzzCompressGpu { array } = fuzz; + + // Store original properties for error reporting + let original_len = array.len(); + + // 1. CPU decompression (reference) + let cpu_canonical = match array.to_canonical() { + Ok(c) => c, + Err(e) => { + return Err(VortexFuzzError::VortexError(e, Backtrace::capture())); + } + }; + + // 2. Create CUDA execution context + let session = VortexSession::default(); + + let mut cuda_ctx = + CudaSession::create_execution_ctx(&session).vortex_expect("cannot create session"); + + // 3. GPU decompression + let gpu_canonical = match array.clone().execute_cuda(&mut cuda_ctx).await { + Ok(c) => c, + Err(e) => { + return Err(VortexFuzzError::VortexError(e, Backtrace::capture())); + } + }; + + // 4. Copy GPU result back to host using CanonicalCudaExt + let gpu_host_canonical = match gpu_canonical.to_host().await { + Ok(c) => c, + Err(e) => { + return Err(VortexFuzzError::VortexError(e, Backtrace::capture())); + } + }; + + // 5. Compare canonicals + let cpu_array = cpu_canonical.into_array(); + let gpu_array = gpu_host_canonical.into_array(); + + // Verify dtype is preserved + if cpu_array.dtype() != gpu_array.dtype() { + return Err(VortexFuzzError::DTypeMismatch( + cpu_array, + gpu_array, + 0, + Backtrace::capture(), + )); + } + + // Verify length is preserved + if original_len != gpu_array.len() { + return Err(VortexFuzzError::LengthMismatch( + original_len, + gpu_array.len(), + array, + gpu_array, + 0, + Backtrace::capture(), + )); + } + + // Compare element by element + for i in 0..original_len { + let cpu_scalar = cpu_array + .scalar_at(i) + .map_err(|e| VortexFuzzError::VortexError(e, Backtrace::capture()))?; + let gpu_scalar = gpu_array + .scalar_at(i) + .map_err(|e| VortexFuzzError::VortexError(e, Backtrace::capture()))?; + + if cpu_scalar != gpu_scalar { + return Err(VortexFuzzError::ArrayNotEqual( + cpu_scalar, + gpu_scalar, + i, + cpu_array, + gpu_array, + 0, + Backtrace::capture(), + )); + } + } + + Ok(true) +} diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs index dd0b3022642..781f31a1c0b 100644 --- a/fuzz/src/lib.rs +++ b/fuzz/src/lib.rs @@ -10,6 +10,10 @@ pub mod error; // File module only available for native builds (requires vortex-file which uses tokio) #[cfg(not(target_arch = "wasm32"))] pub mod file; + +// GPU fuzzer module (only available when cuda feature is enabled) +#[cfg(feature = "cuda")] +pub mod gpu; pub use array::Action; pub use array::CompressorStrategy; pub use array::ExpectedValue; @@ -20,6 +24,10 @@ pub use compress::FuzzCompressRoundtrip; pub use compress::run_compress_roundtrip; #[cfg(not(target_arch = "wasm32"))] pub use file::FuzzFileAction; +#[cfg(feature = "cuda")] +pub use gpu::FuzzCompressGpu; +#[cfg(feature = "cuda")] +pub use gpu::run_compress_gpu; // Runtime initialization - platform-specific #[cfg(not(target_arch = "wasm32"))] diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index 49b9ce3479c..d22b1acc326 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -123,7 +123,7 @@ fn benchmark_dict_u32_u8(c: &mut Criterion) { &dict_array, |b, dict_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Get values and codes arrays @@ -187,7 +187,7 @@ fn benchmark_dict_u32_u16(c: &mut Criterion) { &dict_array, |b, dict_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let values: Vec = (0..4096).map(|i| i * 100).collect(); @@ -250,7 +250,7 @@ fn benchmark_dict_u64_u8(c: &mut Criterion) { &dict_array, |b, dict_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let values: Vec = (0..256).map(|i| i * 1_000_000).collect(); @@ -313,7 +313,7 @@ fn benchmark_dict_u64_u32(c: &mut Criterion) { &dict_array, |b, dict_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let values: Vec = (0..65536).map(|i| i * 1000).collect(); diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index fff66d204f2..90c89063649 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -173,7 +173,7 @@ fn benchmark_for_u8(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); @@ -224,7 +224,7 @@ fn benchmark_for_u16(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); @@ -275,7 +275,7 @@ fn benchmark_for_u32(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); @@ -326,7 +326,7 @@ fn benchmark_for_u64(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); diff --git a/vortex-cuda/benches/zstd_cuda.rs b/vortex-cuda/benches/zstd_cuda.rs index cab84dc9a00..cc21f2be862 100644 --- a/vortex-cuda/benches/zstd_cuda.rs +++ b/vortex-cuda/benches/zstd_cuda.rs @@ -123,7 +123,7 @@ fn benchmark_zstd_cuda_decompress(c: &mut Criterion) { &zstd_array, |b, zstd_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let mut total_time = Duration::ZERO; diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs index 77595a1e5f9..f975f6795b6 100644 --- a/vortex-cuda/build.rs +++ b/vortex-cuda/build.rs @@ -17,10 +17,6 @@ fn main() { return; } - // Include runtime library path for nvCOMP dylib. - let nvcomp_lib = env::var("DEP_NVCOMP_LIB_DIR").expect("Library path for nvCOMP not found"); - println!("cargo:rustc-link-arg=-Wl,-rpath,{nvcomp_lib}"); - let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir"); let kernels_dir = Path::new(&manifest_dir).join("kernels"); diff --git a/vortex-cuda/nvcomp/Cargo.toml b/vortex-cuda/nvcomp/Cargo.toml index c081960b476..59d6f8656cc 100644 --- a/vortex-cuda/nvcomp/Cargo.toml +++ b/vortex-cuda/nvcomp/Cargo.toml @@ -12,12 +12,12 @@ readme = { workspace = true } repository = { workspace = true } rust-version = { workspace = true } version = { workspace = true } -links = "nvcomp" [lints] workspace = true [dependencies] +libloading = { workspace = true } [build-dependencies] bindgen = { workspace = true } diff --git a/vortex-cuda/nvcomp/build.rs b/vortex-cuda/nvcomp/build.rs index 2d33a83f9f0..8ada54e46ad 100644 --- a/vortex-cuda/nvcomp/build.rs +++ b/vortex-cuda/nvcomp/build.rs @@ -5,6 +5,8 @@ //! //! Bindings are generated unconditionally. This allows for development against the //! CUDA APIs in environments that don't support CUDA. +//! +//! The library is loaded at runtime via libloading. #![expect(clippy::unwrap_used)] #![expect(clippy::expect_used)] @@ -35,9 +37,8 @@ fn main() { println!("cargo::rustc-check-cfg=cfg(cuda_available)"); println!("cargo:rerun-if-env-changed=CUDA_PATH"); - let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); - let nvcomp_dir = manifest_dir.join("sdk"); + let nvcomp_dir = out_dir.join("nvcomp-sdk"); // Create CUDA stub header in OUT_DIR for bindgen let cuda_stub_dir = out_dir.join("cuda-stub"); @@ -57,20 +58,8 @@ fn main() { ); let include_dir = nvcomp_dir.join("include"); - let lib_dir = nvcomp_dir.join("lib"); - - println!( - "cargo:rerun-if-changed={}", - include_dir.join("nvcomp.h").display() - ); - println!( - "cargo:rerun-if-changed={}", - include_dir.join("nvcomp/zstd.h").display() - ); if !include_dir.exists() { - println!("cargo:warning=Downloading nvCOMP SDK from {}", url); - let response = reqwest::blocking::get(&url) .unwrap_or_else(|e| panic!("Failed to download nvCOMP: {e}")); @@ -81,10 +70,6 @@ fn main() { ); let bytes = response.bytes().unwrap(); - println!( - "cargo:warning=Downloaded {} bytes, extracting...", - bytes.len() - ); // Extract tar.xz archive. let cursor = Cursor::new(bytes.as_ref()); @@ -102,22 +87,26 @@ fn main() { } fs::rename(&extracted, &nvcomp_dir).unwrap(); fs::remove_dir_all(&temp_dir).ok(); - - println!( - "cargo:warning=nvCOMP SDK extracted to {}", - nvcomp_dir.display() - ); } + // Functions are loaded at runtime via libloading to avoid link-time symbol resolution. let bindings = bindgen::Builder::default() .header(include_dir.join("nvcomp.h").to_string_lossy()) .header(include_dir.join("nvcomp/zstd.h").to_string_lossy()) .clang_arg(format!("-I{}", include_dir.display())) .clang_arg(format!("-I{}", cuda_stub_dir.display())) - .allowlist_function("nvcompBatchedZstd.*") + .allowlist_type("nvcompStatus_t") + .allowlist_type("nvcompBatchedZstdDecompressOpts_t") + .allowlist_type("nvcompDecompressBackend_t") + .allowlist_function("nvcompBatchedZstdDecompressGetTempSizeAsync") + .allowlist_function("nvcompBatchedZstdDecompressAsync") + .dynamic_library_name("NvcompLibrary") + .dynamic_link_require_all(true) + .wrap_unsafe_ops(true) .blocklist_type("CUstream_st") .blocklist_type("cudaStream_t") - .raw_line("// FFI bindings to nvCOMP (generated by bindgen).") + .raw_line("// FFI type definitions for nvCOMP (generated by bindgen).") + .raw_line("// Functions are loaded at runtime via libloading.") .raw_line("") .raw_line("pub type cudaStream_t = *mut std::ffi::c_void;") .generate() @@ -126,21 +115,8 @@ fn main() { bindings.write_to_file(out_dir.join("sys.rs")).unwrap(); // Set cuda_available cfg if CUDA is detected on the system. - // Gates tests and benchmarks that require CUDA at runtime. + // This gates tests and benchmarks that require CUDA at runtime. if cuda_available() { - // Link against nvcomp dynamically. - println!("cargo:rustc-link-search=native={}", lib_dir.display()); - println!("cargo:rustc-link-lib=dylib=nvcomp"); - println!("cargo:rustc-link-arg=-Wl,-rpath,{}", lib_dir.display()); - - // Export the library path for downstream crates via the `links` manifest key. - // Downstream crates can access this via `env::var("DEP_NVCOMP_LIB_DIR")` in their - // build.rs and add their own rpath: - // - // if let Ok(nvcomp_lib) = env::var("DEP_NVCOMP_LIB_DIR") { - // println!("cargo:rustc-link-arg=-Wl,-rpath,{nvcomp_lib}"); - // } - println!("cargo:lib_dir={}", lib_dir.display()); println!("cargo:rustc-cfg=cuda_available"); } } diff --git a/vortex-cuda/nvcomp/src/error.rs b/vortex-cuda/nvcomp/src/error.rs index 3c3aeb62d0f..4ba56c40c7c 100644 --- a/vortex-cuda/nvcomp/src/error.rs +++ b/vortex-cuda/nvcomp/src/error.rs @@ -6,8 +6,10 @@ use crate::sys; /// Error type for nvcomp operations. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum NvcompError { + /// Failed to load the nvcomp library at runtime. + LibraryLoadError(String), /// Invalid value provided. InvalidValue, /// Operation not supported. @@ -43,6 +45,7 @@ pub enum NvcompError { impl std::fmt::Display for NvcompError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + Self::LibraryLoadError(msg) => write!(f, "nvcomp: failed to load library: {}", msg), Self::InvalidValue => write!(f, "nvcomp: invalid value"), Self::NotSupported => write!(f, "nvcomp: operation not supported"), Self::CannotDecompress => write!(f, "nvcomp: cannot decompress"), diff --git a/vortex-cuda/nvcomp/src/lib.rs b/vortex-cuda/nvcomp/src/lib.rs index b2444dc5e93..38e5acfb724 100644 --- a/vortex-cuda/nvcomp/src/lib.rs +++ b/vortex-cuda/nvcomp/src/lib.rs @@ -3,19 +3,25 @@ //! Rust bindings to NVIDIA nvCOMP compression library. //! -//! This crate provides raw FFI bindings to nvCOMP, generated via bindgen -//! from the nvCOMP C headers. The nvCOMP SDK is automatically downloaded -//! at build time. +//! This crate provides bindings to nvCOMP, with the library loaded at runtime +//! via `libloading`. This allows the crate to compile on systems without CUDA +//! or nvcomp installed - the library is only required at runtime when the +//! functions are actually called. //! //! # Platform Support //! //! nvCOMP is only available on Linux x86_64 and ARM64. On other platforms, -//! this crate still builds against the CUDA APIs but can't be run. +//! this crate still compiles but will fail at runtime when trying to load +//! the library. //! //! # Runtime Requirements //! -//! The nvcomp library is linked dynamically. +//! The nvcomp library must be available at runtime. +use std::path::PathBuf; +use std::sync::OnceLock; + +/// Raw FFI type definitions and dynamically-loaded function pointers from bindgen. #[allow( non_upper_case_globals, non_camel_case_types, @@ -30,6 +36,36 @@ pub mod zstd; pub use error::NvcompError; +/// The loaded nvcomp library instance. +static NVCOMP_LIB: OnceLock> = OnceLock::new(); + +fn load_nvcomp() -> Result { + let lib_name = "libnvcomp.so"; + let build_lib_dir = env!("OUT_DIR"); + let sdk_lib_path = PathBuf::from(build_lib_dir) + .join("nvcomp-sdk") + .join("lib") + .join(lib_name); + + // SAFETY: The library at the SDK path is a valid nvcomp shared library + // downloaded during the build process. + unsafe { + sys::NvcompLibrary::new(&sdk_lib_path) + .map_err(|e| format!("Failed to load nvcomp library: {e}")) + } +} + +/// Gets a reference to the loaded nvcomp library. +/// +/// The library is loaded lazily on first access. Returns an error if the +/// library cannot be found or loaded. +pub fn nvcomp_library() -> Result<&'static sys::NvcompLibrary, NvcompError> { + NVCOMP_LIB + .get_or_init(load_nvcomp) + .as_ref() + .map_err(|e| NvcompError::LibraryLoadError(e.clone())) +} + #[cfg(test)] #[cfg(cuda_available)] mod tests { diff --git a/vortex-cuda/nvcomp/src/zstd.rs b/vortex-cuda/nvcomp/src/zstd.rs index be1871626df..7a326d3fb25 100644 --- a/vortex-cuda/nvcomp/src/zstd.rs +++ b/vortex-cuda/nvcomp/src/zstd.rs @@ -7,6 +7,7 @@ use std::ffi::c_void; use crate::error::NvcompError; use crate::error::check_status; +use crate::nvcomp_library; use crate::sys; /// Backend selection for nvcomp decompression. @@ -88,10 +89,12 @@ pub fn get_decompress_temp_size_with_opts( max_total_uncompressed_bytes: usize, opts: ZstdDecompressOpts, ) -> Result { + let library = nvcomp_library()?; + let mut temp_bytes: usize = 0; let status = unsafe { - sys::nvcompBatchedZstdDecompressGetTempSizeAsync( + library.nvcompBatchedZstdDecompressGetTempSizeAsync( num_chunks, max_uncompressed_chunk_bytes, opts.to_nvcomp(), @@ -181,8 +184,10 @@ pub unsafe fn decompress_async_with_opts( stream: sys::cudaStream_t, opts: ZstdDecompressOpts, ) -> Result<(), NvcompError> { + let library = nvcomp_library()?; + let status = unsafe { - sys::nvcompBatchedZstdDecompressAsync( + library.nvcompBatchedZstdDecompressAsync( device_compressed_ptrs, device_compressed_bytes, device_uncompressed_bytes, diff --git a/vortex-cuda/src/canonical.rs b/vortex-cuda/src/canonical.rs index a161123c03f..7306c80aec1 100644 --- a/vortex-cuda/src/canonical.rs +++ b/vortex-cuda/src/canonical.rs @@ -15,14 +15,14 @@ use vortex_error::VortexResult; /// Move all canonical data from to_host from device. #[async_trait] pub trait CanonicalCudaExt { - async fn into_host(self) -> VortexResult + async fn to_host(self) -> VortexResult where Self: Sized; } #[async_trait] impl CanonicalCudaExt for Canonical { - async fn into_host(self) -> VortexResult { + async fn to_host(self) -> VortexResult { match self { n @ Canonical::Null(_) => Ok(n), Canonical::Bool(bool) => { @@ -61,7 +61,7 @@ impl CanonicalCudaExt for Canonical { ) })) } - _ => todo!("support other types once they have `BufferHandle`s"), + _ => todo!(), } } } diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index 07a2b52b262..a1f4766bbdf 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -284,7 +284,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_u32_values_u8_codes() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary values: [100, 200, 300, 400] @@ -316,7 +316,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_u64_values_u16_codes() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary values: large u64 values @@ -351,7 +351,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_i32_values_u32_codes() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary values: signed integers including negatives @@ -383,7 +383,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_large_array() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary with 256 values @@ -415,7 +415,7 @@ mod tests { #[test] fn test_cuda_dict_values_with_validity() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary values with nulls: [100, null, 300, 400] @@ -451,7 +451,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_codes_with_validity() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary values: [100, 200, 300, 400] @@ -489,7 +489,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_both_with_validity() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary values with nulls: [100, null, 300, 400] @@ -534,7 +534,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_i64_values_with_validity() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Dictionary values with nulls (i64) @@ -580,7 +580,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_all_valid_matches_baseline() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Non-nullable values @@ -624,7 +624,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_decimal_i8_values() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Precision 2 uses i8 backing type @@ -653,7 +653,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_decimal_i16_values() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Precision 4 uses i16 backing type @@ -682,7 +682,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_decimal_i32_values() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Precision 9 uses i32 backing type @@ -711,7 +711,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_decimal_i64_values() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Precision 18 uses i64 backing type @@ -743,7 +743,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_decimal_i128_values() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Precision 38 uses i128 backing type @@ -780,7 +780,7 @@ mod tests { #[tokio::test] async fn test_cuda_dict_decimal_i256_values() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Precision 76 uses i256 backing type diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs index ddb4717649f..e04cd56f1b2 100644 --- a/vortex-cuda/src/kernel/encodings/alp.rs +++ b/vortex-cuda/src/kernel/encodings/alp.rs @@ -132,7 +132,7 @@ mod tests { #[tokio::test] async fn test_cuda_alp_decompression_f32() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // Create encoded values (what ALP would produce) @@ -154,7 +154,7 @@ mod tests { .execute(alp_array.to_array(), &mut cuda_ctx) .await .vortex_expect("GPU decompression failed") - .into_host() + .to_host() .await? .into_array(); diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs index 1c66cd503cf..7760815271e 100644 --- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs +++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs @@ -77,7 +77,7 @@ mod tests { #[case] precision: u8, #[case] scale: i8, ) { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("create execution context"); let decimal_dtype = DecimalDType::new(precision, scale); diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 4f16031b3db..c33ad6c62ac 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -135,7 +135,7 @@ mod tests { #[case::u64(make_for_array((0..5000).map(|i| (i % 5000) as u64).collect(), 1000000u64))] #[tokio::test] async fn test_cuda_for_decompression(#[case] for_array: FoRArray) -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let cpu_result = for_array.to_canonical()?; @@ -144,7 +144,7 @@ mod tests { .execute(for_array.to_array(), &mut cuda_ctx) .await .vortex_expect("GPU decompression failed") - .into_host() + .to_host() .await? .into_array(); diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs index 5b4c7cb6948..52d92472cc1 100644 --- a/vortex-cuda/src/kernel/encodings/zigzag.rs +++ b/vortex-cuda/src/kernel/encodings/zigzag.rs @@ -123,7 +123,7 @@ mod tests { #[tokio::test] async fn test_cuda_zigzag_decompression_u32() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); // ZigZag encoding: 0->0, 1->-1, 2->1, 3->-2, 4->2, ... @@ -140,7 +140,7 @@ mod tests { .execute(zigzag_array.to_array(), &mut cuda_ctx) .await .vortex_expect("GPU decompression failed") - .into_host() + .to_host() .await? .into_array(); diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index 1681e77a3aa..508990897d8 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -316,7 +316,7 @@ mod tests { #[tokio::test] async fn test_cuda_zstd_decompression_utf8() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let strings = VarBinViewArray::from_iter_str([ @@ -341,7 +341,7 @@ mod tests { #[tokio::test] async fn test_cuda_zstd_decompression_multiple_frames() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let strings = VarBinViewArray::from_iter_str([ @@ -376,7 +376,7 @@ mod tests { #[tokio::test] async fn test_cuda_zstd_decompression_sliced() -> VortexResult<()> { - let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context"); let strings = VarBinViewArray::from_iter_str([ diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index 82625a71cdd..da448e9b065 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -3,6 +3,8 @@ //! CUDA support for Vortex arrays. +use std::process::Command; + mod canonical; mod device_buffer; pub mod executor; @@ -25,6 +27,7 @@ pub use kernel::ZstdKernelPrep; pub use kernel::launch_cuda_kernel_impl; pub use kernel::zstd_kernel_prepare; pub use session::CudaSession; +pub use session::CudaSessionExt; use vortex_alp::ALPVTable; use vortex_array::arrays::DictVTable; use vortex_decimal_byte_parts::DecimalBytePartsVTable; @@ -33,6 +36,14 @@ pub use vortex_nvcomp as nvcomp; use vortex_zigzag::ZigZagVTable; use vortex_zstd::ZstdVTable; +/// Checks if CUDA is available on the system by looking for nvcc. +pub fn cuda_available() -> bool { + Command::new("nvcc") + .arg("--version") + .output() + .is_ok_and(|o| o.status.success()) +} + /// Registers CUDA kernels. pub fn initialize_cuda(session: &CudaSession) { tracing::info!("Registering CUDA kernels"); diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 16fc917e327..c83128def3e 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -14,7 +14,7 @@ use vortex_session::SessionExt; use vortex_utils::aliases::dash_map::DashMap; use crate::executor::CudaExecute; -use crate::executor::CudaExecutionCtx; +pub use crate::executor::CudaExecutionCtx; use crate::kernel::KernelLoader; /// CUDA session for GPU accelerated execution. @@ -40,7 +40,7 @@ impl CudaSession { /// Creates a new CUDA execution context. pub fn create_execution_ctx( - vortex_session: vortex_session::VortexSession, + vortex_session: &vortex_session::VortexSession, ) -> VortexResult { let stream = vortex_session .cuda_session() diff --git a/vortex/Cargo.toml b/vortex/Cargo.toml index 1a61feefed6..45ffcedb889 100644 --- a/vortex/Cargo.toml +++ b/vortex/Cargo.toml @@ -26,6 +26,7 @@ vortex-btrblocks = { workspace = true } vortex-buffer = { workspace = true } vortex-bytebool = { workspace = true } vortex-compute = { workspace = true } + vortex-datetime-parts = { workspace = true } vortex-decimal-byte-parts = { workspace = true } vortex-dtype = { workspace = true, default-features = true } @@ -51,6 +52,9 @@ vortex-utils = { workspace = true } vortex-zigzag = { workspace = true } vortex-zstd = { workspace = true, optional = true } +[target.'cfg(target_os = "linux")'.dependencies] +vortex-cuda = { workspace = true, optional = true } + [dev-dependencies] anyhow = { workspace = true } arrow-array = { workspace = true } diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index 1f0c99fe284..fa2083883cf 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -159,6 +159,16 @@ impl VortexSessionDefault for VortexSession { .with::() .with::(); + #[cfg(all(feature = "vortex-cuda", target_os = "linux"))] + // Even if the CUDA feature is enabled we need to check at + // runtime whether CUDA is available in the current environment. + if vortex_cuda::cuda_available() { + session = session.with::(); + use vortex_cuda::CudaSession; + use vortex_cuda::CudaSessionExt; + vortex_cuda::initialize_cuda(&session.cuda_session()); + } + #[cfg(feature = "files")] file::register_default_encodings(&mut session);