 function set_cudnn_version() {
-  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
   readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"

   # Parameters for NVIDIA-provided cuDNN library
   DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
   readonly DEFAULT_CUDNN_VERSION
   CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-  if is_rocky && ( version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}" ) ; then
-    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
+  if ( is_rocky && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
+    CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
   elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
     # cuDNN v8 is not distributed for ubuntu20+, debian12
     CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
@@ -303,30 +304,6 @@ function install_nvidia_nccl() {

   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"

-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi: SM_20, compute_30
-  # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta: SM_70,SM_72, compute_70,compute_72
-  # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada: SM_89, compute_89
-  # Hopper: SM_90,SM_90a compute_90,compute_90a
-  # Blackwell: SM_100, compute_100
-  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
-  if version_gt "${CUDA_VERSION}" "11.6" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
-
   mkdir -p "${workdir}"
   pushd "${workdir}"

@@ -347,6 +324,30 @@ function install_nvidia_nccl() {
   local local_tarball="${workdir}/${build_tarball}"
   local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"

+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi: SM_20, compute_30
+  # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are supported by the open kernel driver
+  # Volta: SM_70,SM_72, compute_70,compute_72
+  # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada: SM_89, compute_89
+  # Hopper: SM_90,SM_90a compute_90,compute_90a
+  # Blackwell: SM_100, compute_100
+  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+  if version_gt "${CUDA_VERSION}" "11.6" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+
   output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
   if echo "${output}" | grep -q "${gcs_tarball}" ; then
     # cache hit - unpack from cache
@@ -369,11 +370,12 @@ function install_nvidia_nccl() {
       export NVCC_GENCODE
       execute_with_retries make -j$(nproc) pkg.redhat.build
     fi
-    tar czvf "/${local_tarball}" "../${build_path}"
-    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-    rm "${local_tarball}"
+    tar czvf "${local_tarball}" "../${build_path}"
     make clean
     popd
+    tar xzvf "${local_tarball}"
+    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    rm "${local_tarball}"
   fi
   gcloud storage cat "${gcs_tarball}" | tar xz
 }
@@ -415,16 +417,16 @@ function install_nvidia_cudnn() {
     apt-get -y install nvidia-cudnn
   else
     if is_cudnn8 ; then
-      install_local_cudnn8_repo
+      add_repo_cuda

       apt-get update -qq
+      # Ignore the requested version and use the latest version in the package index
+      cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"

       execute_with_retries \
         apt-get -y install --no-install-recommends \
           "libcudnn8=${cudnn_pkg_version}" \
           "libcudnn8-dev=${cudnn_pkg_version}"
-
-      uninstall_local_cudnn8_repo
       sync
     elif is_cudnn9 ; then
       install_cuda_keyring_pkg
@@ -452,6 +454,48 @@ function install_nvidia_cudnn() {
   mark_complete cudnn
 }

+function install_pytorch() {
+  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  local env
+  env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
+  local mc3=/opt/conda/miniconda3
+  local envpath="${mc3}/envs/${env}"
+  # Set numa node to 0 for all GPUs
+  for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
+  local verb=create
+  if test -d "${envpath}" ; then verb=install ; fi
+
+  readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
+  case "${INCLUDE_PYTORCH^^}" in
+    "1" | "YES" | "TRUE" )
+      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+      local local_tarball="${workdir}/${build_tarball}"
+      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+      if echo "${output}" | grep -q "${gcs_tarball}" ; then
+        # cache hit - unpack from cache
+        echo "cache hit"
+        mkdir -p "${envpath}"
+        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+      else
+        cudart_spec="cuda-cudart"
+        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+          -c conda-forge -c nvidia -c rapidsai \
+          numba pytorch tensorflow[and-cuda] rapids pyspark \
+          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+        pushd "${envpath}"
+        tar czf "${local_tarball}" .
+        popd
+        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      fi
+      ;;
+    * ) echo "skip pytorch install" ;;
+  esac
+  touch "${workdir}/complete/pytorch"
+}
+
 function add_nonfree_components() {
   if is_src_nvidia ; then return; fi
   if ge_debian12 ; then