
Commit 7662215

Exercised the older CUDA and MIG A100 use cases more thoroughly; added PyTorch installation functionality.
1 parent 5a37d94 commit 7662215

File tree

3 files changed: 131 additions & 43 deletions


templates/gpu/install_functions

Lines changed: 78 additions & 34 deletions
@@ -1,14 +1,15 @@
 function set_cudnn_version() {
-  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
   readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"

   # Parameters for NVIDIA-provided cuDNN library
   DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
   readonly DEFAULT_CUDNN_VERSION
   CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-  if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
+  if ( is_rocky && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
+    CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
   elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
     # cuDNN v8 is not distribution for ubuntu20+, debian12
     CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
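
The version gate above relies on version_le / version_ge / version_gt helpers defined elsewhere in these templates. A minimal sketch of the semantics assumed here, using sort -V (function bodies are illustrative, not the repository's implementation):

    # Sketch only: dotted-version comparisons via GNU sort -V.
    function version_ge() { [[ "$(printf '%s\n' "$1" "$2" | sort -V | tail -n1)" == "$1" ]] ; }
    function version_le() { [[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" == "$1" ]] ; }
    function version_gt() { version_ge "$1" "$2" && [[ "$1" != "$2" ]] ; }
    # Example: version_le "8.0.5.39" "8.3.1.22" returns success (0).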
@@ -303,30 +304,6 @@ function install_nvidia_nccl() {

   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"

-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi: SM_20, compute_30
-  # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta: SM_70,SM_72, compute_70,compute_72
-  # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada: SM_89, compute_89
-  # Hopper: SM_90,SM_90a compute_90,compute_90a
-  # Blackwell: SM_100, compute_100
-  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
-  if version_gt "${CUDA_VERSION}" "11.6" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
-
   mkdir -p "${workdir}"
   pushd "${workdir}"

@@ -347,6 +324,30 @@ function install_nvidia_nccl() {
   local local_tarball="${workdir}/${build_tarball}"
   local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"

+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi: SM_20, compute_30
+  # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are suppored by open kernel driver
+  # Volta: SM_70,SM_72, compute_70,compute_72
+  # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada: SM_89, compute_89
+  # Hopper: SM_90,SM_90a compute_90,compute_90a
+  # Blackwell: SM_100, compute_100
+  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+  if version_gt "${CUDA_VERSION}" "11.6" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+
   output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
   if echo "${output}" | grep -q "${gcs_tarball}" ; then
     # cache hit - unpack from cache
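
The NVCC_GENCODE string assembled above is exported before the NCCL make invocation in the next hunk. For reference, the NCCL README documents the same knob for a standalone build; a hedged single-architecture example (target name from the README, architecture choice illustrative):

    # Build NCCL for Ampere (compute capability 8.0) only, limiting compile time.
    export NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"
    make -j"$(nproc)" src.build NVCC_GENCODE="${NVCC_GENCODE}"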
@@ -369,11 +370,12 @@ function install_nvidia_nccl() {
       export NVCC_GENCODE
       execute_with_retries make -j$(nproc) pkg.redhat.build
     fi
-    tar czvf "/${local_tarball}" "../${build_path}"
-    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-    rm "${local_tarball}"
+    tar czvf "${local_tarball}" "../${build_path}"
     make clean
     popd
+    tar xzvf "${local_tarball}"
+    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    rm "${local_tarball}"
   fi
   gcloud storage cat "${gcs_tarball}" | tar xz
 }
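
The tar / gcloud storage sequence above implements a simple GCS build cache: look for a prebuilt tarball, unpack it on a hit, otherwise build once, install from the local tarball, and publish the artifact for the next node. A condensed sketch of the same pattern (bucket path and make target are placeholders):

    gcs_tarball="gs://example-pkg-bucket/rocky9/nccl_build.tar.gz"   # placeholder path
    if gsutil ls "${gcs_tarball}" > /dev/null 2>&1 ; then
      # cache hit: stream the prebuilt artifact from GCS and unpack it
      gcloud storage cat "${gcs_tarball}" | tar xz
    else
      # cache miss: build, unpack the result locally, then publish it to the cache
      make -j"$(nproc)" pkg.debian.build                             # placeholder target
      tar czf /tmp/nccl_build.tar.gz build/pkg
      tar xzf /tmp/nccl_build.tar.gz
      gcloud storage cp /tmp/nccl_build.tar.gz "${gcs_tarball}"
    fi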
@@ -415,16 +417,16 @@ function install_nvidia_cudnn() {
     apt-get -y install nvidia-cudnn
   else
     if is_cudnn8 ; then
-      install_local_cudnn8_repo
+      add_repo_cuda

       apt-get update -qq
+      # Ignore version requested and use the latest version in the package index
+      cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"

       execute_with_retries \
         apt-get -y install --no-install-recommends \
           "libcudnn8=${cudnn_pkg_version}" \
           "libcudnn8-dev=${cudnn_pkg_version}"
-
-      uninstall_local_cudnn8_repo
       sync
     elif is_cudnn9 ; then
       install_cuda_keyring_pkg
@@ -452,6 +454,48 @@ function install_nvidia_cudnn() {
   mark_complete cudnn
 }

+function install_pytorch() {
+  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  local env
+  env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
+  local mc3=/opt/conda/miniconda3
+  local envpath="${mc3}/envs/${env}"
+  # Set numa node to 0 for all GPUs
+  for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
+  local verb=create
+  if test -d "${envpath}" ; then verb=install ; fi
+
+  readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
+  case "${INCLUDE_PYTORCH^^}" in
+    "1" | "YES" | "TRUE" )
+      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+      local local_tarball="${workdir}/${build_tarball}"
+      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+      if echo "${output}" | grep -q "${gcs_tarball}" ; then
+        # cache hit - unpack from cache
+        echo "cache hit"
+        mkdir -p "${envpath}"
+        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+      else
+        cudart_spec="cuda-cudart"
+        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+          -c conda-forge -c nvidia -c rapidsai \
+          numba pytorch tensorflow[and-cuda] rapids pyspark \
+          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+        pushd "${envpath}"
+        tar czf "${local_tarball}" .
+        popd
+        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      fi
+      ;;
+    * ) echo "skip pytorch install" ;;
+  esac
+  touch "${workdir}/complete/pytorch"
+}
+
 function add_nonfree_components() {
   if is_src_nvidia ; then return; fi
   if ge_debian12 ; then
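
install_pytorch is gated on the include-pytorch metadata attribute (default 'no') and honors the optional gpu-conda-env attribute. A hypothetical way to exercise it, assuming the rendered template is published as an initialization action (bucket, cluster name, and region are placeholders, not part of this commit):

    gcloud dataproc clusters create example-gpu-cluster \
      --region=us-central1 \
      --worker-accelerator=type=nvidia-tesla-a100,count=1 \
      --initialization-actions=gs://example-bucket/install_gpu_driver.sh \
      --metadata=include-pytorch=yes,gpu-conda-env=dpgce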

templates/gpu/mig_functions

Lines changed: 49 additions & 6 deletions
@@ -65,38 +65,81 @@ function configure_mig_cgi() {
 function enable_mig() {
   is_complete enable-mig && return

-  # Start persistenced if it's not already running
-  # if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
+  # All devices on the same numa node
   for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
     # Write an ascii zero to the numa node indicator
     echo "0" | dd of="${f}" status=none
   done
+
+  echo "Stopping services and kernel modules in preparation for enabling mig."
+  if ( ps auwx | grep -i nvidia\\-persistenced ) ; then killall -9 nvidia-persistenced ; fi
+
   # nvidia-smi --query-compute-apps=pid --format=csv,noheader
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
       systemctl stop "hadoop-yarn-${svc}.service"
     fi
   done
+  # can lsof be used to determine what processes have a file with name =~ /nvidia/ under the /dev/ directory ?
+  # if so, stop the service which launches the process with the open filehandle
+
+  MIG_GPU_LIST="`nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n ""`"
+  NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
+
+  # root@cluster-1718310842-m:/tmp# for m in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do sudo rmmod $m ; done
+  # rmmod: ERROR: Module nvidia_drm is not currently loaded
+  # rmmod: ERROR: Module nvidia_modeset is not currently loaded
+  # rmmod: ERROR: Module nvidia_uvm is not currently loaded
+  # rmmod: ERROR: Module nvidia is not currently loaded
+  # root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --gpu-reset
+  # Resetting GPU 00000000:00:04.0 is not supported.
+  # root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --multi-instance-gpu=1
+  # Warning: MIG mode is in pending enable state for GPU 00000000:00:04.0:Not Supported
+  # Reboot the system or try nvidia-smi --gpu-reset to make MIG mode effective on GPU 00000000:00:04.0
+  # All done.
+  # root@cluster-1718310842-m:/tmp# echo $?
+  # 0
+  # root@cluster-1718310842-m:/tmp# /usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader
+  # Disabled
+
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+
   time nvsmi --gpu-reset || { # 30s
-    echo "unable to reset gpu. Trying to stop services and kernel modules which may have a lock."
     # TODO: find a way to reset the A100 without reboot
+    removed="1"
     for tryno in {1..25} ; do removed="1"
       for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do
         if lsmod | grep -q "${mod}" ; then rmmod $mod > /dev/null 2>&1 || removed="0" ; fi ; done
       if [[ "${removed}" == "1" ]] ; then
         echo "modules removed successfully"
-        nvsmi --gpu-reset
-        break
+        nvsmi --gpu-reset && break
       fi
     done
   }
-  nvsmi -mig 1
+  fi
+
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+    for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
+      if version_le "${CUDA_VERSION}" "11.6" ; then
+        nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
+      else
+        nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+      fi
+    done
+  fi
+  if test -n "$(nvsmi -L)" ; then
+    # cache the result of the gpu query
+    ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+    echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
+    chmod a+r "/var/run/nvidia-gpu-index.txt"
+  fi
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
       systemctl start "hadoop-yarn-${svc}.service"
     fi
   done
   clear_nvsmi_cache
+  # Start persistenced if it's not already running
+  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi

   mark_complete enable-mig
 }
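
The captured transcript in the comments above shows that MIG enablement can be left "in pending enable state" until the GPU is reset or the node rebooted. A hedged post-check that could follow enable_mig (standard nvidia-smi query fields; the reboot decision itself is not part of this commit):

    # Confirm MIG actually took effect before handing the node back to YARN.
    current="$(nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | sort -u)"
    pending="$(nvidia-smi --query-gpu=mig.mode.pending --format=csv,noheader | sort -u)"
    if [[ "${current}" != "Enabled" && "${pending}" == "Enabled" ]] ; then
      echo "MIG enable is pending; a GPU reset or reboot is still required."
    fi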

templates/gpu/util_functions

Lines changed: 4 additions & 3 deletions
@@ -200,10 +200,11 @@ function prepare_gpu_env(){
     readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
   fi

-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  # Set variables from metadata
+  RAPIDS_RUNTIME="$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")"
   INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
-  readonly RAPIDS_RUNTIME INCLUDE_GPUS
+  INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')"
+  readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH

   # determine whether we have nvidia-smi installed and working
   nvsmi
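
prepare_gpu_env reads these knobs through the get_metadata_attribute helper defined elsewhere in the templates. A minimal sketch of the usual GCE metadata-server pattern such a helper follows (only the name comes from this diff; the body is illustrative):

    function get_metadata_attribute() {
      local -r attribute_name="$1"
      local -r default_value="${2:-}"
      # Query the instance's custom metadata; fall back to the default when the key is absent.
      curl -fsS -H "Metadata-Flavor: Google" \
        "http://metadata.google.internal/computeMetadata/v1/instance/attributes/${attribute_name}" \
        2>/dev/null || echo -n "${default_value}"
    }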
