Skip to content

Commit 5a37d94

Browse files
committed
tested with much older versions of CUDA on an old dataproc image from pre-2023
1 parent 374ff96 commit 5a37d94

File tree

4 files changed

+114
-51
lines changed

4 files changed

+114
-51
lines changed

templates/common/util_functions

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ function prepare_common_env() {
554554

555555
if is_debuntu ; then
556556
clean_up_sources_lists
557-
apt-get update -qq
557+
apt-get update -qq --allow-releaseinfo-change
558558
apt-get -y clean
559559
apt-get -o DPkg::Lock::Timeout=60 -y autoremove
560560
if ge_debian12 ; then

templates/gpu/install_functions

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ function set_cuda_runfile_url() {
4646
local MAX_DRIVER_VERSION
4747
local MAX_CUDA_VERSION
4848

49-
local MIN_OPEN_DRIVER_VER="515.48.07"
49+
MIN_OPEN_DRIVER_VER="515.43.04"
5050
local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
5151
local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
5252

@@ -84,7 +84,33 @@ function set_cuda_runfile_url() {
8484

8585
# driver version named in cuda runfile filename
8686
# (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
87+
# 10.0.130/410.48 =https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux
88+
# 10.1.243/418.87.00=https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run
89+
# 10.2.89/440.33.01 =https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
90+
# 11.0.3/450.51.06 =https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run
91+
# 11.1.1/455.32.00 =https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
92+
# 11.2.2/460.32.03 =https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run
93+
# 11.3.1/465.19.01 =https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run
94+
# 11.4.4/470.82.01 =https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run
95+
# 11.5.2/495.29.05 =https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
96+
# 11.6.2/510.47.03 =https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run
97+
# 11.7.1/515.65.01 =https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run
98+
# 11.8.0/520.61.05 =https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
99+
# 12.0.1/525.85.12 =https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run
100+
# 12.1.1/530.30.02 =https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
101+
# 12.2.2/535.104.05 =https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run
102+
# 12.3.2/545.23.08 =https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run
87103
readonly -A drv_for_cuda=(
104+
["10.0.130"]="410.48"
105+
["10.1.234"]="418.87.00"
106+
["10.2.89"]="440.33.01"
107+
["11.0.3"]="450.51.06"
108+
["11.1.1"]="455.42.00"
109+
["11.2.2"]="460.32.03"
110+
["11.3.1"]="465.19.01"
111+
["11.4.4"]="470.82.01"
112+
["11.5.2"]="495.29.05"
113+
["11.6.2"]="510.47.03"
88114
["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
89115
["11.8.0"]="520.61.05"
90116
["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
@@ -108,7 +134,8 @@ function set_cuda_runfile_url() {
108134
CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
109135
readonly CUDA_RUNFILE
110136

111-
if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
137+
# version naming and archive url were erratic prior to 11.0.3
138+
if ( version_ge "${CUDA_FULL_VERSION}" "11.0.3" && ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ) ; then
112139
echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
113140
exit 1
114141
fi
@@ -292,13 +319,13 @@ function install_nvidia_nccl() {
292319
# Hopper: SM_90,SM_90a compute_90,compute_90a
293320
# Blackwell: SM_100, compute_100
294321
NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
295-
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
322+
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
323+
if version_gt "${CUDA_VERSION}" "11.6" ; then
324+
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
296325
if version_ge "${CUDA_VERSION}" "11.8" ; then
297-
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
298-
fi
326+
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
299327
if version_ge "${CUDA_VERSION}" "12.0" ; then
300-
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
301-
fi
328+
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
302329

303330
mkdir -p "${workdir}"
304331
pushd "${workdir}"
@@ -464,8 +491,8 @@ function add_repo_cuda() {
464491
}
465492

466493
function build_driver_from_github() {
467-
# non-GPL driver will have been built on rocky8
468-
if is_rocky8 ; then return 0 ; fi
494+
# non-GPL driver will have been built on rocky8, or when the driver version predates the first open kernel driver release
495+
if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then return 0 ; fi
469496
pushd "${workdir}"
470497

471498
test -d "${workdir}/open-gpu-kernel-modules" || {
@@ -592,7 +619,7 @@ function install_nvidia_userspace_runfile() {
592619
local cache_hit="0"
593620
local local_tarball
594621

595-
if is_rocky8 ; then
622+
if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then
596623
local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
597624
test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
598625
local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
@@ -604,7 +631,9 @@ function install_nvidia_userspace_runfile() {
604631

605632
if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
606633
cache_hit="1"
607-
runfile_args="--no-kernel-modules"
634+
if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
635+
runfile_args="${runfile_args} --no-kernel-modules"
636+
fi
608637
echo "cache hit"
609638
else
610639
install_build_dependencies
@@ -619,10 +648,13 @@ function install_nvidia_userspace_runfile() {
619648
--module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
620649
"
621650
fi
622-
runfile_args="--no-dkms ${signing_options}"
651+
runfile_args="${signing_options}"
652+
if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
653+
runfile_args="${runfile_args} --no-dkms"
654+
fi
623655
fi
624656
}
625-
else
657+
elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
626658
runfile_args="--no-kernel-modules"
627659
fi
628660

@@ -632,7 +664,7 @@ function install_nvidia_userspace_runfile() {
632664
--install-libglvnd \
633665
--tmpdir="${tmpdir}"
634666

635-
if is_rocky8 ; then
667+
if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then
636668
if [[ "${cache_hit}" == "1" ]] ; then
637669
gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
638670
depmod -a

templates/gpu/mig_functions

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,36 @@ function enable_mig() {
6666
is_complete enable-mig && return
6767

6868
# Start persistenced if it's not already running
69-
if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
69+
# if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
7070
for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
7171
# Write an ascii zero to the numa node indicator
7272
echo "0" | dd of="${f}" status=none
7373
done
74-
time nvsmi --gpu-reset # 30s
74+
# nvidia-smi --query-compute-apps=pid --format=csv,noheader
75+
for svc in resourcemanager nodemanager; do
76+
if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
77+
systemctl stop "hadoop-yarn-${svc}.service"
78+
fi
79+
done
80+
time nvsmi --gpu-reset || { # 30s
81+
echo "unable to reset gpu. Trying to stop services and kernel modules which may have a lock."
82+
# TODO: find a way to reset the A100 without reboot
83+
for tryno in {1..25} ; do ; removed="1"
84+
for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do
85+
if lsmod | grep -q "${mod}" ; then rmmod $mod > /dev/null 2>&1 || removed="0" ; fi ; done
86+
if [[ "${removed}" == "1" ]] ; then
87+
echo "modules removed successfully"
88+
nvsmi --gpu-reset
89+
break
90+
fi
91+
done
92+
}
7593
nvsmi -mig 1
94+
for svc in resourcemanager nodemanager; do
95+
if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
96+
systemctl start "hadoop-yarn-${svc}.service"
97+
fi
98+
done
7699
clear_nvsmi_cache
77100

78101
mark_complete enable-mig

templates/gpu/util_functions

Lines changed: 42 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,56 @@
11
function set_support_matrix() {
22
# CUDA version and Driver version
33
# https://docs.nvidia.com/deploy/cuda-compatibility/
4-
# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
4+
# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html#framework-matrix
55
# https://developer.nvidia.com/cuda-downloads
66

77
# Minimum supported version for open kernel driver is 515.43.04
88
# https://github.com/NVIDIA/open-gpu-kernel-modules/tags
9-
# Rocky8: 12.0: 525.147.05
109
local latest
1110
latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
1211
readonly -A DRIVER_FOR_CUDA=(
13-
["11.7"]="515.65.01" ["11.8"]="525.147.05"
14-
["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
12+
["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
13+
["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
14+
["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
15+
["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
16+
["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08"
17+
["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
1518
)
1619
readonly -A DRIVER_SUBVER=(
17-
["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01"
18-
["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01"
20+
["410"]="410.104" ["415"]="415.27" ["418"]="418.113" ["430"]="430.64"
21+
["435"]="435.21" ["440"]="440.100" ["450"]="450.119.03"
22+
["455"]="455.45.01" ["460"]="460.91.03" ["465"]="465.31"
23+
["470"]="470.256.02" ["495"]="495.46" ["510"]="510.108.03"
24+
["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05"
25+
["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142"
26+
["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.77"
1927
)
2028
# https://developer.nvidia.com/cudnn-downloads
21-
if is_debuntu ; then
2229
readonly -A CUDNN_FOR_CUDA=(
23-
["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17"
24-
["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17"
30+
["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" ["11.0"]="8.0.4"
31+
["11.1"]="8.0.5" ["11.2"]="8.1.1" ["11.3"]="8.2.1" ["11.4"]="8.2.4.15"
32+
["11.5"]="8.3.1.22" ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29"
33+
["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28"
34+
["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70"
35+
["12.5"]="9.2.1.18" ["12.6"]="9.6.0.74"
2536
)
26-
elif is_rocky ; then
27-
# rocky:
28-
# 12.0: 8.8.1.3
29-
# 12.1: 8.9.3.28
30-
# 12.2: 8.9.7.29
31-
# 12.3: 9.0.0.312
32-
# 12.4: 9.1.1.17
33-
# 12.5: 9.2.1.18
34-
# 12.6: 9.5.1.17
35-
readonly -A CUDNN_FOR_CUDA=(
36-
["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17"
37-
["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17"
38-
)
39-
fi
4037
# https://developer.nvidia.com/nccl/nccl-download
41-
# 12.2: 2.19.3, 12.5: 2.21.5
4238
readonly -A NCCL_FOR_CUDA=(
43-
["11.7"]="2.21.5" ["11.8"]="2.21.5"
44-
["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4"
39+
["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
40+
["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" ["11.5"]="2.11.4"
41+
["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5"
42+
["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3"
43+
["12.3"]="2.19.4" ["12.4"]="2.23.4" ["12.5"]="2.22.3"
44+
["12.6"]="2.23.4"
4545
)
4646
readonly -A CUDA_SUBVER=(
47-
["11.7"]="11.7.1" ["11.8"]="11.8.0"
48-
["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2"
47+
["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
48+
["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2"
49+
["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2"
50+
["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
51+
["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
52+
["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
53+
["12.6"]="12.6.3"
4954
)
5055
}
5156

@@ -131,7 +136,7 @@ function set_driver_version() {
131136

132137
export DRIVER_VERSION DRIVER
133138

134-
gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
139+
gpu_driver_url="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
135140
if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
136141
echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
137142
exit 1
@@ -197,19 +202,22 @@ function prepare_gpu_env(){
197202

198203
# Verify SPARK compatibility
199204
RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
200-
readonly RAPIDS_RUNTIME
205+
INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
206+
readonly RAPIDS_RUNTIME INCLUDE_GPUS
201207

202208
# determine whether we have nvidia-smi installed and working
203209
nvsmi
204210
}
205211

206-
# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
207-
# Users should run apt-mark unhold before they wish to upgrade these packages
212+
# Hold all NVIDIA-related packages from upgrading either unintentionally or
213+
# through use of services like unattended-upgrades
214+
#
215+
# Users should run apt-mark unhold before upgrading these packages
208216
function hold_nvidia_packages() {
209217
if ! is_debuntu ; then return ; fi
210218

211-
apt-mark hold nvidia-*
212-
apt-mark hold libnvidia-*
219+
apt-mark hold nvidia-* > /dev/null 2>&1
220+
apt-mark hold libnvidia-* > /dev/null 2>&1
213221
if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
214222
apt-mark hold xserver-xorg-video-nvidia*
215223
fi

0 commit comments

Comments
 (0)