determined-ai · will-HPE · Jun 27, 2024 · Jul 15, 2024 · Jul 16, 2024 · Jul 16, 2024
@@ -0,0 +1,88 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+# MAY NOT BE IMPORTANT ANYMORE
+# Use apt-get (stable CLI) with -y and refresh the package lists in the same
+# layer: a bare `apt install` aborts at the confirmation prompt in a
+# non-interactive build as soon as anything actually has to be installed.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-libs
+
+
+# THIS FIX IS FOR SAWMILL, UNCLEAR IF NECESSARY FOR GENERAL USERS
+# TODO: is this necessary?
+RUN apt-get remove -y openmpi ucx
+# Remove the existing /opt/ompi and link to our version: because /opt is a
+# directory, `ln -s /container/ompi /opt` creates /opt/ompi -> /container/ompi.
+RUN rm -rf /opt/ompi && ln -s /container/ompi /opt
+# Helper scripts used by the RUN steps below.
+COPY dockerfile_scripts /tmp/det_dockerfile_scripts
+
+# SHOULDN'T NEED TO SET SOME OF THESE VARIABLES
+#USING OFI
+#TODO: up until line 63 should be a separate shell script
+# Build-time switches. WITH_MPI and WITH_OFI default to "1"; the RUN steps in
+# this file compare them against "1" before invoking the build scripts.
+ARG WITH_MPI=1
+ARG WITH_OFI=1
+# No default: empty unless supplied with --build-arg.
+ARG WITH_MPICH
+# Install prefixes; the lib/bin directories used for LD_LIBRARY_PATH and PATH
+# further down are derived from these.
+ARG UCX_INSTALL_DIR=/container/ucx
+ARG OMPI_INSTALL_DIR=/container/ompi
+ARG MPICH_INSTALL_DIR=/container/mpich
+ARG OFI_INSTALL_DIR=/container/ofi
+# NOTE(review): OMPI_WITH_CUDA is not referenced by any instruction in this
+# file; presumably the build script reads it from the environment -- confirm.
+ARG OMPI_WITH_CUDA=0
+ARG OMPI_WITH_ROCM=1
+# Run the MPI build script only when WITH_MPI is exactly "1".
+# NOTE(review): UBUNTU_VERSION is not declared as an ARG in this file;
+# presumably it is inherited from the base image's environment -- confirm,
+# otherwise the script receives an empty first argument.
+RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi
+
+# Make sure OMPI/UCX show up in the right paths
+ARG VERBS_LIB_DIR=/usr/lib/libibverbs
+ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
+ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
+ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
+ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
+ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
+ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
+ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
+ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin
+
+# Set up UCX_LIBS and OFI_LIBS. Both end with ':' so they can be prepended
+# directly to an existing LD_LIBRARY_PATH.
+ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
+ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"
+
+# If WITH_OFI is non-empty, set EXTRA_LIBS to the OFI libs, else to the empty string
+ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"
+
+# If EXTRA_LIBS is empty, set it to the UCX libs, else leave it as the OFI libs
+ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"
+
+# But, only add them if WITH_MPI
+ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH
+
+# USING OFI: PATH becomes "$PATH:<ofi bin>:<mpich bin>" when WITH_OFI is set,
+# and the empty string otherwise (the UCX branch below then fills it in).
+ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}
+
+# USING UCX: only takes effect when the previous step left PATH empty.
+# NOTE(review): CONDA is not declared in this Dockerfile, so on this branch it
+# expands to whatever the base image environment provides (possibly nothing)
+# -- confirm before building with WITH_OFI unset.
+ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}
+
+# Put the OMPI and OFI executables first. Use the OFI *bin* directory: the
+# install prefix itself ($OFI_INSTALL_DIR) is not where the binaries live.
+ENV PATH=$OMPI_PATH_DIR:$OFI_PATH_DIR:$PATH
+
+# Enable running OMPI as root (key=value form; the space-separated ENV syntax
+# is deprecated).
+ENV OMPI_ALLOW_RUN_AS_ROOT=${WITH_MPI:+1}
+ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=${WITH_MPI:+1}
+
+
+# Build arguments for the AWS plugin build step below. The three without a
+# default are empty unless supplied with --build-arg.
+ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
+ARG WITH_AWS_TRACE
+ARG INTERNAL_AWS_DS
+ARG INTERNAL_AWS_PATH
+ARG ROCM_DIR=/opt/rocm
+# Persist ROCM_DIR into the image environment (key=value form; the
+# space-separated ENV syntax is deprecated).
+ENV ROCM_DIR=$ROCM_DIR
+# Only build the plugin when WITH_OFI is exactly "1".
+RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi
+# Prepend the plugin install dir to the loader path when WITH_OFI is set.
+ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
+
+# Expose the RCCL / NFS-workaround switches to the running container.
+ARG WITH_RCCL=1
+ENV WITH_RCCL=$WITH_RCCL
+ARG WITH_NFS_WORKAROUND=1
+ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND
+
+# Set an entrypoint that can scrape up the host libfabric.so and then
+# run the user command. This is intended to enable performant execution
+# on non-IB systems that have a proprietary libfabric.
+RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
+ENTRYPOINT ["/container/bin/scrape_libs.sh"]
+
+# Drop the build scripts. -f keeps this step from failing the build if the
+# glob ever matches nothing.
+RUN rm -rf /tmp/*
@@ -0,0 +1,74 @@
+# Base image is supplied by the caller via --build-arg BASE_IMAGE.
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+# Python runtime settings baked into the image environment.
+# NOTE(review): the purpose of TT=0 is not evident from this file -- confirm.
+ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 TT=0
+
+RUN mkdir -p /var/run/sshd
+# Remove the ROCm apt source before the apt-get update below.
+# NOTE(review): plain `rm` fails the build if the base image does not ship
+# this file -- confirm that every supported base image does.
+RUN rm /etc/apt/sources.list.d/rocm.list
+RUN pip install --upgrade pip
+
+# Helper scripts used by the RUN steps below.
+COPY dockerfile_scripts /tmp/det_dockerfile_scripts
+
+# Refresh package lists and install the Debian packages in the same layer.
+RUN apt-get update && /tmp/det_dockerfile_scripts/install_deb_packages.sh
+# Print the interpreter version into the build log.
+RUN python --version
+#RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh
+
+# LIBFABRIC ISSUE
+# USE CONDA FOR WORKAROUND
+#TODO: MAY NOT BE A PROBLEM ANYMORE?
+# protect this image from slurm
+#ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}"
+# CONDA defaults to ${PATH}; presumably meant to snapshot the base image's
+# PATH at this point in the build -- confirm. As an ARG it is build-time only
+# and is not carried into images built FROM this one.
+ARG CONDA="${PATH}"
+
+#RUN exit 1
+# Install fixed version of FFI package for Ubuntu 20.04.
+# This is done after above stuff to make sure we get right version.
+RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh
+RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh
+RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh
+
+
+# Install determined and immediately uninstall it again; presumably this is
+# done to leave its dependencies in the image -- confirm.
+RUN pip install determined && pip uninstall -y determined
+# Refresh the package lists in the same layer as the install, so a cached
+# `update` layer can never be paired with a newer install step.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y cargo
+
+RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt
+
+# Notebook stack; also turn off the JupyterLab announcements extension.
+RUN python -m pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt && \
+    jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
+
+ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
+ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data
+ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime
+
+#ENV HSA_FORCE_FINE_GRAIN_PCIE=1 #TODO: check if this is necessary
+
+#RUN ldconfig  #TODO: check if this is necessary
+#TODO: finish iterating here, preferably turn it into a shell script.
+# DEEPSPEED_PIP acts purely as an on/off switch in this file: when it is
+# non-empty, DeepSpeed is built from the upstream git repository (not
+# installed from the given pip spec); when it is empty, every DeepSpeed step
+# below is skipped.
+ARG DEEPSPEED_PIP
+ARG DS_BUILD_CUTLASS_OPS=0
+# Honour the build argument instead of hard-coding 0 (the default is unchanged).
+ENV DS_BUILD_CUTLASS_OPS=${DS_BUILD_CUTLASS_OPS}
+
+# Earlier iterations built triton from the ROCmSoftwarePlatform fork
+# (triton-mlir branch) and DeepSpeed from the ROCmSoftwarePlatform fork or
+# with the default op set; the steps below replaced them.
+#
+# Build prerequisites. Chained with && so a failed step fails the build
+# instead of being masked by a later command.
+RUN if [ -n "$DEEPSPEED_PIP" ]; then \
+        apt-get update && \
+        DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev && \
+        pip3 install ninja cmake && \
+        pip3 install triton==2.3.1; \
+    fi
+# Build and install DeepSpeed with the ops listed as =0 disabled, then print
+# the DeepSpeed environment report into the build log.
+RUN if [ -n "$DEEPSPEED_PIP" ]; then \
+        python -m pip install pydantic==1.10.11 && \
+        git clone https://github.com/microsoft/DeepSpeed.git && \
+        cd DeepSpeed && \
+        DS_BUILD_OPS=1 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_RANDOM_LTD=0 DS_BUILD_FUSED_ADAM=0 DS_BUILD_CCL_COMM=0 python3 setup.py build && \
+        python3 setup.py install && \
+        python -m deepspeed.env_report; \
+    fi
+# Sanity check that the package is visible to pip. Guarded like the steps
+# above: unguarded, grep exits non-zero and breaks every build that does not
+# request DeepSpeed.
+RUN echo "DEEPSPEED_PIP=$DEEPSPEED_PIP" && \
+    if [ -n "$DEEPSPEED_PIP" ]; then pip list | grep -i deepspeed; fi
+
+# The version specifier must be quoted: unquoted, the shell treats `>=0.19`
+# as an output redirection, so pip installs an unconstrained tokenizers and
+# its output lands in a file named "=0.19". Installing both packages in one
+# resolver run also lets pip pick a tokenizers that satisfies transformers.
+RUN pip install "tokenizers>=0.19" transformers==4.43.3
+# Runs the packagecloud setup script for the git-lfs repository.
+RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
+#RUN exit 1
+# MIOPEN_DEBUG_SAVE_TEMP_DIR is required to prevent
+# PAD-133
+ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1
+
+CMD ["/bin/bash"]
+USER root
+
+# Drop the build scripts. -f keeps this step from failing the build if the
+# glob ever matches nothing.
+RUN rm -rf /tmp/*
@@ -13,6 +13,11 @@ CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10-
 CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
 CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
 ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-
+# Image-name prefixes for the ROCm 5.7 / 6.0 / 6.1 environments, built from
+# $(REGISTRY_REPO).
+ROCM_57_PREFIX := $(REGISTRY_REPO):rocm-5.7-
+ROCM_60_PREFIX := $(REGISTRY_REPO):rocm-6.0-
+ROCM_61_PREFIX := $(REGISTRY_REPO):rocm-6.1-
+# NOTE(review): unlike the prefixes above, this one is a literal repo:tag and
+# does not derive from $(REGISTRY_REPO) -- confirm this is intended.
+ROCM_60_TF_PREFIX := tensorflow-infinity-hub:tensorflow-infinity-hub
+
 
 CPU_SUFFIX := -cpu
 CUDA_SUFFIX := -cuda
@@ -140,6 +145,15 @@ NGC_PYTORCH_HPC_REPO := pytorch-ngc-hpc-dev
 NGC_TF_REPO := tensorflow-ngc-dev
 NGC_TF_HPC_REPO := tensorflow-ngc-hpc-dev
 
+# Upstream base-image repositories and versions for the InfinityHub builds.
+INFINITYHUB_PYTORCH_PREFIX := rocm/pytorch
+INFINITYHUB_TENSORFLOW_PREFIX := rocm/tensorflow
+INFINITYHUB_PYTORCH_VERSION := 2.1.2
+# Intentionally (?) empty -- NOTE(review): no TensorFlow version is set yet.
+INFINITYHUB_TENSORFLOW_VERSION := 
+# Repository names used when tagging the images built from those bases.
+# Only the PyTorch repo name is exported to sub-processes.
+export INFINITYHUB_PYTORCH_REPO := pytorch-infinityhub-dev
+INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev
+INFINITYHUB_TF_REPO := tensorflow-infinityhub-dev
+INFINITYHUB_TF_HPC_REPO := tensorflow-infinityhub-hpc-dev
+
 # build hpc together since hpc is dependent on the normal build
 .PHONY: build-pytorch-ngc
 build-pytorch-ngc:
@@ -163,39 +177,157 @@ build-tensorflow-ngc:
 		-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
 		.
 
+
+# (A leftover, commented-out `docker build` fragment was removed here: its
+# tab-indented lines directly followed the build-tensorflow-ngc recipe and so
+# became part of that recipe.)
+
+DEEPSPEED_VERSION := 0.13.0
+# Snapshot the pip spec now (immediate assignment). DEEPSPEED_VERSION is
+# reassigned further down in this Makefile, and recipe text is only expanded
+# when the target runs, so using $(DEEPSPEED_VERSION) directly inside the
+# recipe would pick up that later value instead of the one set above.
+INFINITYHUB_DEEPSPEED_PIP := deepspeed==$(DEEPSPEED_VERSION)
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+# Build the InfinityHub PyTorch image, then the HPC image on top of it (the
+# second build uses the first image's tag as its BASE_IMAGE).
+# NOTE(review): the tags join repo and hash with '-' rather than ':' --
+# confirm that this naming is intended.
+.PHONY: build-pytorch-infinityhub
+build-pytorch-infinityhub:
+	docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \
+                --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+                --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+                --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+                --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+                --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+                --build-arg DEEPSPEED_PIP="$(INFINITYHUB_DEEPSPEED_PIP)" \
+                -t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \
+                .
+	docker build --shm-size='1gb' -f Dockerfile-infinityhub-hpc \
+                --build-arg BASE_IMAGE=$(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \
+                --build-arg WITH_MPICH=$(WITH_MPICH) \
+                -t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_HPC_REPO)-$(SHORT_GIT_HASH) \
+                .
+
+
 ifeq ($(WITH_MPICH),1)
-ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
+ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
 else
-ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
+ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
 endif
-export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)
-.PHONY: build-pytorch13-tf210-rocm56
-build-pytorch13-tf210-rocm56:
+# PyTorch 1.13 + TF 2.10 image on the ROCm 6.0 base; the name suffix is mpich
+# when WITH_MPICH is 1 and ompi otherwise. Tagged with both the short git
+# hash and VERSION.
+# NOTE(review): the variables are named ROCM61_* although the prefix and the
+# base image are ROCm 6.0 -- confirm the naming.
+export ROCM61_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH13_MPI)
+.PHONY: build-pytorch13-tf210-rocm60
+build-pytorch13-tf210-rocm60:
 	docker build -f Dockerfile-default-rocm \
-		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1"\
-		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
-		--build-arg HOROVOD_PIP="horovod==0.28.1" \
-		--build-arg WITH_MPICH=$(WITH_MPICH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
-		.
+                --build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_1.13.1" \
+                --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+                --build-arg HOROVOD_PIP="horovod==0.28.1" \
+                --build-arg WITH_MPICH=$(WITH_MPICH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
+                .
+
+
 
 ifeq ($(WITH_MPICH),1)
-ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
 else
-ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
 endif
-export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
-.PHONY: build-pytorch20-tf210-rocm56
-build-pytorch20-tf210-rocm56:
+
+# NOTE(review): ROCM61_TORCH_TF_ENVIRONMENT_NAME is assigned again further
+# down in this Makefile with $(ROCM_61_PREFIX). Recipe lines are expanded when
+# the recipe runs, so the -t tags of this target use that later value, not the
+# ROCM_60_PREFIX one assigned here. The BASE_IMAGE is also a rocm6.1 image
+# although the target is named rocm60 -- confirm what this target is meant to
+# build.
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH_MPI)
+.PHONY: build-pytorch20-tf210-rocm60
+build-pytorch20-tf210-rocm60:
 	docker build -f Dockerfile-default-rocm \
-		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
-		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
-		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+                --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+                --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+                --build-arg HOROVOD_PIP="0" \
                 --build-arg WITH_MPICH=$(WITH_MPICH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
-		.
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
+                .
+
+
+
+# Name suffix depends on the MPI flavour: mpich when WITH_MPICH is 1,
+# otherwise ompi.
+ifeq ($(WITH_MPICH),1)
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
+else
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
+endif
+# PyTorch + TF 2.10 image on the ROCm 6.1 / PyTorch 2.1.2 base, tagged with
+# both the short git hash and VERSION.
+# NOTE(review): HOROVOD_PIP is passed as "0"; presumably the Dockerfile treats
+# that as "do not install Horovod" -- confirm.
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)
+.PHONY: build-pytorch20-tf210-rocm61
+build-pytorch20-tf210-rocm61:
+	docker build -f Dockerfile-default-rocm \
+                --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+                --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+                --build-arg HOROVOD_PIP="0" \
+                --build-arg WITH_MPICH=$(WITH_MPICH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
+                .
+
+# Name suffix for the PyTorch-only ROCm 6.1 image: mpich when WITH_MPICH is 1,
+# otherwise ompi. This reassigns ROCM61_TORCH_MPI; the environment name below
+# captures it immediately (:=).
+# NOTE(review): "pytorch-3.10" looks like the Python version from the base
+# image tag rather than a PyTorch version -- confirm the intended name.
+ifeq ($(WITH_MPICH),1)
+ROCM61_TORCH_MPI :=pytorch-3.10-rocm-mpich
+else
+ROCM61_TORCH_MPI :=pytorch-3.10-rocm-ompi
+endif
+# Same base image as the TF variant, but TENSORFLOW_PIP and HOROVOD_PIP are
+# both passed as "0". Tagged with both the short git hash and VERSION.
+export ROCM61_TORCH_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)
+.PHONY: build-pytorch20-rocm61
+build-pytorch20-rocm61:
+	docker build -f Dockerfile-default-rocm \
+                --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+                --build-arg TENSORFLOW_PIP="0" \
+                --build-arg HOROVOD_PIP="0" \
+                --build-arg WITH_MPICH=$(WITH_MPICH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(VERSION) \
+                .
+
+
+
+
+# TensorFlow ROCm image built from Dockerfile-tensorflow-rocm, tagged with
+# both the short git hash and VERSION.
+# NOTE(review): the target and variable say tf210/rocm60, but the base image
+# tag is rocm6.1 / tf2.15 -- confirm which is intended.
+export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_TF_PREFIX)
+# Declared phony like every other build-* target, so a file or directory with
+# the same name can never make this target look up to date.
+.PHONY: build-tf210-rocm60
+build-tf210-rocm60:
+	docker build -f Dockerfile-tensorflow-rocm \
+                --build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \
+                --build-arg HOROVOD_PIP="0" \
+                --build-arg WITH_MPICH=$(WITH_MPICH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(VERSION) \
+                .
+
+
+# CUDA 11.3 DeepSpeed environment names and the matching torch pip spec.
+# These are immediate (:=) assignments, so GPU_DEEPSPEED_ENVIRONMENT_NAME
+# embeds whatever DEEPSPEED_VERSION holds at this point in the file.
+# NOTE(review): TORCH_PIP_DEEPSPEED_GPU pins cu113 (CUDA) wheels, yet it is
+# also passed as TORCH_PIP to the ROCm builds in this Makefile -- confirm.
+export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
+export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
+export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
+
+export ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_57_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+# Snapshot the pip spec now (immediate assignment). DEEPSPEED_VERSION is
+# reassigned further down in this Makefile, and recipe text is only expanded
+# when the target runs, so using $(DEEPSPEED_VERSION) directly inside the
+# recipe would pick up that later value instead of the one in effect here.
+ROCM57_DEEPSPEED_PIP := deepspeed==$(DEEPSPEED_VERSION)
+# ROCm 5.7 PyTorch + TF 2.10 image with DeepSpeed, tagged with both the short
+# git hash and VERSION.
+.PHONY: build-pytorch20-tf210-rocm57-deepspeed
+build-pytorch20-tf210-rocm57-deepspeed:
+	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
+                --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.1.1" \
+                --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+                --build-arg HOROVOD_PIP="horovod==0.28.1" \
+                --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+                --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+                --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
+                --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+                --build-arg DEEPSPEED_PIP="$(ROCM57_DEEPSPEED_PIP)" \
+                --build-arg WITH_MPICH=$(WITH_MPICH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
+                .
+
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+# Snapshot the pip spec now (immediate assignment). DEEPSPEED_VERSION is
+# reassigned further down in this Makefile, and recipe text is only expanded
+# when the target runs, so using $(DEEPSPEED_VERSION) directly inside the
+# recipe would pick up that later value instead of the one in effect here.
+ROCM61_DEEPSPEED_PIP := deepspeed==$(DEEPSPEED_VERSION)
+# ROCm 6.1 PyTorch + TF 2.10 image with DeepSpeed (HOROVOD_PIP passed as "0"),
+# tagged with both the short git hash and VERSION.
+.PHONY: build-pytorch20-tf210-rocm61-deepspeed
+build-pytorch20-tf210-rocm61-deepspeed:
+	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
+                --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+                --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+                --build-arg HOROVOD_PIP="0" \
+                --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+                --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+                --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
+                --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+                --build-arg DEEPSPEED_PIP="$(ROCM61_DEEPSPEED_PIP)" \
+                --build-arg WITH_MPICH=$(WITH_MPICH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
+                -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
+                .
+
+
 
 DEEPSPEED_VERSION := 0.8.3
 export GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := deepspeed-cuda-gpt-neox

@@ -1 +1 @@
-0.33.1
+0.33.2