Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions Dockerfile-infinityhub-hpc
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do we test these images?

Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# The parent image is supplied at build time with --build-arg BASE_IMAGE=...;
# no default is declared here.
ARG BASE_IMAGE
FROM $BASE_IMAGE

# MAY NOT BE IMPORTANT ANYMORE

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it important?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we can remove it and if something breaks then we add it back in.

# Install the rocm-libs package. `-y` is required because a docker build has no
# TTY to answer apt's confirmation prompt; the package index is refreshed in
# the same layer so the install does not depend on a stale cached index.
# NOTE(review): this step may no longer be needed — confirm, and drop it if the
# image builds and runs without it.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-libs


# THIS FIX IS FOR SAWMILL, UNCLEAR IF NECESSARY FOR GENERAL USERS
#TODO: is this necessary?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO?

# Replace any packaged OpenMPI/UCX with the builds kept under /container:
# remove the packages, delete the existing /opt/ompi, and make /opt/ompi a
# symlink to /container/ompi. Done as one layer because it is one logical step.
RUN apt-get remove -y openmpi ucx \
    && rm -rf /opt/ompi \
    && ln -s /container/ompi /opt/ompi
# Helper scripts invoked by the RUN steps further down.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

# Build-time configuration for the MPI / libfabric stack (OFI is used by default).
# NOTE(review): some of these variables may not need to be set here.
# TODO: move this section into a separate shell script.

# Install prefixes.
ARG UCX_INSTALL_DIR=/container/ucx
ARG OMPI_INSTALL_DIR=/container/ompi
ARG MPICH_INSTALL_DIR=/container/mpich
ARG OFI_INSTALL_DIR=/container/ofi

# Feature switches.
ARG WITH_MPI=1
ARG WITH_OFI=1
ARG WITH_MPICH
ARG OMPI_WITH_CUDA=0
ARG OMPI_WITH_ROCM=1

# Run the OpenMPI/ROCm build script only when WITH_MPI is exactly "1".
# NOTE(review): UBUNTU_VERSION is not declared as an ARG in this file; it is
# presumably provided by the base image's environment — confirm.
RUN if [ "$WITH_MPI" = "1" ]; then \
        /tmp/det_dockerfile_scripts/ompi_rocm.sh \
            "$UBUNTU_VERSION" \
            "$WITH_OFI" \
            "$OMPI_WITH_ROCM" \
            "$WITH_MPICH"; \
    fi

# Make sure OMPI/UCX show up in the right paths
# Library and binary directories derived from the *_INSTALL_DIR build arguments.
ARG VERBS_LIB_DIR=/usr/lib/libibverbs
ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin

# Set up UCX_LIBS and OFI_LIBS
# Both lists end with ':' so they can be prepended directly to LD_LIBRARY_PATH.
ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"

# If WITH_OFI is set and non-empty, set EXTRA_LIBS to OFI libs, else set to empty string.
# ${VAR:+word} only tests for non-emptiness, so WITH_OFI=0 also selects OFI here.
ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"

# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs
ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"

# But, only add them if WITH_MPI
ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH

#USING OFI
# Leaves PATH empty when WITH_OFI is empty, which triggers the UCX fallback below.
ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}

#USING UCX
# NOTE(review): CONDA is not declared in this Dockerfile, so it expands to
# whatever the base image's environment provides (possibly nothing) — confirm.
ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}

# NOTE(review): this prepends the OFI install prefix (OFI_INSTALL_DIR), not its
# bin directory (OFI_PATH_DIR) — confirm that is intended.
ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH

# Enable running OMPI as root
# key=value form replaces the deprecated space-separated ENV syntax.
ENV OMPI_ALLOW_RUN_AS_ROOT=${WITH_MPI:+1}
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=${WITH_MPI:+1}


# Settings consumed by build_aws_rocm.sh, which runs only when WITH_OFI is
# exactly "1".
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
ARG WITH_AWS_TRACE
ARG INTERNAL_AWS_DS
ARG INTERNAL_AWS_PATH
ARG ROCM_DIR=/opt/rocm
# Persist ROCM_DIR into the image environment (key=value form; the
# space-separated ENV syntax is deprecated).
ENV ROCM_DIR=$ROCM_DIR
RUN if [ "$WITH_OFI" = "1" ]; then \
        /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; \
    fi
# Prepend the plugin install directory to the library search path whenever
# WITH_OFI is non-empty.
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH

# Runtime switches: each build argument (default 1) is copied into the image
# environment under the same name.
ARG WITH_RCCL=1
ARG WITH_NFS_WORKAROUND=1
ENV WITH_RCCL=${WITH_RCCL} \
    WITH_NFS_WORKAROUND=${WITH_NFS_WORKAROUND}

# The entrypoint script scrapes up the host libfabric.so and then runs the
# user command. This is intended to enable performant execution on non-IB
# systems that have a proprietary libfabric.
RUN mkdir -p /container/bin \
    && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
ENTRYPOINT ["/container/bin/scrape_libs.sh"]

# Clear out /tmp, including the copied helper scripts.
RUN rm -r /tmp/*
74 changes: 74 additions & 0 deletions Dockerfile-infinityhub-pytorch
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# The parent image is supplied at build time with --build-arg BASE_IMAGE=...;
# no default is declared here.
ARG BASE_IMAGE
FROM $BASE_IMAGE

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove comment

# Python runtime settings baked into the image.
# NOTE(review): the purpose of TT=0 is not evident from this file — confirm.
ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 TT=0

RUN mkdir -p /var/run/sshd
# Remove the ROCm apt source list so the apt-get update below does not read it.
RUN rm /etc/apt/sources.list.d/rocm.list
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --upgrade pip

# Helper scripts invoked by the RUN steps further down.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

# Refresh the package index and run the package-install script in one layer.
RUN apt-get update && /tmp/det_dockerfile_scripts/install_deb_packages.sh
# Record the interpreter version in the build log.
RUN python --version

# LIBFABRIC ISSUE
# USE CONDA FOR WORKAROUND
#TODO: MAY NOT BE A PROBLEM ANYMORE?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO?

# Protect this image from Slurm: capture the PATH in effect at this point in
# the build argument CONDA. The previous approach (prepending the conda env to
# PATH) is kept commented out for reference.
#ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}"
ARG CONDA="$PATH"

# Install fixed version of FFI package for Ubuntu 20.04; this runs after the
# steps above to make sure the right version ends up installed. The two
# remaining helper scripts run in the same layer, in their original order.
RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh \
    && /tmp/det_dockerfile_scripts/add_det_nobody_user.sh \
    && /tmp/det_dockerfile_scripts/install_libnss_determined.sh


# Install determined and immediately uninstall it.
# NOTE(review): presumably this leaves its dependencies installed without the
# package itself — confirm.
RUN pip install determined && pip uninstall -y determined
# Refresh the package index in the same layer as the install so a cached
# "update" layer can never be paired with a newer install request.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y cargo

RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt

# Install the notebook requirements and turn off the JupyterLab announcements
# extension.
RUN python -m pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt && \
    jupyter labextension disable "@jupyterlab/apputils-extension:announcements"

# Point Jupyter's config, data and runtime directories under /run/determined.
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data
ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime

#ENV HSA_FORCE_FINE_GRAIN_PCIE=1 #TODO: check if this is necessary

#RUN ldconfig #TODO: check if this is necessary

# Optional DeepSpeed build. Every step below is skipped unless DEEPSPEED_PIP is
# non-empty. Only its non-emptiness is tested: DeepSpeed itself is built from
# the default branch of the microsoft/DeepSpeed repository, not from the pip
# spec. Earlier commented-out variants (triton and DeepSpeed built from the
# ROCmSoftwarePlatform forks) were removed; see version control history.
#TODO: finish iterating here, preferably turn it into a shell script.
ARG DEEPSPEED_PIP
ARG DS_BUILD_CUTLASS_OPS=0
# Honour the build argument (default 0) instead of hard-coding the value.
ENV DS_BUILD_CUTLASS_OPS=$DS_BUILD_CUTLASS_OPS

# Build prerequisites and triton. The steps are chained with && throughout so
# a failing apt-get or pip step fails the build instead of being masked by a
# later command.
RUN if [ -n "$DEEPSPEED_PIP" ]; then \
        DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev \
        && pip3 install ninja cmake \
        && pip3 install triton==2.3.1; \
    fi

# Clone, build and install DeepSpeed, then print its environment report.
# NOTE(review): the DS_BUILD_* assignments prefix only the `setup.py build`
# command, not `setup.py install` — confirm the install step does not need them.
RUN if [ -n "$DEEPSPEED_PIP" ]; then \
        DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev \
        && python -m pip install pydantic==1.10.11 \
        && git clone https://github.com/microsoft/DeepSpeed.git \
        && cd DeepSpeed \
        && DS_BUILD_OPS=1 \
           DS_BUILD_RAGGED_DEVICE_OPS=0 \
           DS_BUILD_CUTLASS_OPS=0 \
           DS_BUILD_SPARSE_ATTN=0 \
           DS_BUILD_EVOFORMER_ATTN=0 \
           DS_BUILD_RANDOM_LTD=0 \
           DS_BUILD_FUSED_ADAM=0 \
           DS_BUILD_CCL_COMM=0 \
           python3 setup.py build \
        && python3 setup.py install \
        && python -m deepspeed.env_report; \
    fi
RUN if [ -n "$DEEPSPEED_PIP" ]; then python -m deepspeed.env_report ; fi

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This deepspeed section definitely needs cleanup

# Check that DeepSpeed is installed, but only when it was requested; the
# DeepSpeed build steps are themselves conditional on DEEPSPEED_PIP, so an
# unconditional grep would fail the build whenever it is left empty.
RUN if [ -n "$DEEPSPEED_PIP" ]; then pip list | grep -i deepspeed; fi
# Record the requested DeepSpeed spec in the build log.
RUN echo "$DEEPSPEED_PIP"

# The requirement must be quoted: unquoted, the shell treats ">=0.19" as a
# redirection of stdout to a file named "=0.19" and pip sees no version bound.
RUN pip install "tokenizers>=0.19"
RUN pip install transformers==4.43.3
# Run the git-lfs packagecloud repository setup script.
# NOTE(review): without pipefail a failed download is not detected here, and no
# git-lfs package install is visible in this file — confirm both.
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
# MIOPEN_DEBUG_SAVE_TEMP_DIR is required to prevent
# PAD-133
ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1

CMD ["/bin/bash"]
USER root

# Clear out /tmp, including the copied helper scripts.
RUN rm -r /tmp/*
178 changes: 155 additions & 23 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10-
CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-
# Image-name prefixes (repository plus the leading part of the tag) for the
# newer ROCm releases.
ROCM_57_PREFIX := $(REGISTRY_REPO):rocm-5.7-
ROCM_60_PREFIX := $(REGISTRY_REPO):rocm-6.0-
ROCM_61_PREFIX := $(REGISTRY_REPO):rocm-6.1-
# NOTE(review): unlike the prefixes above, this one is a fixed repo:tag string
# that does not use $(REGISTRY_REPO) — confirm that is intended.
ROCM_60_TF_PREFIX := tensorflow-infinity-hub:tensorflow-infinity-hub
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are the other images stored in REGISTRY_REPO := environments, or in a repo with a -dev suffix, but this one is not?



CPU_SUFFIX := -cpu
CUDA_SUFFIX := -cuda
Expand Down Expand Up @@ -140,6 +145,15 @@ NGC_PYTORCH_HPC_REPO := pytorch-ngc-hpc-dev
NGC_TF_REPO := tensorflow-ngc-dev
NGC_TF_HPC_REPO := tensorflow-ngc-hpc-dev

# Upstream InfinityHub (ROCm) base-image repositories and the PyTorch version.
# The empty INFINITYHUB_TENSORFLOW_VERSION placeholder was dropped: an
# undefined make variable already expands to the empty string.
INFINITYHUB_PYTORCH_PREFIX := rocm/pytorch
INFINITYHUB_TENSORFLOW_PREFIX := rocm/tensorflow
INFINITYHUB_PYTORCH_VERSION := 2.1.2

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this line

# Repository names for the images built from the InfinityHub base images.
# NOTE(review): only the PyTorch repo name is exported to sub-processes; the
# other three are plain make variables — confirm that is intended.
export INFINITYHUB_PYTORCH_REPO := pytorch-infinityhub-dev
INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev
INFINITYHUB_TF_REPO := tensorflow-infinityhub-dev
INFINITYHUB_TF_HPC_REPO := tensorflow-infinityhub-hpc-dev

# build hpc together since hpc is dependent on the normal build
.PHONY: build-pytorch-ngc
build-pytorch-ngc:
Expand All @@ -163,39 +177,157 @@ build-tensorflow-ngc:
-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
.


#DOCKER_BUILDKIT=0 docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \
#docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \

DEEPSPEED_VERSION := 0.13.0
export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed

# Build the InfinityHub PyTorch image, then the HPC image that uses it as its
# base image.
#
# Recipes are expanded when they run, i.e. after the whole Makefile has been
# read, so a later global reassignment of DEEPSPEED_VERSION would otherwise be
# the value seen here. The target-specific assignment pins it for this target.
#
# Only build arguments that Dockerfile-infinityhub-pytorch declares are passed.
#
# NOTE(review): the tags are formed as <repo>-<hash> (no ':' separator), so the
# hash becomes part of the repository name — confirm that is intended.
.PHONY: build-pytorch-infinityhub
build-pytorch-infinityhub: DEEPSPEED_VERSION := 0.13.0
build-pytorch-infinityhub:
	docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \
		.
	docker build --shm-size='1gb' -f Dockerfile-infinityhub-hpc \
		--build-arg BASE_IMAGE=$(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_HPC_REPO)-$(SHORT_GIT_HASH) \
		.


# PyTorch 1.13 + TensorFlow 2.10 ROCm images; the name suffix is -mpich when
# WITH_MPICH is 1 and -ompi otherwise.
# NOTE(review): the rocm56 and rocm60 definitions appear interleaved below —
# confirm which lines belong to which target.
# NOTE(review): the image names say pytorch-1.3 while the base images are
# pytorch_1.13.1, and ROCM61_TORCH13_TF_ENVIRONMENT_NAME is built from
# ROCM_60_PREFIX — confirm the intended naming.
ifeq ($(WITH_MPICH),1)
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
else
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
endif
export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)
.PHONY: build-pytorch13-tf210-rocm56
build-pytorch13-tf210-rocm56:
export ROCM61_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH13_MPI)
.PHONY: build-pytorch13-tf210-rocm60
build-pytorch13-tf210-rocm60:
docker build -f Dockerfile-default-rocm \
--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1"\
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="horovod==0.28.1" \
--build-arg WITH_MPICH=$(WITH_MPICH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
.
--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_1.13.1" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="horovod==0.28.1" \
--build-arg WITH_MPICH=$(WITH_MPICH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
.



# PyTorch 2.x + TensorFlow 2.10 ROCm images; the name suffix is -mpich when
# WITH_MPICH is 1 and -ompi otherwise.
# NOTE(review): the rocm56 and rocm60 definitions appear interleaved below —
# confirm which lines belong to which target.
# NOTE(review): ROCM61_TORCH_TF_ENVIRONMENT_NAME is assigned here from
# ROCM_60_PREFIX and assigned again further down the file from ROCM_61_PREFIX.
# Recipes expand variables when they run, so the -t tags of the rocm60 target
# would use the later value — confirm the intended tag.
ifeq ($(WITH_MPICH),1)
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
else
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
endif
export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
.PHONY: build-pytorch20-tf210-rocm56
build-pytorch20-tf210-rocm56:

export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH_MPI)
.PHONY: build-pytorch20-tf210-rocm60
build-pytorch20-tf210-rocm60:
docker build -f Dockerfile-default-rocm \
--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="horovod==0.28.1" \
--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="0" \
--build-arg WITH_MPICH=$(WITH_MPICH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.



# ROCm 6.1 PyTorch + TensorFlow 2.10 image. The name suffix is -mpich when
# WITH_MPICH is exactly 1 and -ompi in every other case.
ifneq ($(WITH_MPICH),1)
ROCM61_TORCH_MPI := pytorch-2.0-tf-2.10-rocm-ompi
else
ROCM61_TORCH_MPI := pytorch-2.0-tf-2.10-rocm-mpich
endif
export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)

# Tagged once with the short git hash and once with the release version.
.PHONY: build-pytorch20-tf210-rocm61
build-pytorch20-tf210-rocm61:
	docker build \
		--file Dockerfile-default-rocm \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
		--build-arg HOROVOD_PIP="0" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		.

# ROCm 6.1 PyTorch-only image (TENSORFLOW_PIP and HOROVOD_PIP are passed as
# "0"). The name suffix is -mpich when WITH_MPICH is exactly 1 and -ompi in
# every other case. ROCM61_TORCH_MPI is reassigned here; the export below
# uses := and therefore captures the value set in this section.
ifneq ($(WITH_MPICH),1)
ROCM61_TORCH_MPI := pytorch-3.10-rocm-ompi
else
ROCM61_TORCH_MPI := pytorch-3.10-rocm-mpich
endif
export ROCM61_TORCH_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)

# Tagged once with the short git hash and once with the release version.
.PHONY: build-pytorch20-rocm61
build-pytorch20-rocm61:
	docker build \
		--file Dockerfile-default-rocm \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(VERSION) \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
		--build-arg TENSORFLOW_PIP="0" \
		--build-arg HOROVOD_PIP="0" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		.




# TensorFlow ROCm image built from Dockerfile-tensorflow-rocm.
# NOTE(review): the target and variable names say tf210/rocm60 while the base
# image is rocm6.1 / tf2.15 — confirm the intended naming.
export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_TF_PREFIX)
# Declared phony like the other build targets, so a file or directory named
# build-tf210-rocm60 can never make the target look up to date.
.PHONY: build-tf210-rocm60
build-tf210-rocm60:
	docker build -f Dockerfile-tensorflow-rocm \
		--build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \
		--build-arg HOROVOD_PIP="0" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(VERSION) \
		.


# CUDA 11.3 DeepSpeed image names and the matching torch wheel set.
# NOTE(review): := expands $(DEEPSPEED_VERSION) immediately, so the first name
# embeds whichever DEEPSPEED_VERSION assignment most recently precedes this
# line in the Makefile — confirm that is the intended version.
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# ROCm 5.7 PyTorch + TensorFlow 2.10 image with DeepSpeed, tagged once with the
# short git hash and once with the release version.
# NOTE(review): $(DEEPSPEED_VERSION) in the recipe is expanded when the recipe
# runs, so it takes the last DEEPSPEED_VERSION assignment in the Makefile —
# confirm that is the intended version.
export ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_57_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed

.PHONY: build-pytorch20-tf210-rocm57-deepspeed
build-pytorch20-tf210-rocm57-deepspeed:
	docker build \
		--shm-size='1gb' \
		--file Dockerfile-default-rocm \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.1.1" \
		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
		--build-arg HOROVOD_PIP="horovod==0.28.1" \
		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		.

# ROCm 6.1 PyTorch + TensorFlow 2.10 image with DeepSpeed (HOROVOD_PIP is
# passed as "0"), tagged once with the short git hash and once with the
# release version.
# NOTE(review): $(DEEPSPEED_VERSION) in the recipe is expanded when the recipe
# runs, so it takes the last DEEPSPEED_VERSION assignment in the Makefile —
# confirm that is the intended version.
export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed

.PHONY: build-pytorch20-tf210-rocm61-deepspeed
build-pytorch20-tf210-rocm61-deepspeed:
	docker build \
		--shm-size='1gb' \
		--file Dockerfile-default-rocm \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
		--tag $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
		--build-arg HOROVOD_PIP="0" \
		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		.



DEEPSPEED_VERSION := 0.8.3
export GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := deepspeed-cuda-gpt-neox
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.33.1
0.33.2
Loading