-
Notifications
You must be signed in to change notification settings - Fork 45
Dockerfile-default-rocm split into two separate files mainly aimed at different users. #272
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d54724a
12d78ec
3d95ef1
e58c7c5
a1137b3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| ARG BASE_IMAGE | ||
| FROM ${BASE_IMAGE} | ||
|
|
||
| # MAY NOT BE IMPORTANT ANYMORE | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it important?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess we can remove it, and if something breaks we can add it back in. |
||
| RUN apt install rocm-libs | ||
|
|
||
|
|
||
| # THIS FIX IS FOR SAWMILL, UNCLEAR IF NECESSARY FOR GENERAL USERS | ||
| #TODO: is this necessary? | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. TODO? |
||
| RUN apt remove -y openmpi ucx | ||
| #Let's remove existing /opt/ompi; and, link to our version. | ||
| RUN rm -rf /opt/ompi | ||
| RUN ln -s /container/ompi /opt | ||
| COPY dockerfile_scripts /tmp/det_dockerfile_scripts | ||
|
|
||
| # SHOULDN'T NEED TO SET SOME OF THESE VARIABLES | ||
| #USING OFI | ||
| #TODO: up until line 63 should be a separate shell script | ||
| ARG WITH_MPI=1 | ||
| ARG WITH_OFI=1 | ||
| ARG WITH_MPICH | ||
| ARG UCX_INSTALL_DIR=/container/ucx | ||
| ARG OMPI_INSTALL_DIR=/container/ompi | ||
| ARG MPICH_INSTALL_DIR=/container/mpich | ||
| ARG OFI_INSTALL_DIR=/container/ofi | ||
| ARG OMPI_WITH_CUDA=0 | ||
| ARG OMPI_WITH_ROCM=1 | ||
| RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi | ||
|
|
||
| # Make sure OMPI/UCX show up in the right paths | ||
| ARG VERBS_LIB_DIR=/usr/lib/libibverbs | ||
| ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64 | ||
| ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin | ||
| ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64 | ||
| ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin | ||
| ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib | ||
| ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin | ||
| ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib | ||
| ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin | ||
|
|
||
| # Set up UCX_LIBS and OFI_LIBS | ||
| ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:" | ||
| ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:" | ||
|
|
||
| # If WITH_OFI is true, then set EXTRA_LIBS to OFI libs, else set to empty string | ||
| ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}" | ||
|
|
||
| # If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs | ||
| ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}" | ||
|
|
||
| # But, only add them if WITH_MPI | ||
| ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH | ||
|
|
||
| #USING OFI | ||
| ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}} | ||
|
|
||
| #USING UCX | ||
| ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}} | ||
|
|
||
| ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH | ||
|
|
||
| # Enable running OMPI as root | ||
| ENV OMPI_ALLOW_RUN_AS_ROOT ${WITH_MPI:+1} | ||
| ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM ${WITH_MPI:+1} | ||
|
|
||
|
|
||
| ARG AWS_PLUGIN_INSTALL_DIR=/container/aws | ||
| ARG WITH_AWS_TRACE | ||
| ARG INTERNAL_AWS_DS | ||
| ARG INTERNAL_AWS_PATH | ||
| ARG ROCM_DIR=/opt/rocm | ||
| ENV ROCM_DIR $ROCM_DIR | ||
| RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi | ||
| ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH | ||
|
|
||
| # Set an entrypoint that can scrape up the host libfabric.so and then | ||
| # run the user command. This is intended to enable performant execution | ||
| # on non-IB systems that have a proprietary libfabric. | ||
|
|
||
| ARG WITH_RCCL=1 | ||
| ENV WITH_RCCL=$WITH_RCCL | ||
| ARG WITH_NFS_WORKAROUND=1 | ||
| ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND | ||
|
|
||
| RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin | ||
| ENTRYPOINT ["/container/bin/scrape_libs.sh"] | ||
|
|
||
| RUN rm -r /tmp/* | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| ARG BASE_IMAGE | ||
| FROM ${BASE_IMAGE} | ||
| #why no highlighting? | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove comment |
||
| ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 TT=0 | ||
|
|
||
| RUN mkdir -p /var/run/sshd | ||
| RUN rm /etc/apt/sources.list.d/rocm.list | ||
| RUN pip install --upgrade pip | ||
|
|
||
| COPY dockerfile_scripts /tmp/det_dockerfile_scripts | ||
|
|
||
| RUN apt-get update && /tmp/det_dockerfile_scripts/install_deb_packages.sh | ||
| RUN python --version | ||
| #RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh | ||
|
|
||
| # LIBFABRIC ISSUE | ||
| # USE CONDA FOR WORKAROUND | ||
| #TODO: MAY NOT BE A PROBLEM ANYMORE? | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. TODO? |
||
| # protect this image from slurm | ||
| #ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}" | ||
| ARG CONDA="${PATH}" | ||
|
|
||
| #RUN exit 1 | ||
| # Install fixed version of FFI package for Ubuntu 20.04. | ||
| # This is done after above stuff to make sure we get right version. | ||
| RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh | ||
| RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh | ||
| RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh | ||
|
|
||
|
|
||
| RUN pip install determined && pip uninstall -y determined | ||
| RUN apt update | ||
| RUN DEBIAN_FRONTEND=noninteractive apt-get install -y cargo | ||
|
|
||
| RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt | ||
|
|
||
| RUN python -m pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt && \ | ||
| jupyter labextension disable "@jupyterlab/apputils-extension:announcements" | ||
|
|
||
| ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config | ||
| ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data | ||
| ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime | ||
|
|
||
| #ENV HSA_FORCE_FINE_GRAIN_PCIE=1 #TODO: check if this is necessary | ||
|
|
||
| #RUN ldconfig #TODO: check if this is necessary | ||
| RUN echo A | ||
| #TODO: finish iterating here, preferably turn it into a shell script. | ||
| ARG DEEPSPEED_PIP | ||
| ARG DS_BUILD_CUTLASS_OPS=0 | ||
| ENV DS_BUILD_CUTLASS_OPS=0 | ||
|
|
||
| #RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi | ||
| #RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi | ||
| RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&&pip3 install ninja cmake;pip3 install triton==2.3.1;fi | ||
| #RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/microsoft/DeepSpeed.git && cd DeepSpeed && DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi | ||
| RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/microsoft/DeepSpeed.git && cd DeepSpeed && DS_BUILD_OPS=1 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_RANDOM_LTD=0 DS_BUILD_FUSED_ADAM=0 DS_BUILD_CCL_COMM=0 python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi | ||
| #RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git && cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi | ||
| RUN if [ -n "$DEEPSPEED_PIP" ]; then python -m deepspeed.env_report ; fi | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This deepspeed section definitely needs cleanup |
||
| RUN pip list | grep -i deepspeed | ||
| RUN echo "$DEEPSPEED_PIP" | ||
|
|
||
| RUN pip install tokenizers>=0.19 | ||
| RUN pip install transformers==4.43.3 | ||
| RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash | ||
| #RUN exit 1 | ||
| # MIOPEN_DEBUG_SAVE_TEMP_DIR is required to prevent | ||
| # PAD-133 | ||
| ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1 | ||
|
|
||
| CMD ["/bin/bash"] | ||
| USER root | ||
|
|
||
| RUN rm -r /tmp/* | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,11 @@ CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10- | |
| CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3- | ||
| CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8- | ||
| ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6- | ||
| ROCM_57_PREFIX := $(REGISTRY_REPO):rocm-5.7- | ||
| ROCM_60_PREFIX := $(REGISTRY_REPO):rocm-6.0- | ||
| ROCM_61_PREFIX := $(REGISTRY_REPO):rocm-6.1- | ||
| ROCM_60_TF_PREFIX := tensorflow-infinity-hub:tensorflow-infinity-hub | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are other images stored in |
||
|
|
||
|
|
||
| CPU_SUFFIX := -cpu | ||
| CUDA_SUFFIX := -cuda | ||
|
|
@@ -140,6 +145,15 @@ NGC_PYTORCH_HPC_REPO := pytorch-ngc-hpc-dev | |
| NGC_TF_REPO := tensorflow-ngc-dev | ||
| NGC_TF_HPC_REPO := tensorflow-ngc-hpc-dev | ||
|
|
||
| INFINITYHUB_PYTORCH_PREFIX := rocm/pytorch | ||
| INFINITYHUB_TENSORFLOW_PREFIX := rocm/tensorflow | ||
| INFINITYHUB_PYTORCH_VERSION := 2.1.2 | ||
| INFINITYHUB_TENSORFLOW_VERSION := | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove this line |
||
| export INFINITYHUB_PYTORCH_REPO := pytorch-infinityhub-dev | ||
| INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev | ||
| INFINITYHUB_TF_REPO := tensorflow-infinityhub-dev | ||
| INFINITYHUB_TF_HPC_REPO := tensorflow-infinityhub-hpc-dev | ||
|
|
||
| # build hpc together since hpc is dependent on the normal build | ||
| .PHONY: build-pytorch-ngc | ||
| build-pytorch-ngc: | ||
|
|
@@ -163,39 +177,157 @@ build-tensorflow-ngc: | |
| -t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \ | ||
| . | ||
|
|
||
|
|
||
| #DOCKER_BUILDKIT=0 docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \ | ||
| #docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \ | ||
| --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ | ||
|
|
||
| DEEPSPEED_VERSION := 0.13.0 | ||
| export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed | ||
| .PHONY: build-pytorch-infinityhub | ||
| build-pytorch-infinityhub: | ||
| docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \ | ||
| --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ | ||
| --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \ | ||
| --build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \ | ||
| . | ||
| docker build --shm-size='1gb' -f Dockerfile-infinityhub-hpc \ | ||
| --build-arg BASE_IMAGE=$(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_HPC_REPO)-$(SHORT_GIT_HASH) \ | ||
| . | ||
|
|
||
|
|
||
| ifeq ($(WITH_MPICH),1) | ||
| ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich | ||
| ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich | ||
| else | ||
| ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi | ||
| ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi | ||
| endif | ||
| export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI) | ||
| .PHONY: build-pytorch13-tf210-rocm56 | ||
| build-pytorch13-tf210-rocm56: | ||
| export ROCM61_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH13_MPI) | ||
| .PHONY: build-pytorch13-tf210-rocm60 | ||
| build-pytorch13-tf210-rocm60: | ||
| docker build -f Dockerfile-default-rocm \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1"\ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg HOROVOD_PIP="horovod==0.28.1" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \ | ||
| . | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_1.13.1" \ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg HOROVOD_PIP="horovod==0.28.1" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \ | ||
| . | ||
|
|
||
|
|
||
|
|
||
| ifeq ($(WITH_MPICH),1) | ||
| ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich | ||
| ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich | ||
| else | ||
| ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi | ||
| ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi | ||
| endif | ||
| export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI) | ||
| .PHONY: build-pytorch20-tf210-rocm56 | ||
| build-pytorch20-tf210-rocm56: | ||
|
|
||
| export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH_MPI) | ||
| .PHONY: build-pytorch20-tf210-rocm60 | ||
| build-pytorch20-tf210-rocm60: | ||
| docker build -f Dockerfile-default-rocm \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg HOROVOD_PIP="horovod==0.28.1" \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg HOROVOD_PIP="0" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \ | ||
| . | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \ | ||
| . | ||
|
|
||
|
|
||
|
|
||
| ifeq ($(WITH_MPICH),1) | ||
| ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich | ||
| else | ||
| ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi | ||
| endif | ||
| export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI) | ||
| .PHONY: build-pytorch20-tf210-rocm61 | ||
| build-pytorch20-tf210-rocm61: | ||
| docker build -f Dockerfile-default-rocm \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg HOROVOD_PIP="0" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \ | ||
| . | ||
|
|
||
| ifeq ($(WITH_MPICH),1) | ||
| ROCM61_TORCH_MPI :=pytorch-3.10-rocm-mpich | ||
| else | ||
| ROCM61_TORCH_MPI :=pytorch-3.10-rocm-ompi | ||
| endif | ||
| export ROCM61_TORCH_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI) | ||
| .PHONY: build-pytorch20-rocm61 | ||
| build-pytorch20-rocm61: | ||
| docker build -f Dockerfile-default-rocm \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ | ||
| --build-arg TENSORFLOW_PIP="0" \ | ||
| --build-arg HOROVOD_PIP="0" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(VERSION) \ | ||
| . | ||
|
|
||
|
|
||
|
|
||
|
|
||
| export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_TF_PREFIX) | ||
| build-tf210-rocm60: | ||
| docker build -f Dockerfile-tensorflow-rocm \ | ||
| --build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \ | ||
| --build-arg HOROVOD_PIP="0" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(VERSION) \ | ||
| . | ||
|
|
||
|
|
||
| export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX) | ||
| export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX) | ||
| export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html | ||
|
|
||
| export ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_57_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed | ||
| .PHONY: build-pytorch20-tf210-rocm57-deepspeed | ||
| build-pytorch20-tf210-rocm57-deepspeed: | ||
| docker build --shm-size='1gb' -f Dockerfile-default-rocm \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.1.1" \ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg HOROVOD_PIP="horovod==0.28.1" \ | ||
| --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \ | ||
| --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ | ||
| --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ | ||
| --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \ | ||
| --build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \ | ||
| . | ||
|
|
||
| export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed | ||
| .PHONY: build-pytorch20-tf210-rocm61-deepspeed | ||
| build-pytorch20-tf210-rocm61-deepspeed: | ||
| docker build --shm-size='1gb' -f Dockerfile-default-rocm \ | ||
| --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ | ||
| --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ | ||
| --build-arg HOROVOD_PIP="0" \ | ||
| --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \ | ||
| --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ | ||
| --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ | ||
| --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \ | ||
| --build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \ | ||
| --build-arg WITH_MPICH=$(WITH_MPICH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \ | ||
| -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \ | ||
| . | ||
|
|
||
|
|
||
|
|
||
| DEEPSPEED_VERSION := 0.8.3 | ||
| export GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := deepspeed-cuda-gpt-neox | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1 @@ | ||
| 0.33.1 | ||
| 0.33.2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How do we test these images?