File tree Expand file tree Collapse file tree 2 files changed +26
-3
lines changed Expand file tree Collapse file tree 2 files changed +26
-3
lines changed Original file line number Diff line number Diff line change @@ -212,9 +212,11 @@ build-deepspeed-gpu: build-gpu-cuda-113-base
212212 -t $(NGC_REGISTRY ) /$(GPU_DEEPSPEED_ENVIRONMENT_NAME ) -$(VERSION ) \
213213 .
214214
215- # This builds deepspeed environment off of a patched version of EleutherAI's fork of DeepSpeed
216- # that we need for gpt-neox support.
215+ # This builds the environment for GPT-NeoX using EleutherAI's fork of DeepSpeed and our fork of the gpt-neox repo.
216+ # We need to disable Docker BuildKit (DOCKER_BUILDKIT=0) to build the DeepSpeed ops, which require access to the NVIDIA runtime during the build.
217+ # See https://github.com/NVIDIA/nvidia-container-runtime/issues/153.
217218.PHONY : build-gpt-neox-deepspeed-gpu
219+ build-gpt-neox-deepspeed-gpu : export DOCKER_BUILDKIT=0
218220build-gpt-neox-deepspeed-gpu : build-gpu-cuda-113-base
219221 docker build -f Dockerfile-default-gpu \
220222 --build-arg BASE_IMAGE=" $( DOCKERHUB_REGISTRY) /$( GPU_CUDA_113_BASE_NAME) -$( SHORT_GIT_HASH) " \
@@ -223,7 +225,7 @@ build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
223225 --build-arg TORCH_CUDA_ARCH_LIST=" 6.0;6.1;6.2;7.0;7.5;8.0" \
224226 --build-arg APEX_GIT=" https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
225227 --build-arg DET_BUILD_NCCL=" " \
226- --build-arg DEEPSPEED_PIP=" git+https://github.com/determined-ai/deepspeed .git@eleuther_dai " \
228+ --build-arg DEEPSPEED_PIP=" git+https://github.com/EleutherAI/DeeperSpeed .git@0a237296f760efd4f58eb3c32b6cdc429a39041a#egg=deepspeed " \
227229 -t $(DOCKERHUB_REGISTRY ) /$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME ) -$(SHORT_GIT_HASH ) \
228230 -t $(DOCKERHUB_REGISTRY ) /$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME ) -$(VERSION ) \
229231 -t $(NGC_REGISTRY ) /$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME ) -$(SHORT_GIT_HASH ) \
Original file line number Diff line number Diff line change @@ -7,3 +7,24 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
77python -m pip install triton==1.0.0
88DS_BUILD_OPS=1 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
99python -m deepspeed.env_report
10+
# GPT-NeoX extras: executed only when DEEPSPEED_PIP refers to EleutherAI's
# DeepSpeed fork (substring match on "EleutherAI").
if [[ "$DEEPSPEED_PIP" == *"EleutherAI"* ]]; then
    # mpich is a dependency of gpt-neox.
    DEBIAN_FRONTEND=noninteractive apt-get install -y mpich

    # Pin setuptools to avoid
    # `AttributeError: module 'distutils' has no attribute 'version'`
    # when importing tensorboard. See https://github.com/pytorch/pytorch/issues/69894.
    python -m pip install setuptools==59.5.0

    # Install gpt-neox and its dependencies.
    # The later steps (flash-attention requirements, sample data, chmod) all
    # address the checkout as /gpt-neox, so clone to that absolute path instead
    # of relying on the current working directory being "/".
    git clone -b determined2 https://github.com/determined-ai/gpt-neox.git /gpt-neox
    python /gpt-neox/megatron/fused_kernels/setup.py install

    # Exclude DeeperSpeed from the requirements since the version listed there
    # is not pinned and would replace the build installed above.
    # The filtered list is written next to the original file and installed with
    # `-r`, so each requirement line is parsed by pip itself rather than being
    # word-split and glob-expanded by the shell.
    neox_requirements_dir=/gpt-neox/requirements
    grep -ivE "DeeperSpeed" "$neox_requirements_dir/requirements.txt" \
        > "$neox_requirements_dir/requirements-no-deeperspeed.txt"
    python -m pip install -r "$neox_requirements_dir/requirements-no-deeperspeed.txt"
    rm -f "$neox_requirements_dir/requirements-no-deeperspeed.txt"
    python -m pip install -r "$neox_requirements_dir/requirements-flashattention.txt"

    # Download sample data and expose it as /gpt-neox/data.
    # NOTE(review): if this script runs under `set -e`, a gsutil failure inside
    # this `&&` list does not abort the script — confirm that is acceptable.
    gsutil cp -r gs://determined-ai-public-datasets/text_data /gpt-neox && mv /gpt-neox/text_data /gpt-neox/data

    # Modify permissions to enable the example to run in nonroot mode.
    chmod -R 777 /gpt-neox
    chmod -R 777 /tmp
fi
You can’t perform that action at this time.
0 commit comments