Skip to content

Commit 86be527

Browse files
committed
update gpt-neox image to gpt-neox v2.0 and combine with gpt-neox build
1 parent c84f857 commit 86be527

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

Makefile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,11 @@ build-deepspeed-gpu: build-gpu-cuda-113-base
212212
-t $(NGC_REGISTRY)/$(GPU_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
213213
.
214214

215-
# This builds deepspeed environment off of a patched version of EleutherAI's fork of DeepSpeed
216-
# that we need for gpt-neox support.
215+
# This builds the environment for GPT-NeoX using EleutherAI's fork of DeepSpeed and our fork of the gpt-neox repo.
216+
# We disable Docker BuildKit (DOCKER_BUILDKIT=0) because building the DeepSpeed ops requires access to the NVIDIA container runtime during the image build.
217+
# See https://github.com/NVIDIA/nvidia-container-runtime/issues/153.
217218
.PHONY: build-gpt-neox-deepspeed-gpu
219+
build-gpt-neox-deepspeed-gpu: export DOCKER_BUILDKIT=0
218220
build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
219221
docker build -f Dockerfile-default-gpu \
220222
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_113_BASE_NAME)-$(SHORT_GIT_HASH)" \
@@ -223,7 +225,7 @@ build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
223225
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
224226
--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
225227
--build-arg DET_BUILD_NCCL="" \
226-
--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@eleuther_dai" \
228+
--build-arg DEEPSPEED_PIP="git+https://github.com/EleutherAI/DeeperSpeed.git@0a237296f760efd4f58eb3c32b6cdc429a39041a#egg=deepspeed" \
227229
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
228230
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
229231
-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \

dockerfile_scripts/install_deepspeed.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,24 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
77
python -m pip install triton==1.0.0
88
DS_BUILD_OPS=1 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
99
python -m deepspeed.env_report
10+
11+
# GPT-NeoX-specific setup. Runs only when DEEPSPEED_PIP points at an EleutherAI
# repository (i.e. the DeeperSpeed fork); other DeepSpeed builds skip this block.
if [[ "$DEEPSPEED_PIP" == *"EleutherAI"* ]]; then
    # Single absolute checkout location shared by every step below, so the clone,
    # the installs, the data download and the chmod all agree on the path
    # regardless of the directory this script is invoked from.
    GPT_NEOX_DIR=/gpt-neox

    # This is a dependency of gpt-neox
    DEBIAN_FRONTEND=noninteractive apt-get install -y mpich
    # Need this to avoid `AttributeError: module 'distutils' has no attribute 'version'` when importing tensorboard. See https://github.com/pytorch/pytorch/issues/69894.
    python -m pip install setuptools==59.5.0
    # Install gpt-neox and dependencies
    git clone -b determined2 https://github.com/determined-ai/gpt-neox.git "$GPT_NEOX_DIR"
    python "$GPT_NEOX_DIR/megatron/fused_kernels/setup.py" install

    # Exclude DeeperSpeed reinstall since the version in requirements is not pinned.
    # The filtered list goes through a temporary requirements file rather than an
    # unquoted $(grep ...) so that each requirement line reaches pip intact
    # (no shell word splitting or glob expansion of the file's contents).
    filtered_requirements="$(mktemp)"
    grep -iv "DeeperSpeed" "$GPT_NEOX_DIR/requirements/requirements.txt" > "$filtered_requirements"
    python -m pip install -r "$filtered_requirements"
    rm -f "$filtered_requirements"
    python -m pip install -r "$GPT_NEOX_DIR/requirements/requirements-flashattention.txt"

    # Download sample data; it is copied in as text_data and then renamed to data.
    gsutil cp -r gs://determined-ai-public-datasets/text_data "$GPT_NEOX_DIR" && mv "$GPT_NEOX_DIR/text_data" "$GPT_NEOX_DIR/data"

    # Modify permissions to enable example to run in nonroot mode
    chmod -R 777 "$GPT_NEOX_DIR"
    chmod -R 777 /tmp
fi

0 commit comments

Comments
 (0)