Commit 058fb0b

Merge branch 'master' into gather_output
2 parents: 761d1fb + 3054b93

44 files changed (+1314, -195 lines)

.github/workflows/cpu-torch-latest.yml

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ jobs:
 git clone https://github.com/huggingface/transformers
 cd transformers
 # if needed switch to the last known good SHA until transformers@master is fixed
-git checkout 981c276
+# git checkout 981c276
 git rev-parse --short HEAD
 pip install .

.github/workflows/hpu-gaudi2-nightly.yml

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,8 @@ jobs:
 test_zero_leaf_module.py
 test_zero_offloadpp.py
 test_zero_tiled.py
+test_autotp_training.py
+test_ulysses.py
 
 # Steps represent a sequence of tasks that will be executed as part of the job
 steps:

.github/workflows/hpu-gaudi2.yml

Lines changed: 3 additions & 1 deletion
@@ -94,6 +94,8 @@ jobs:
 test_zero_nesting_init.py
 test_zeropp.py
 (test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
+(test_linear.py and (TestLoRALinear or TestBasicLinear))
+(test_ctx.py and TestEngine)
 
 # Steps represent a sequence of tasks that will be executed as part of the job
 steps:
@@ -112,7 +114,7 @@ jobs:
 git clone https://github.com/huggingface/transformers
 cd transformers
 # if needed switch to the last known good SHA until transformers@master is fixed
-git checkout 981c276
+# git checkout 981c276
 git rev-parse --short HEAD
 pip install .

.github/workflows/nv-a6000.yml

Lines changed: 4 additions & 4 deletions
@@ -23,7 +23,7 @@ jobs:
 unit-tests:
 runs-on: [self-hosted, nvidia, a6000]
 container:
-image: nvcr.io/nvidia/pytorch:24.09-py3
+image: nvcr.io/nvidia/pytorch:24.12-py3
 ports:
 - 80
 options: --gpus all --shm-size "8G"
@@ -43,7 +43,7 @@ jobs:
 git clone https://github.com/huggingface/transformers
 cd transformers
 # if you need to use an older transformers version temporarily in case of breakage
-git checkout 981c276
+# git checkout 981c276
 git rev-parse --short HEAD
 python -m pip install .
 - name: Install deepspeed
@@ -58,8 +58,8 @@ jobs:
 run: |
 unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
 cd tests
-python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
-python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
+python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.6" --cuda_ver="12"
+python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.6" --cuda_ver="12"
 - name: MII unit tests
 run: |
 BRANCH="main"
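The workflow commands above select tests with pytest's `-m` marker expressions (e.g. `-m 'inference_v2'`). A minimal sketch of how that filtering behaves, using a made-up test file and the marker name from the workflow (a warning about the unregistered marker is expected):

```python
import os
import subprocess
import sys
import tempfile
import textwrap

# Hypothetical demo of pytest's -m marker filtering, as used in the workflow
# commands above. The test file and its contents are made up for illustration.
demo = textwrap.dedent("""
    import pytest

    @pytest.mark.inference_v2
    def test_marked():
        assert True

    def test_unmarked():
        assert True
""")

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "test_demo.py")
    with open(path, "w") as f:
        f.write(demo)
    # -m selects only tests carrying the inference_v2 marker; others are deselected.
    result = subprocess.run(
        [sys.executable, "-m", "pytest", "-q", "-m", "inference_v2", path],
        capture_output=True, text=True,
    )

print(result.stdout)
```

Since `-m` takes a full expression, something like `-m 'inference_v2 or inference_v2_ops'` could run both groups in a single invocation.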

.github/workflows/nv-flash-attn.yml

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ jobs:
 unit-tests:
 runs-on: [self-hosted, nvidia, a6000]
 container:
-image: nvcr.io/nvidia/pytorch:24.09-py3
+image: nvcr.io/nvidia/pytorch:24.12-py3
 ports:
 - 80
 options: --gpus all --shm-size "8G"
@@ -53,7 +53,7 @@ jobs:
 run: |
 unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
 cd tests
-python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
+python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.6" --cuda_ver="12"
 - name: Open GitHub issue if nightly CI fails
 if: ${{ failure() && (github.event_name == 'schedule') }}
 uses: JasonEtco/create-an-issue@v2

.github/workflows/nv-human-eval.yml

Lines changed: 2 additions & 2 deletions
@@ -11,7 +11,7 @@ jobs:
 unit-tests:
 runs-on: [self-hosted, nvidia, a6000]
 container:
-image: nvcr.io/nvidia/pytorch:24.09-py3
+image: nvcr.io/nvidia/pytorch:24.12-py3
 ports:
 - 80
 options: --gpus all --shm-size "8G"
@@ -50,4 +50,4 @@ jobs:
 run: |
 unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
 cd tests
-python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"
+python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.6" --cuda_ver="12"

.github/workflows/nv-torch-nightly-v100.yml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ jobs:
 git clone https://github.com/huggingface/transformers
 cd transformers
 # if needed switch to the last known good SHA until transformers@master is fixed
-git checkout 981c276
+# git checkout 981c276
 git rev-parse --short HEAD
 pip install .

CONTRIBUTING.md

Lines changed: 9 additions & 10 deletions
@@ -48,16 +48,15 @@ pytest run_sanity_check.py
 ```
 Note that the `--forked` flag is not necessary for the model tests.
 
-## Contributor License Agreement
-This project welcomes contributions and suggestions. Most contributions require you to
-agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
-actually do, grant us the rights to use your contribution. For details, visit
-https://cla.opensource.microsoft.com.
-
-When you submit a pull request, a CLA bot will automatically determine whether you need
-to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
-follow the instructions provided by the bot. You will only need to do this once across
-all repos using our CLA.
+## Developer Certificate of Origin
+This project welcomes contributions and suggestions. All contributions to deepspeedai projects
+require commits to be signed off with a [Developer Certificate of Origin](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin)
+(DCO) declaring that you have the right to, and actually do, grant us the rights to use your contribution.
+
+When you submit a pull request, the DCO app will check for the presence of signed commits.
+Information about how this check works is here: https://github.com/dcoapp/app?tab=readme-ov-file#how-it-works
+
+To sign commits, you will need to include `-s` when running `git commit`. For example, `git commit -s -m "Commit message"`. One note, creating PRs via the GitHub interface do not appear to include this option. If you forget this, clicking on the failing check in your PR will point you to commands you can run to rebase and sign previous commits.
 
 ## Code of Conduct
 This project has adopted the [Microsoft Open Source Code of
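The sign-off flow in the CONTRIBUTING.md change above can be exercised locally; a minimal sketch using a throwaway repository (the name, email, and file are illustrative only):

```shell
set -e

# Create a throwaway repo; the identity values here are illustrative.
repo=$(mktemp -d)
cd "$repo"
git init -q
git config user.name "Jane Developer"
git config user.email "jane@example.com"

echo "hello" > file.txt
git add file.txt
# -s (--signoff) appends the Signed-off-by trailer the DCO check looks for.
git commit -q -s -m "Add file"

# Show the commit message with its trailer.
git log -1 --format=%B
```

If a commit was already made without `-s`, `git commit --amend -s --no-edit` adds the trailer to the most recent commit.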

README.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 
 ## Latest News
 <b> <span style="color:orange" > DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat)</span>.</b>
-
+* [2025/03] [DeepSpeed-AutoTP: Automatic Tensor Parallel Training of Hugging Face models](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/huggingface-tp/README.md)
 * [2024/12] [Ulysses-Offload: Democratizing Long Context LLM Training ](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/ulysses-offload/README.md)
 * [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
 * [2024/08] [DeepSpeed on Windows](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/chinese/README.md)]

blogs/deepspeed-gds/README.md

Lines changed: 2 additions & 2 deletions
@@ -47,7 +47,7 @@ We used three benchmarking tools for our evaluations. The first is fio, the popu
 
 ## High-Performance I/O with CPU Buffers via NVMe Scaling
 
-Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments1. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
+Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
 
 <img src="./media/figure1.png" style="width:6.5in;height:3.42153in" />
 
@@ -85,4 +85,4 @@ In this blog post, we introduced DeepNVMe, an I/O optimization technology create
 
 
 # Acknowlegements
-This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nivida.
+This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nvidia.
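The libaio-backed fio experiment described in the blog diff above could be approximated with a job file along these lines; the block size, queue depth, and target path are assumptions for illustration, not the blog's actual settings:

```ini
; Hypothetical fio job sketching a 1GB NVMe-to-CPU read test with libaio.
; bs, iodepth, and filename are illustrative assumptions.
[nvme-read-1g]
ioengine=libaio
rw=read
bs=1M
size=1G
iodepth=32
direct=1
filename=/mnt/nvme0/fio.dat
```

Run with `fio <jobfile>`; `direct=1` bypasses the page cache so the measurement reflects device bandwidth rather than cached reads.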
