Skip to content

Commit bd47e5b

Browse files
Add example for Deepspeed-AutoTP (#964)
* update tp example Signed-off-by: inkcherry <[email protected]> * update Signed-off-by: inkcherry <[email protected]> * add length bench file Signed-off-by: inkcherry <[email protected]> --------- Signed-off-by: inkcherry <[email protected]> Co-authored-by: Hongwei Chen <[email protected]>
1 parent 65bc536 commit bd47e5b

File tree

9 files changed

+260883
-0
lines changed

9 files changed

+260883
-0
lines changed

training/tensor_parallel/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# tensor parallel example
2+
This project is adapted from https://github.com/tatsu-lab/stanford_alpaca.
3+
We only modified the ds_config to enable tensor parallelism and add more detailed logging, as an example use case.
4+
5+
**Script**
6+
7+
`bash run.sh` or `bash run.sh MODE` (MODE selects the ZeRO/TP configuration)
8+
9+

training/tensor_parallel/alpaca_data.json

Lines changed: 260012 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"bf16": {
3+
"enabled": "auto"
4+
},
5+
"optimizer": {
6+
"type": "AdamW",
7+
"params": {
8+
"lr": "auto",
9+
"betas": "auto",
10+
"eps": "auto",
11+
"weight_decay": "auto"
12+
}
13+
},
14+
"scheduler": {
15+
"type": "WarmupDecayLR",
16+
"params": {
17+
"total_num_steps": "auto",
18+
"warmup_min_lr": "auto",
19+
"warmup_max_lr": "auto",
20+
"warmup_num_steps": "auto"
21+
}
22+
},
23+
"zero_optimization": {
24+
"stage": 1,
25+
"gather_16bit_weights_on_model_save": true
26+
},
27+
"tensor_parallel":{
28+
"autotp_size": 4
29+
},
30+
"gradient_accumulation_steps": "auto",
31+
"gradient_clipping": "auto",
32+
"steps_per_print": 1,
33+
"train_batch_size": "auto",
34+
"train_micro_batch_size_per_gpu": "auto",
35+
"wall_clock_breakdown": false
36+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"bf16": {
3+
"enabled": "auto"
4+
},
5+
"optimizer": {
6+
"type": "AdamW",
7+
"params": {
8+
"lr": "auto",
9+
"betas": "auto",
10+
"eps": "auto",
11+
"weight_decay": "auto"
12+
}
13+
},
14+
"scheduler": {
15+
"type": "WarmupDecayLR",
16+
"params": {
17+
"total_num_steps": "auto",
18+
"warmup_min_lr": "auto",
19+
"warmup_max_lr": "auto",
20+
"warmup_num_steps": "auto"
21+
}
22+
},
23+
"zero_optimization": {
24+
"stage": ${zero_stage},
25+
"gather_16bit_weights_on_model_save": true
26+
},
27+
"tensor_parallel":{
28+
"autotp_size": ${autotp_size}
29+
},
30+
"gradient_accumulation_steps": "auto",
31+
"gradient_clipping": "auto",
32+
"steps_per_print": 1,
33+
"train_batch_size": "auto",
34+
"train_micro_batch_size_per_gpu": "auto",
35+
"wall_clock_breakdown": false
36+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
transformers==4.50.1
2+
deepspeed>=0.16.4
3+
accelerate==1.6.0
4+
numpy
5+
rouge_score
6+
fire
7+
openai==0.28.0
8+
torch
9+
sentencepiece
10+
tokenizers>=0.13.3

training/tensor_parallel/run.sh

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/env bash
# Launch Alpaca fine-tuning with DeepSpeed, optionally using AutoTP tensor
# parallelism. MODE (first argument) selects the ZeRO stage and the AutoTP
# degree; those values are substituted into configs/ds_config_temp.json to
# render the final DeepSpeed config used by train.py.
#
# Usage: run.sh [MODE]
#   MODE is one of: zero1tp, zero2tp, zero1, zero2, zero3, tp
#   (default: zero1tp)
set -euo pipefail

weight_path=/host/ssd/hf_models/llama2-7b-hf
# weight_path=/host/ssd/hf_models/Meta-Llama-3.1-8B
export WANDB_MODE=disabled

num_gpus=8
epoch=3
mbs=2   # micro batch size per data-parallel rank

MODE=${1:-zero1tp}

# Map MODE -> (ZeRO stage, AutoTP degree).
case "$MODE" in
  zero1tp) ZERO_STAGE=1; AUTOTP_SIZE=4 ;;
  zero2tp) ZERO_STAGE=2; AUTOTP_SIZE=4 ;;
  zero1)   ZERO_STAGE=1; AUTOTP_SIZE=0 ;;
  zero2)   ZERO_STAGE=2; AUTOTP_SIZE=0 ;;
  zero3)   ZERO_STAGE=3; AUTOTP_SIZE=0 ;;
  tp)      ZERO_STAGE=0; AUTOTP_SIZE=8 ;;
  *)
    # Original message suggested invalid values ('zero'/'tp' alone) and had a
    # garbled trailing character; list the actual accepted modes instead.
    echo "error: unknown MODE '$MODE'; expected one of: zero1tp, zero2tp, zero1, zero2, zero3, tp" >&2
    exit 1
    ;;
esac

# When TP is enabled, a TP group of AUTOTP_SIZE GPUs forms one data-parallel
# rank, so the per-device batch is scaled up to keep the effective global
# batch comparable to the pure-ZeRO runs.
if (( AUTOTP_SIZE > 0 )); then
  per_device_train_batch_size=$((mbs * AUTOTP_SIZE))
else
  per_device_train_batch_size=$mbs
fi

# Render the DeepSpeed config from the template by filling in the
# ${zero_stage} / ${autotp_size} placeholders.
TEMPLATE_FILE="configs/ds_config_temp.json"
OUTPUT_FILE="configs/ds_config.json"
sed -e "s/\${zero_stage}/${ZERO_STAGE}/g" \
    -e "s/\${autotp_size}/${AUTOTP_SIZE}/g" \
    "$TEMPLATE_FILE" > "$OUTPUT_FILE"

# NOTE: the original passed '--learning_rate 0' immediately before
# '--learning_rate 2e-5'; the first value was dead (argparse keeps the last
# occurrence) and has been removed.
deepspeed --num_gpus "$num_gpus" \
    --master_port 51336 train.py \
    --model_name_or_path "$weight_path" \
    --data_path ./alpaca_data.json \
    --bf16 True \
    --output_dir "out_load_test/$MODE" \
    --num_train_epochs "$epoch" \
    --gradient_checkpointing false \
    --per_device_train_batch_size "$per_device_train_batch_size" \
    --per_device_eval_batch_size 1 \
    --evaluation_strategy no \
    --save_strategy steps \
    --save_steps 10000 \
    --gradient_accumulation_steps 4 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type cosine \
    --logging_steps 1 \
    --tf32 True \
    --deepspeed "./configs/ds_config.json"

0 commit comments

Comments
 (0)