Merged

72 commits
6e10122
Rl weight (#1143)
shihaobai Dec 8, 2025
bc0301e
refactor norm and add platform
shihaobai Jan 11, 2026
2cd361a
norm
shihaobai Jan 12, 2026
efcaa4e
mm weight refactor
shihaobai Jan 12, 2026
1e40cb7
Embedding and LMHead
sufubao Jan 12, 2026
4c2b33f
fix LMHeadWeight
sufubao Jan 12, 2026
c901ce9
fix gemma norm & slicer
shihaobai Jan 12, 2026
94bf9a0
fix
shihaobai Jan 12, 2026
7b2595a
MOE
sufubao Jan 12, 2026
4df3637
remove data_type
shihaobai Jan 12, 2026
101e89a
remove fused_moe_weight_tp
shihaobai Jan 12, 2026
b90666a
qk norm
shihaobai Jan 12, 2026
d5d9192
remove PlatformAwareOp.__init__()
shihaobai Jan 12, 2026
2a70eae
fix model call
sufubao Jan 13, 2026
0e17bf6
remove torchao
sufubao Jan 13, 2026
b7393ab
quantization draft
sufubao Jan 13, 2026
96a15fa
refactor quantization (draft)
shihaobai Jan 15, 2026
a63cc8c
fix
shihaobai Jan 15, 2026
164a299
unit_test
shihaobai Jan 15, 2026
e301d47
fix
shihaobai Jan 19, 2026
72c8f17
update docs
shihaobai Jan 19, 2026
60f8bc8
fix pre-weight
shihaobai Jan 19, 2026
7e85185
fix deepseek
shihaobai Jan 20, 2026
ff76f57
fix unitest
shihaobai Jan 20, 2026
67c5823
refactor fuse_moe
shihaobai Jan 22, 2026
b620c95
redunancy_expert(draft)
shihaobai Jan 22, 2026
461a7ad
remove weight_ep
shihaobai Jan 22, 2026
4563028
add redundancy assert
shihaobai Jan 23, 2026
5798c72
fix mm weight with bias
shihaobai Jan 23, 2026
9a0db71
fix internvl
shihaobai Jan 26, 2026
29d0f48
fix unitest
shihaobai Jan 26, 2026
4734f3e
lmhead fix
shihaobai Jan 26, 2026
ea486b4
remove cnt
shihaobai Jan 26, 2026
3a0009b
remove rmsnorm bias_name input.
Jan 26, 2026
b917009
fix att sink
Jan 26, 2026
b1b38e2
add cpu weight_buffer
shihaobai Jan 26, 2026
98802c6
fix att sink + gpt oss moe
Jan 26, 2026
84486f3
simplify fuse_moe
shihaobai Jan 26, 2026
ee9dc78
remove weight cpu buffer and add weight_list
shihaobai Jan 26, 2026
aec881f
per-channel weight moe
shihaobai Jan 26, 2026
9d1073a
moe weight buffer remove
shihaobai Jan 26, 2026
ba98c62
fix deepgemm
shihaobai Jan 26, 2026
2f9fa56
fix internvl 26b
shihaobai Jan 26, 2026
3aaf235
mini fix typing
Jan 27, 2026
74bb0ab
internvl fix
shihaobai Jan 27, 2026
17e1f13
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
34dd483
fix
Jan 27, 2026
991aa56
start_args_type.py add enable_ep_moe
Jan 27, 2026
4d4d14a
add log.
Jan 27, 2026
be2264d
fix mm weight
shihaobai Jan 27, 2026
6c8655e
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
a6ecf96
moe matmul use per token quant for all.
Jan 27, 2026
64bf501
fix mm weight
shihaobai Jan 27, 2026
56aaaec
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
89cd6db
fix
Jan 27, 2026
e9f72f5
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
Jan 27, 2026
9683167
fix tpsp ep
shihaobai Jan 27, 2026
e1b1981
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
22f0d2f
fix
Jan 27, 2026
4fd6701
fix mtp
shihaobai Jan 27, 2026
9d83579
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
a2dd072
fix redundancy
Jan 27, 2026
5d09cea
fix
Jan 27, 2026
2b46964
fix
Jan 27, 2026
e348775
fix
Jan 27, 2026
3c1f80f
fix
Jan 27, 2026
7dfb650
fix
Jan 27, 2026
739c479
fix bloom.
Jan 27, 2026
adbe97c
fix qwen3 235b online quant
shihaobai Jan 27, 2026
0655b42
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
577e08f
add assert for 235 tp=8, deepgemm
shihaobai Jan 27, 2026
795ab56
fix
Jan 27, 2026
37 changes: 0 additions & 37 deletions docs/CN/source/models/add_new_model.md
@@ -162,19 +162,6 @@ class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
self.tp_rank_: split_vob_size * (self.tp_rank_ + 1), :])
self.lm_head_weight_ = self.wte_weight_
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.pre_norm_weight_,
self.pre_norm_bias_,
self.final_norm_weight_,
self.final_norm_bias_,
self.wte_weight_,
self.lm_head_weight_]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

~~~

***transformer_layer_weight.py***
@@ -204,30 +191,6 @@ class BloomTransformerLayerWeight(TransformerLayerWeight):
self._load_qkvo_weights(weights)
self._load_ffn_weights(weights)
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.att_norm_weight_,
self.att_norm_bias_,
self.q_weight_,
self.k_weight_,
self.v_weight_,
self.q_bias_,
self.k_bias_,
self.v_bias_,
self.o_weight_,
self.o_bias_,

self.ffn_norm_weight_,
self.ffn_norm_bias_,
self.ffn_1_weight_,
self.ffn_1_bias_,
self.ffn_2_weight_,
self.ffn_2_bias_,
]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

def _load_qkvo_weights(self, weights):
if f"h.{self.layer_num_}.input_layernorm.weight" in weights:
19 changes: 5 additions & 14 deletions docs/CN/source/tutorial/api_server_args.rst
@@ -367,17 +367,14 @@ PD disaggregation mode parameters
.. option:: --quant_type

Quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``vllm-fp8w8a8-b128``
* ``deepgemm-fp8w8a8-b128``
* ``triton-fp8w8a8-block128``
* ``awq``
* ``awq_marlin``
* ``none`` (default)

.. option:: --quant_cfg
@@ -389,13 +386,7 @@
.. option:: --vit_quant_type

ViT quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``none`` (default)
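For orientation only (not part of the diff): the values listed above are passed straight to ``--quant_type`` on the api_server command line. A minimal sketch, assuming a placeholder model path, port, and tp degree, might look like the following.

.. code-block:: bash

    # Sketch: online quantization with one of the backends kept by this PR.
    # The model path, port, and tp degree are placeholders, not values from the diff.
    LOADWORKER=18 python -m lightllm.server.api_server \
        --port 8088 \
        --model_dir /path/your_model \
        --tp 2 \
        --quant_type vllm-fp8w8a8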
35 changes: 20 additions & 15 deletions docs/CN/source/tutorial/deepseek_deployment.rst
@@ -49,13 +49,14 @@ LightLLM supports the following deployment modes:
.. code-block:: bash
# H200 single-node DeepSeek-R1 DP + EP mode
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
--model_dir /path/DeepSeek-R1 \
--tp 8 \
--dp 8
--dp 8 \
--enable_ep_moe
**Parameter description:**
- `MOE_MODE=EP`: sets expert parallel mode
- `--enable_ep_moe`: sets expert parallel mode
- `--tp 8`: tensor parallelism degree
- `--dp 8`: data parallelism degree, usually set to the same value as tp

@@ -119,14 +120,14 @@ LightLLM supports the following deployment modes:
# H200 multi-node DeepSeek-R1 EP mode, Node 0
# Usage: sh multi_node_ep_node0.sh <nccl_host>
export nccl_host=$1
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
--model_dir /path/DeepSeek-R1 \
--tp 16 \
--dp 16 \
--nnodes 2 \
--node_rank 0 \
--nccl_host $nccl_host \
--nccl_port 2732
--nccl_port 2732 --enable_ep_moe
**Node 1 launch command:**

@@ -135,14 +136,14 @@ LightLLM supports the following deployment modes:
# H200 multi-node DeepSeek-R1 EP mode, Node 1
# Usage: sh multi_node_ep_node1.sh <nccl_host>
export nccl_host=$1
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
--model_dir /path/DeepSeek-R1 \
--tp 16 \
--dp 16 \
--nnodes 2 \
--node_rank 1 \
--nccl_host $nccl_host \
--nccl_port 2732
--nccl_port 2732 --enable_ep_moe
**Optional optimization parameters:**
- `--enable_prefill_microbatch_overlap`: enables prefill micro-batch overlap
@@ -179,7 +180,7 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
export host=$1
export pd_master_ip=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "prefill" \
--tp 8 \
@@ -189,7 +190,8 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
--nccl_port 2732 \
--disable_cudagraph \
--pd_master_ip $pd_master_ip \
--pd_master_port 60011
--pd_master_port 60011 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_prefill_microbatch_overlap
@@ -202,7 +204,7 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
export host=$1
export pd_master_ip=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "decode" \
--tp 8 \
Expand All @@ -212,7 +214,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--nccl_port 12322 \
--disable_cudagraph \
--pd_master_ip $pd_master_ip \
--pd_master_port 60011
--pd_master_port 60011 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_decode_microbatch_overlap
@@ -269,7 +272,7 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
export host=$1
export config_server_host=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "prefill" \
--host $host \
Expand All @@ -279,15 +282,16 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--nccl_port 2732 \
--disable_cudagraph \
--config_server_host $config_server_host \
--config_server_port 60088
--config_server_port 60088 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_prefill_microbatch_overlap
# Decode service
export host=$1
export config_server_host=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "decode" \
--host $host \
Expand All @@ -296,7 +300,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--tp 8 \
--dp 8 \
--config_server_host $config_server_host \
--config_server_port 60088
--config_server_port 60088 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_decode_microbatch_overlap
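As a composition check (a sketch, not part of the diff): the optional overlap switch from the comments above is an ordinary CLI flag, so a decode-side launch with both ``--enable_ep_moe`` and ``--enable_decode_microbatch_overlap`` actually enabled would look roughly like this; the host, nccl, and pd_master/config_server arguments shown in the full scripts are omitted here but still required in practice.

.. code-block:: bash

    # Sketch only: decode service with EP MoE and micro-batch overlap both enabled.
    # Omits the host/nccl/pd_master arguments that the full scripts above still need.
    nvidia-cuda-mps-control -d
    LOADWORKER=18 python -m lightllm.server.api_server \
        --model_dir /path/DeepSeek-R1 \
        --run_mode "decode" \
        --tp 8 \
        --dp 8 \
        --enable_ep_moe \
        --enable_decode_microbatch_overlap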
36 changes: 0 additions & 36 deletions docs/EN/source/models/add_new_model.md
@@ -162,18 +162,6 @@ class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
self.tp_rank_: split_vob_size * (self.tp_rank_ + 1), :])
self.lm_head_weight_ = self.wte_weight_
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.pre_norm_weight_,
self.pre_norm_bias_,
self.final_norm_weight_,
self.final_norm_bias_,
self.wte_weight_,
self.lm_head_weight_]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

~~~

@@ -204,30 +192,6 @@ class BloomTransformerLayerWeight(TransformerLayerWeight):
self._load_qkvo_weights(weights)
self._load_ffn_weights(weights)
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.att_norm_weight_,
self.att_norm_bias_,
self.q_weight_,
self.k_weight_,
self.v_weight_,
self.q_bias_,
self.k_bias_,
self.v_bias_,
self.o_weight_,
self.o_bias_,

self.ffn_norm_weight_,
self.ffn_norm_bias_,
self.ffn_1_weight_,
self.ffn_1_bias_,
self.ffn_2_weight_,
self.ffn_2_bias_,
]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

def _load_qkvo_weights(self, weights):
if f"h.{self.layer_num_}.input_layernorm.weight" in weights:
19 changes: 5 additions & 14 deletions docs/EN/source/tutorial/api_server_args.rst
@@ -359,17 +359,14 @@ Quantization Parameters
.. option:: --quant_type

Quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``vllm-fp8w8a8-b128``
* ``deepgemm-fp8w8a8-b128``
* ``triton-fp8w8a8-block128``
* ``awq``
* ``awq_marlin``
* ``none`` (default)

.. option:: --quant_cfg
@@ -381,13 +378,7 @@
.. option:: --vit_quant_type

ViT quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``none`` (default)