Merged

72 commits
6e10122
Rl weight (#1143)
shihaobai Dec 8, 2025
bc0301e
refactor norm and add platform
shihaobai Jan 11, 2026
2cd361a
norm
shihaobai Jan 12, 2026
efcaa4e
mm weight refactor
shihaobai Jan 12, 2026
1e40cb7
Embedding and LMHead
sufubao Jan 12, 2026
4c2b33f
fix LMHeadWeight
sufubao Jan 12, 2026
c901ce9
fix gemma norm & slicer
shihaobai Jan 12, 2026
94bf9a0
fix
shihaobai Jan 12, 2026
7b2595a
MOE
sufubao Jan 12, 2026
4df3637
remove data_type
shihaobai Jan 12, 2026
101e89a
remove fused_moe_weight_tp
shihaobai Jan 12, 2026
b90666a
qk norm
shihaobai Jan 12, 2026
d5d9192
remove PlatformAwareOp.__init__()
shihaobai Jan 12, 2026
2a70eae
fix model call
sufubao Jan 13, 2026
0e17bf6
remove torchao
sufubao Jan 13, 2026
b7393ab
quantization draft
sufubao Jan 13, 2026
96a15fa
refactor quantization (draft)
shihaobai Jan 15, 2026
a63cc8c
fix
shihaobai Jan 15, 2026
164a299
unit_test
shihaobai Jan 15, 2026
e301d47
fix
shihaobai Jan 19, 2026
72c8f17
update docs
shihaobai Jan 19, 2026
60f8bc8
fix pre-weight
shihaobai Jan 19, 2026
7e85185
fix deepseek
shihaobai Jan 20, 2026
ff76f57
fix unitest
shihaobai Jan 20, 2026
67c5823
refactor fuse_moe
shihaobai Jan 22, 2026
b620c95
redunancy_expert(draft)
shihaobai Jan 22, 2026
461a7ad
remove weight_ep
shihaobai Jan 22, 2026
4563028
add redundancy assert
shihaobai Jan 23, 2026
5798c72
fix mm weight with bias
shihaobai Jan 23, 2026
9a0db71
fix internvl
shihaobai Jan 26, 2026
29d0f48
fix unitest
shihaobai Jan 26, 2026
4734f3e
lmhead fix
shihaobai Jan 26, 2026
ea486b4
remove cnt
shihaobai Jan 26, 2026
3a0009b
remove rmsnorm bias_name input.
Jan 26, 2026
b917009
fix att sink
Jan 26, 2026
b1b38e2
add cpu weight_buffer
shihaobai Jan 26, 2026
98802c6
fix att sink + gpt oss moe
Jan 26, 2026
84486f3
simplify fuse_moe
shihaobai Jan 26, 2026
ee9dc78
remove weight cpu buffer and add weight_list
shihaobai Jan 26, 2026
aec881f
per-channel weight moe
shihaobai Jan 26, 2026
9d1073a
moe weight buffer remove
shihaobai Jan 26, 2026
ba98c62
fix deepgemm
shihaobai Jan 26, 2026
2f9fa56
fix internvl 26b
shihaobai Jan 26, 2026
3aaf235
mini fix typing
Jan 27, 2026
74bb0ab
internvl fix
shihaobai Jan 27, 2026
17e1f13
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
34dd483
fix
Jan 27, 2026
991aa56
start_args_type.py add enable_ep_moe
Jan 27, 2026
4d4d14a
add log.
Jan 27, 2026
be2264d
fix mm weight
shihaobai Jan 27, 2026
6c8655e
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
a6ecf96
moe matmul use per token quant for all.
Jan 27, 2026
64bf501
fix mm weight
shihaobai Jan 27, 2026
56aaaec
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
89cd6db
fix
Jan 27, 2026
e9f72f5
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
Jan 27, 2026
9683167
fix tpsp ep
shihaobai Jan 27, 2026
e1b1981
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
22f0d2f
fix
Jan 27, 2026
4fd6701
fix mtp
shihaobai Jan 27, 2026
9d83579
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
a2dd072
fix redundancy
Jan 27, 2026
5d09cea
fix
Jan 27, 2026
2b46964
fix
Jan 27, 2026
e348775
fix
Jan 27, 2026
3c1f80f
fix
Jan 27, 2026
7dfb650
fix
Jan 27, 2026
739c479
fix bloom.
Jan 27, 2026
adbe97c
fix qwen3 235b online quant
shihaobai Jan 27, 2026
0655b42
Merge branch 'weight_refactor_rebase' of https://github.com/ModelTC/l…
shihaobai Jan 27, 2026
577e08f
add assert for 235 tp=8, deepgemm
shihaobai Jan 27, 2026
795ab56
fix
Jan 27, 2026
37 changes: 0 additions & 37 deletions docs/CN/source/models/add_new_model.md
@@ -162,19 +162,6 @@ class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
self.tp_rank_: split_vob_size * (self.tp_rank_ + 1), :])
self.lm_head_weight_ = self.wte_weight_
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.pre_norm_weight_,
self.pre_norm_bias_,
self.final_norm_weight_,
self.final_norm_bias_,
self.wte_weight_,
self.lm_head_weight_]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

~~~

***transformer_layer_weight.py***
@@ -204,30 +191,6 @@ class BloomTransformerLayerWeight(TransformerLayerWeight):
self._load_qkvo_weights(weights)
self._load_ffn_weights(weights)
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.att_norm_weight_,
self.att_norm_bias_,
self.q_weight_,
self.k_weight_,
self.v_weight_,
self.q_bias_,
self.k_bias_,
self.v_bias_,
self.o_weight_,
self.o_bias_,

self.ffn_norm_weight_,
self.ffn_norm_bias_,
self.ffn_1_weight_,
self.ffn_1_bias_,
self.ffn_2_weight_,
self.ffn_2_bias_,
]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

def _load_qkvo_weights(self, weights):
if f"h.{self.layer_num_}.input_layernorm.weight" in weights:
19 changes: 5 additions & 14 deletions docs/CN/source/tutorial/api_server_args.rst
@@ -367,17 +367,14 @@ PD disaggregation mode parameters
.. option:: --quant_type

Quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``vllm-fp8w8a8-b128``
* ``deepgemm-fp8w8a8-b128``
* ``triton-fp8w8a8-block128``
* ``awq``
* ``awq_marlin``
* ``none`` (default)

.. option:: --quant_cfg
@@ -389,13 +386,7 @@
.. option:: --vit_quant_type

ViT quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``none`` (default)
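For orientation only (not part of the diff): the values listed above are passed straight to ``--quant_type`` on the api_server command line. A minimal sketch, assuming a placeholder model path, port, and tp degree, might look like the following.

.. code-block:: bash

    # Sketch: online quantization with one of the backends kept by this PR.
    # The model path, port, and tp degree are placeholders, not values from the diff.
    LOADWORKER=18 python -m lightllm.server.api_server \
        --port 8088 \
        --model_dir /path/your_model \
        --tp 2 \
        --quant_type vllm-fp8w8a8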
35 changes: 20 additions & 15 deletions docs/CN/source/tutorial/deepseek_deployment.rst
@@ -49,13 +49,14 @@ LightLLM supports the following deployment modes:
.. code-block:: bash
# H200 single-node DeepSeek-R1 DP + EP mode
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
--model_dir /path/DeepSeek-R1 \
--tp 8 \
--dp 8
--dp 8 \
--enable_ep_moe
**Parameter description:**
- `MOE_MODE=EP`: sets expert parallel mode
- `--enable_ep_moe`: sets expert parallel mode
- `--tp 8`: tensor parallelism degree
- `--dp 8`: data parallelism degree, usually set to the same value as tp

@@ -119,14 +120,14 @@ LightLLM supports the following deployment modes:
# H200 multi-node DeepSeek-R1 EP mode, Node 0
# Usage: sh multi_node_ep_node0.sh <nccl_host>
export nccl_host=$1
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
--model_dir /path/DeepSeek-R1 \
--tp 16 \
--dp 16 \
--nnodes 2 \
--node_rank 0 \
--nccl_host $nccl_host \
--nccl_port 2732
--nccl_port 2732 --enable_ep_moe
**Node 1 launch command:**

@@ -135,14 +136,14 @@ LightLLM supports the following deployment modes:
# H200 multi-node DeepSeek-R1 EP mode, Node 1
# Usage: sh multi_node_ep_node1.sh <nccl_host>
export nccl_host=$1
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
--model_dir /path/DeepSeek-R1 \
--tp 16 \
--dp 16 \
--nnodes 2 \
--node_rank 1 \
--nccl_host $nccl_host \
--nccl_port 2732
--nccl_port 2732 --enable_ep_moe
**Optional optimization parameters:**
- `--enable_prefill_microbatch_overlap`: enables prefill micro-batch overlap
@@ -179,7 +180,7 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
export host=$1
export pd_master_ip=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "prefill" \
--tp 8 \
@@ -189,7 +190,8 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
--nccl_port 2732 \
--disable_cudagraph \
--pd_master_ip $pd_master_ip \
--pd_master_port 60011
--pd_master_port 60011 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_prefill_microbatch_overlap
@@ -202,7 +204,7 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
export host=$1
export pd_master_ip=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "decode" \
--tp 8 \
Expand All @@ -212,7 +214,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--nccl_port 12322 \
--disable_cudagraph \
--pd_master_ip $pd_master_ip \
--pd_master_port 60011
--pd_master_port 60011 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_decode_microbatch_overlap
@@ -269,7 +272,7 @@ PD (Prefill-Decode) disaggregation mode deploys the prefill and decode stages separately, which can
export host=$1
export config_server_host=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "prefill" \
--host $host \
Expand All @@ -279,15 +282,16 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--nccl_port 2732 \
--disable_cudagraph \
--config_server_host $config_server_host \
--config_server_port 60088
--config_server_port 60088 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_prefill_microbatch_overlap
# Decode service
export host=$1
export config_server_host=$2
nvidia-cuda-mps-control -d
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
LOADWORKER=18 python -m lightllm.server.api_server \
--model_dir /path/DeepSeek-R1 \
--run_mode "decode" \
--host $host \
Expand All @@ -296,7 +300,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
--tp 8 \
--dp 8 \
--config_server_host $config_server_host \
--config_server_port 60088
--config_server_port 60088 \
--enable_ep_moe
# To enable micro-batch overlap, uncomment the following line
#--enable_decode_microbatch_overlap
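As a composition check (a sketch, not part of the diff): the optional overlap switch from the comments above is an ordinary CLI flag, so a decode-side launch with both ``--enable_ep_moe`` and ``--enable_decode_microbatch_overlap`` actually enabled would look roughly like this; the host, nccl, and pd_master/config_server arguments shown in the full scripts are omitted here but still required in practice.

.. code-block:: bash

    # Sketch only: decode service with EP MoE and micro-batch overlap both enabled.
    # Omits the host/nccl/pd_master arguments that the full scripts above still need.
    nvidia-cuda-mps-control -d
    LOADWORKER=18 python -m lightllm.server.api_server \
        --model_dir /path/DeepSeek-R1 \
        --run_mode "decode" \
        --tp 8 \
        --dp 8 \
        --enable_ep_moe \
        --enable_decode_microbatch_overlap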
36 changes: 0 additions & 36 deletions docs/EN/source/models/add_new_model.md
@@ -162,18 +162,6 @@ class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
self.tp_rank_: split_vob_size * (self.tp_rank_ + 1), :])
self.lm_head_weight_ = self.wte_weight_
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.pre_norm_weight_,
self.pre_norm_bias_,
self.final_norm_weight_,
self.final_norm_bias_,
self.wte_weight_,
self.lm_head_weight_]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

~~~

@@ -204,30 +192,6 @@ class BloomTransformerLayerWeight(TransformerLayerWeight):
self._load_qkvo_weights(weights)
self._load_ffn_weights(weights)
return

def verify_load(self):
errors = "weights load not ok"
weights = [self.att_norm_weight_,
self.att_norm_bias_,
self.q_weight_,
self.k_weight_,
self.v_weight_,
self.q_bias_,
self.k_bias_,
self.v_bias_,
self.o_weight_,
self.o_bias_,

self.ffn_norm_weight_,
self.ffn_norm_bias_,
self.ffn_1_weight_,
self.ffn_1_bias_,
self.ffn_2_weight_,
self.ffn_2_bias_,
]
for i in range(len(weights)):
assert weights[i] is not None, "index:" + str(i) + " " + errors
return

def _load_qkvo_weights(self, weights):
if f"h.{self.layer_num_}.input_layernorm.weight" in weights:
19 changes: 5 additions & 14 deletions docs/EN/source/tutorial/api_server_args.rst
@@ -359,17 +359,14 @@ Quantization Parameters
.. option:: --quant_type

Quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``vllm-fp8w8a8-b128``
* ``deepgemm-fp8w8a8-b128``
* ``triton-fp8w8a8-block128``
* ``awq``
* ``awq_marlin``
* ``none`` (default)

.. option:: --quant_cfg
@@ -381,13 +378,7 @@
.. option:: --vit_quant_type

ViT quantization method, optional values:

* ``ppl-w4a16-128``
* ``flashllm-w6a16``
* ``ao-int4wo-[32,64,128,256]``
* ``ao-int8wo``
* ``ao-fp8w8a16``
* ``ao-fp6w6a16``

* ``vllm-w8a8``
* ``vllm-fp8w8a8``
* ``none`` (default)